上QQ阅读APP看书,第一时间看更新
3.2.11 写索引集成到爬虫
爬虫把抓取的信息写入索引:
public class IndexDao { private IndexWriter indexWriter; public IndexDao(){ try { Directory directory = FSDirectory.open(new File("d:/lietu/index")); Analyzer analyzer = new StandardAnalyzer(); indexWriter = new IndexWriter(directory, analyzer, MaxFieldLength.LIMITED); } catch (IOException e) { e.printStackTrace(); } } public void save(GoodsInfo goodsInfo){ Document doc = goodsInfo2Document(goodsInfo); try{ indexWriter.addDocument(doc); }catch(Exception e){ e.printStackTrace(); } } public void close(){ try { indexWriter.close(); } catch (Exception e) { e.printStackTrace(); } } public Document goodsInfo2Document(GoodsInfo ti) { Document doc = new Document(); Field f = new Field("url", ti.getGoodsNameURL(), Field.Store.YES, Field.Index.NOT_ANALYZED, Field.TermVector.NO); doc.add(f); f = new Field("title", ti.getGoodsName(), Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS); doc.add(f); if (ti.getGoodsDescription() ! = null) { f = new Field("body", ti.getGoodsDescription(), Field.Store.YES, Field.Index.NOT_ANALYZED, Field.TermVector.NO); doc.add(f); } f = new Field("date", DateTools.dateToString(new Date(), DateTools.Resolution.DAY), Field.Store.YES, Field.Index.NOT_ANALYZED, Field.TermVector.NO); doc.add(f); f = new Field("priceInt", String.valueOf(ti.getPriceInteger()), Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS); doc.add(f); if (ti.getMoneyUnit() ! = null) { f = new Field("moneyUnit", ti.getMoneyUnit(), Field.Store.YES, Field.Index.NOT_ANALYZED, Field.TermVector.NO); doc.add(f); } try { URL website = new URL(ti.getGoodsNameURL().toString()); f = new Field("fromwebsite", website.getHost(), Field.Store.YES, Field.Index.NOT_ANALYZED, Field.TermVector.NO); doc.add(f); } catch (MalformedURLException e1) { System.out.println("error url =" + ti.getGoodsNameURL().toString()); e1.printStackTrace(); } // 分类 f = new Field("category", ti.getGoodsType(), Field.Store.YES, Field.Index.NOT_ANALYZED, Field.TermVector.NO); doc.add(f); // img if (ti.getImage() ! = null) { f = new Field("img", ti.getImage(), Field.Store.YES, Field.Index.NOT_ANALYZED, Field.TermVector.NO); doc.add(f); } // 制造厂名称 if (ti.getMfrName() ! = null) { f = new Field("brand", ti.getMfrName(), Field.Store.YES, Field.Index.NOT_ANALYZED, Field.TermVector.NO); doc.add(f); } // 商品型号 序列号 if (ti.getMfrNumber() ! = null) { f = new Field("type", ti.getMfrNumber(), Field.Store.YES, Field.Index.NOT_ANALYZED, Field.TermVector.NO); doc.add(f); } // 价格 if (ti.getGoodsPrice() ! = null) { f = new Field("price", ti.getGoodsPrice(), Field.Store.YES, Field.Index.NOT_ANALYZED, Field.TermVector.NO); doc.add(f); } return doc; } }