自己动手写分布式搜索引擎
上QQ阅读APP看书,第一时间看更新

3.2.11 写索引集成到爬虫

爬虫把抓取的信息写入索引:

        public class IndexDao {
            private IndexWriter indexWriter;


            public IndexDao(){
                try {
                    Directory directory = FSDirectory.open(new
        File("d:/lietu/index"));
                    Analyzer analyzer = new StandardAnalyzer();
                    indexWriter = new IndexWriter(directory, analyzer,
                                MaxFieldLength.LIMITED);
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }


            public void save(GoodsInfo goodsInfo){
                Document doc = goodsInfo2Document(goodsInfo);


                try{
                    indexWriter.addDocument(doc);
                }catch(Exception e){
                    e.printStackTrace();
                }
            }


            public void close(){
                try {
                    indexWriter.close();
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }


            public Document goodsInfo2Document(GoodsInfo ti) {
                Document doc = new Document();
                Field f = new Field("url", ti.getGoodsNameURL(), Field.Store.YES,
                        Field.Index.NOT_ANALYZED, Field.TermVector.NO);
                doc.add(f);


                f = new Field("title", ti.getGoodsName(), Field.Store.YES,
                              Field.Index.ANALYZED,
                        Field.TermVector.WITH_POSITIONS_OFFSETS);
                doc.add(f);
    if (ti.getGoodsDescription() ! = null) {
        f = new Field("body", ti.getGoodsDescription(), Field.Store.YES,
                Field.Index.NOT_ANALYZED, Field.TermVector.NO);
        doc.add(f);
    }


    f = new Field("date", DateTools.dateToString(new Date(),
            DateTools.Resolution.DAY), Field.Store.YES,
            Field.Index.NOT_ANALYZED, Field.TermVector.NO);
    doc.add(f);


    f = new Field("priceInt", String.valueOf(ti.getPriceInteger()),
            Field.Store.YES, Field.Index.ANALYZED,
            Field.TermVector.WITH_POSITIONS_OFFSETS);
    doc.add(f);


    if (ti.getMoneyUnit() ! = null) {
        f = new Field("moneyUnit", ti.getMoneyUnit(), Field.Store.YES,
                Field.Index.NOT_ANALYZED, Field.TermVector.NO);
        doc.add(f);
    }


    try {
        URL website = new URL(ti.getGoodsNameURL().toString());
        f = new Field("fromwebsite", website.getHost(), Field.Store.YES,
                Field.Index.NOT_ANALYZED, Field.TermVector.NO);
        doc.add(f);
    } catch (MalformedURLException e1) {
        System.out.println("error url =" + ti.getGoodsNameURL().toString());
        e1.printStackTrace();
    }


    // 分类
    f = new Field("category", ti.getGoodsType(), Field.Store.YES,
            Field.Index.NOT_ANALYZED, Field.TermVector.NO);
    doc.add(f);


    // img
    if (ti.getImage() ! = null) {
        f = new Field("img", ti.getImage(), Field.Store.YES,
                Field.Index.NOT_ANALYZED, Field.TermVector.NO);
        doc.add(f);
    }
    // 制造厂名称
    if (ti.getMfrName() ! = null) {
        f = new Field("brand", ti.getMfrName(), Field.Store.YES,
                        Field.Index.NOT_ANALYZED, Field.TermVector.NO);
                doc.add(f);
            }
            // 商品型号 序列号
            if (ti.getMfrNumber() ! = null) {
                f = new Field("type", ti.getMfrNumber(), Field.Store.YES,
                        Field.Index.NOT_ANALYZED, Field.TermVector.NO);
                doc.add(f);
            }


            // 价格
            if (ti.getGoodsPrice() ! = null) {
                f = new Field("price", ti.getGoodsPrice(), Field.Store.YES,
                        Field.Index.NOT_ANALYZED, Field.TermVector.NO);
                doc.add(f);
            }
            return doc;
        }
    }