知识大全 如何使用Lucene对html文件进行索引

Posted 文件

篇首语:春衣少年当酒歌,起舞四顾以笑和。本文由小常识网(cha138.com)小编为大家整理,主要介绍了《知识大全 如何使用Lucene对html文件进行索引》的相关知识,希望对你有一定的参考价值。

如何使用Lucene对html文件进行索引:以下文字资料是由全榜网(www.cha138.com)小编为大家搜集整理后发布的内容,让我们赶快一起来看一下吧!

  我修改了lucene的demo包的IndexHTML类,使其可以被其他Java类调用。

  IndexHTML类:

    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.index.TermEnum;
    import java.io.File;
    import java.util.Date;
    import java.util.Arrays;
    //还需调用demo的其他类
    import org.apache.lucene.demo.*;

    /**
     * Create file index for searching
     * @author tyrone
     *
     */
    public class IndexHTML {
        private String DocsPath=null;
        /**
         * the path for index file;
         */
        private String IndexFilePath=null;
        /**
         * true during deletion pass
         */
        private boolean deleting = false;
        /**
         * existing index
         */
        private IndexReader reader;
        /**
         * new index being built
         */
        private IndexWriter writer;
        /**
         * document id iterator
         */
        private TermEnum uidIter;

        private void indexDocs(File file) throws Exception {
            if (file.isDirectory()) {                        // if a directory
                String[] files = file.list();                // list its files
                Arrays.sort(files);                          // sort the files
                for (int i = 0; i < files.length; i++)       // recursively index them
                    this.indexDocs(new File(file, files[i]));
            } else if (file.getPath().endsWith(".html") ||   // index .html files
                       file.getPath().endsWith(".htm") ||    // index .htm files
                       file.getPath().endsWith(".txt")) {    // index .txt files
                if (this.uidIter != null) {
                    String uid = HTMLDocument.uid(file);     // construct uid for doc
                    while (uidIter.term() != null && uidIter.term().field() == "uid" &&
                           uidIter.term().text().compareTo(uid) < 0) {
                        if (deleting) {                      // delete stale docs
                            System.out.println("deleting " +
                                HTMLDocument.uid2url(uidIter.term().text()));
                            reader.delete(uidIter.term());
                        }
                        uidIter.next();
                    }
                    if (uidIter.term() != null && uidIter.term().field() == "uid" &&
                        uidIter.term().text().compareTo(uid) == 0) {
                        uidIter.next();                      // keep matching docs
                    } else if (!deleting) {                  // add new docs
                        Document doc = HTMLDocument.Document(file);
                        System.out.println("adding " + doc.get("url"));
                        writer.addDocument(doc);
                    }
                } else {                                     // creating a new index
                    Document doc = HTMLDocument.Document(file);
                    System.out.println("adding " + doc.get("url"));
                    writer.addDocument(doc);                 // add docs unconditionally
                }
            }
            return;
        }

        /**
         * Walk directory hierarchy in uid order, while keeping uid iterator from
         * existing index in sync.  Mismatches indicate one of:
         * (a) old documents to be deleted;
         * (b) unchanged documents to be left alone;
         * or (c) new documents to be indexed.
         */
        private void indexDocs(File file, String index, boolean create)
            throws Exception {
            if (!create) {                                   // incrementally update
                reader = IndexReader.open(index);            // open existing index
                uidIter = reader.terms(new Term("uid", "")); // init uid iterator
                this.indexDocs(file);
                if (deleting) {                              // delete rest of stale docs
                    while (uidIter.term() != null && uidIter.term().field() == "uid") {
                        System.out.println("deleting " +
                            HTMLDocument.uid2url(uidIter.term().text()));
                        reader.delete(uidIter.term());
                        uidIter.next();
                    }
                    deleting = false;
                }
                uidIter.close();                             // close uid iterator
                reader.close();                              // close existing index
            } else                                           // don't have existing
                this.indexDocs(file);
        }

        /**
         * if create=true, create a new index, else refresh old index.
         * @param create
         */
        public void run(boolean create) {
            try {
                String index = "index";
                File root = null;
                if (this.IndexFilePath!=null) {              // index file path
                    index = this.IndexFilePath;
                }
                if (this.DocsPath==null) {
                    System.out.println("root directory is not set");
                    return;
                }
                root = new File(this.DocsPath);
                Date start = new Date();
                /**
                 * not create then maintenance
                 */
                if (!create) {                               // delete stale docs
                    this.deleting = true;
                    this.indexDocs(root, index, create);
                }
                writer = new IndexWriter(index, new StandardAnalyzer(), create);
                writer.maxFieldLength = 1000000;
                this.indexDocs(root, index, create);         // add new docs
                System.out.println("Optimizing index...");
                writer.optimize();
                writer.close();
                Date end = new Date();
                System.out.print(end.getTime() - start.getTime());
                System.out.println(" total milliseconds");
            } catch (Exception e) {
                System.out.println(" caught a " + e.getClass() +
                    "\n with message: " + e.getMessage());
            }
            return;
        }

        /**
         * @return Returns the IndexFilePath.
         */
        public String getIndexFilePath() {
            return IndexFilePath;
        }

        /**
         * @param IndexFilePath The IndexFilePath to set.
         */
        public void setIndexFilePath(String property1) {
            this.IndexFilePath = property1;
        }

        /**
         * @return Returns the DocsPath.
         */
        public String getDocsPath() {
            return DocsPath;
        }

        /**
         * @param DocsPath The DocsPath to set.
         */
        public void setDocsPath(String property1) {
            this.DocsPath = property1;
        }

        /**
         * test
         * @param args
         */
        public static void main(String[] args) {
            IndexHTML ih=new IndexHTML();
            ih.setDocsPath("D:\\MyProject\\colimas\\clms doc \\");
            ih.setIndexFilePath("D:\\MyProject\\colimas\\index");
            ih.run(true);
        }
    }

  运行后生成3个文件:_ i.cfs、deletable、segments。

  搜索文件类:

    /*
     * Created on / /
     *
     * TODO To change the template for this generated file go to
     * Window - Preferences - Java - Code Style - Code Templates
     */
    package com.colimas.search.query;

    /**
     * @author tyrone
     *
     * TODO To change the template for this generated type comment go to
     * Window - Preferences - Java - Code Style - Code Templates
     */
    public class HitsHTMLDoc {
        private String Title;
        priva cha138/Article/program/Java/ky/201311/28449

相关参考