知识大全：如何使用Lucene对html文件进行索引

Posted 2022-07-18 文件

篇首语：春衣少年当酒歌，起舞四顾以笑和。本文由小常识网(cha138.com)小编为大家整理，主要介绍了如何使用Lucene对html文件进行索引的相关知识，希望对你有一定的参考价值。

如何使用Lucene对html文件进行索引：以下文字资料是由全榜网(www.cha138.com)小编为大家搜集整理后发布的内容，让我们赶快一起来看一下吧！

　　我修改了lucene的demo包的IndexHTML类使其可以被其他Java类调用　　　　IndexHTML类　　　　import apache lucene analysis standard StandardAnalyzer;　　　　import apache lucene document Document;　　　　import apache lucene index IndexReader;　　　　import apache lucene index IndexWriter;　　　　import apache lucene index Term;　　　　import apache lucene index TermEnum;　　　　import java io File;import java util Date;　　　　import java util Arrays;　　　　//还需调用demo的其他类　　　　import apache lucene demo;　　　　/**　　　　* Create file index for searching　　　　* @author tyrone　　　　*　　　　*/public class IndexHTML private String DocsPath=null;　　　　/**　　　　* the path for index file;　　　　*/ private String IndexFilePath=null;　　　　/**　　　　* true during deletion pass　　　　*/　 private boolean deleting = false;　　　　/**　　　　* existing index　　　　*/　 private IndexReader reader;　　　　/**　　　　* new index being built　　　　*/　 private IndexWriter writer;　　　　/**　　　　* document id iterator　　　　*/　 private TermEnum uidIter;　　　　private void indexDocs(File file)throws Exception 　　　　if (file isDirectory())　　　　　　　　// if a directory　 String[] files = file list();　　　　// list its files　 Arrays sort(files);　　　　// sort the files　 for (int i = ; i < files length;　　　　i++)　 // recursively index them　　this indexDocs(new File(file files[i]));　　　　 else if (file getPath() endsWith(l ) || // l files　　file getPath() endsWith( ) || // files　　file getPath() endsWith( txt )) // index txt files　　　if (this uidIter != null) 　　String uid = HTMLDocument uid(file);　　　　// construct uid for doc　　　　while (uidIter term() != null && uidIter term() field() == uid &&　　　　uidIter term() text(pareTo(uid) < ) 　　　　if (deleting) 　　　　// delete stale docs　　　　System out println( deleting +　　　　HTMLDocument uid url(uidIter term() text()));　　　　reader delete(uidIter term());　　　　　　　　uidIter next();　　　　　　　　if (uidIter term() != null && uidIter term() field() == uid &&　　　　uidIter term() text(pareTo(uid) == ) 　　　　uidIter next();　　　　// keep matching docs　　　　 else if (!deleting) 　　　　// add new docs　　　　Document doc = HTMLDocument 
Document(file);　　　　System out println( adding + doc get( url ));　　　　writer addDocument(doc);　　　　　　　　 else // creating a new index　　　　Document doc = HTMLDocument Document(file);　　　　System out println( adding + doc get( url ));　　　　writer addDocument(doc);　　　　// add docs unconditionally　　　　　　　　　return;　　　　　　　　/**　　　　* Walk directory hierarchy in uid order while keeping uid iterator from　　　　* existing index in sync 　Mismatches indicate one of:　　　　* (a) old documents to be deleted;　　　　* (b) unchanged documents to be left alone;　　　　* or (c) new documents to be indexed 　　　　*/　 private void indexDocs(File file String index boolean create)　　　　throws Exception 　　　　if (!create) 　　　　// incrementally update　　　　reader = IndexReader open(index);　　　　// open existing index　　　　uidIter = reader terms(new Term( uid ));　　　　// init uid iterator　　　　this indexDocs(file);　　　　if (deleting) 　　　　// delete rest of stale docs　　　　while (uidIter term() != null && uidIter term() field() == uid ) 　　　　System out println( deleting +　　　　HTMLDocument uid url(uidIter term() text()));　　　　reader delete(uidIter term());　　　　uidIter next();　　　　　　　　deleting = false;　　　　　　　　uidIter close();　　　　// close uid iterator　　　　reader close();　　　　// close existing index　　　　 else　　　　// don t have exisiting　　　　this indexDocs(file);　　　　　　　　/**　　　　* if create=true create a new index else refresh old index 　　　　* @param create　　　　*/ public void run(boolean create)　　　　　　　　try 　　　　String index = index ;　　　　File root = null;　　　　if (this IndexFilePath!=null)　　　　　　　　// index file path　　　　index = this IndexFilePath;　　　　　　　　if (this DocsPath==null)　　　　System out println( root directory is not set );　　　　return;　　　　　　　　root = new File(this DocsPath);　　　　Date start = new Date();　　　　/**　　　　* not create then maintenance　　　　*/　　　　if (!create) 　　　　// delete stale docs　　　　this deleting = true;　　　　this indexDocs(root index create);　　　　　　　　writer = new IndexWriter(index new StandardAnalyzer() create);　　　　writer maxFieldLength = ;　　　　this 
indexDocs(root index create);　　　　// add new docs　　　　System out println( Optimizing index );　　　　writer optimize();　　　　writer close();　　　　Date end = new Date();　　　　System out print(end getTime() start getTime());　　　　System out println( total milliseconds );　　　　 catch (Exception e) 　　　　System out println( caught a + e getClass() +　　　　 \\n with message: + e getMessage());　　　　　　　　return;　　　　　　　　/**　　　　* @return Returns the IndexFilePath 　　　　*/ public String getIndexFilePath() 　return IndexFilePath;　　　　　　　　/**　　　　* @param IndexFilePath The IndexFilePath to set 　　　　*/ public void setIndexFilePath(String property ) 　this IndexFilePath = property ;　　　　　　　　/**　　　　* @return Returns the DocsPath 　　　　*/ public String getDocsPath() 　return DocsPath;　　　　　　　　/**　　　　* @param DocsPath The DocsPath to set 　　　　*/ public void setDocsPath(String property ) 　this DocsPath = property ;　　　　　　　　/**　　　　* test　　　　* @param args　　　　*/ public static void main(String[] args)　IndexHTML ih=new IndexHTML();　　　　ih setDocsPath( D:\\\\MyProject\\\\colimas\\\\clms doc \\\\ );　　　　ih setIndexFilePath( D:\\\\MyProject\\\\colimas\\\\index );　ih run(true); 　　　　运行后生成个文件_ i cfs deletable segments　　　　搜索文件类　　　　/*　　　　* Created on / / 　　　　*　　　　* TODO To change the template for this generated file go to　　　　* Window Preferences Java Code Style Code Templates　　　　*/package limas search query;　　　　/** * @author tyrone * * TODO To change the template for this generated type ment go to　　　　* Window Preferences Java Code Style Code Templates　　　　*/public class HitsHTMLDoc 　private String Title;　　　　priva cha138/Article/program/Java/ky/201311/28449

知识大全 如何使用Lucene对html文件进行索引

知识大全如何使用Lucene对html文件进行索引