知识大全 如何使用Lucene对html文件进行索引
Posted 文件
篇首语:春衣少年当酒歌,起舞四顾以笑和。本文由小常识网(cha138.com)小编为大家整理,主要介绍了知识大全 如何使用Lucene对html文件进行索引相关的知识,希望对你有一定的参考价值。
如何使用Lucene对HTML文件进行索引。以下文字资料是由全榜网(www.cha138.com)小编为大家搜集整理后发布的内容,让我们赶快一起来看一下吧!
我修改了lucene的demo包的IndexHTML类 使其可以被其他Java类调用 IndexHTML类 import apache lucene analysis standard StandardAnalyzer; import apache lucene document Document; import apache lucene index IndexReader; import apache lucene index IndexWriter; import apache lucene index Term; import apache lucene index TermEnum; import java io File;import java util Date; import java util Arrays; //还需调用demo的其他类 import apache lucene demo; /** * Create file index for searching * @author tyrone * */public class IndexHTML private String DocsPath=null; /** * the path for index file; */ private String IndexFilePath=null; /** * true during deletion pass */ private boolean deleting = false; /** * existing index */ private IndexReader reader; /** * new index being built */ private IndexWriter writer; /** * document id iterator */ private TermEnum uidIter; private void indexDocs(File file)throws Exception if (file isDirectory()) // if a directory String[] files = file list(); // list its files Arrays sort(files); // sort the files for (int i = ; i < files length; i++) // recursively index them this indexDocs(new File(file files[i])); else if (file getPath() endsWith(l ) || // l files file getPath() endsWith( ) || // files file getPath() endsWith( txt )) // index txt files if (this uidIter != null) String uid = HTMLDocument uid(file); // construct uid for doc while (uidIter term() != null && uidIter term() field() == uid && uidIter term() text(pareTo(uid) < ) if (deleting) // delete stale docs System out println( deleting + HTMLDocument uid url(uidIter term() text())); reader delete(uidIter term()); uidIter next(); if (uidIter term() != null && uidIter term() field() == uid && uidIter term() text(pareTo(uid) == ) uidIter next(); // keep matching docs else if (!deleting) // add new docs Document doc = HTMLDocument Document(file); System out println( adding + doc get( url )); writer addDocument(doc); else // creating a new index Document doc = HTMLDocument Document(file); System out println( adding + doc 
get( url )); writer addDocument(doc); // add docs unconditionally return; /** * Walk directory hierarchy in uid order while keeping uid iterator from * existing index in sync Mismatches indicate one of: * (a) old documents to be deleted; * (b) unchanged documents to be left alone; * or (c) new documents to be indexed */ private void indexDocs(File file String index boolean create) throws Exception if (!create) // incrementally update reader = IndexReader open(index); // open existing index uidIter = reader terms(new Term( uid )); // init uid iterator this indexDocs(file); if (deleting) // delete rest of stale docs while (uidIter term() != null && uidIter term() field() == uid ) System out println( deleting + HTMLDocument uid url(uidIter term() text())); reader delete(uidIter term()); uidIter next(); deleting = false; uidIter close(); // close uid iterator reader close(); // close existing index else // don t have exisiting this indexDocs(file); /** * if create=true create a new index else refresh old index * @param create */ public void run(boolean create) try String index = index ; File root = null; if (this IndexFilePath!=null) // index file path index = this IndexFilePath; if (this DocsPath==null) System out println( root directory is not set ); return; root = new File(this DocsPath); Date start = new Date(); /** * not create then maintenance */ if (!create) // delete stale docs this deleting = true; this indexDocs(root index create); writer = new IndexWriter(index new StandardAnalyzer() create); writer maxFieldLength = ; this indexDocs(root index create); // add new docs System out println( Optimizing index ); writer optimize(); writer close(); Date end = new Date(); System out print(end getTime() start getTime()); System out println( total milliseconds ); catch (Exception e) System out println( caught a + e getClass() + \\n with message: + e getMessage()); return; /** * @return Returns the IndexFilePath */ public String getIndexFilePath() return IndexFilePath; 
/** * @param IndexFilePath The IndexFilePath to set */ public void setIndexFilePath(String property ) this IndexFilePath = property ; /** * @return Returns the DocsPath */ public String getDocsPath() return DocsPath; /** * @param DocsPath The DocsPath to set */ public void setDocsPath(String property ) this DocsPath = property ; /** * test * @param args */ public static void main(String[] args) IndexHTML ih=new IndexHTML(); ih setDocsPath( D:\\\\MyProject\\\\colimas\\\\clms doc \\\\ ); ih setIndexFilePath( D:\\\\MyProject\\\\colimas\\\\index ); ih run(true); 运行后生成 个文件_ i cfs deletable segments 搜索文件类 /* * Created on / / * * TODO To change the template for this generated file go to * Window Preferences Java Code Style Code Templates */package limas search query; /** * @author tyrone * * TODO To change the template for this generated type ment go to * Window Preferences Java Code Style Code Templates */public class HitsHTMLDoc private String Title; priva cha138/Article/program/Java/ky/201311/28449相关参考