This is an extra element for clearing the floated element.
如何使用Lucene对html文件进行索引
  • 12/31
  • 2008
Tomcat | Java 2310 次查看
  我修改了lucene的demo包的IndexHTML类,使其可以被其他Java类调用。

  IndexHTML类

  import java.io.File;
  import java.io.IOException;
  import java.util.Arrays;
  import java.util.Date;

  import org.apache.lucene.analysis.standard.StandardAnalyzer;
  // Classes from the Lucene demo package are needed as well.
  import org.apache.lucene.demo.HTMLDocument;
  import org.apache.lucene.document.Document;
  import org.apache.lucene.index.IndexReader;
  import org.apache.lucene.index.IndexWriter;
  import org.apache.lucene.index.Term;
  import org.apache.lucene.index.TermEnum;

  /**
   * Creates or incrementally refreshes a Lucene index over the .html, .htm and .txt
   * files found under a document root.
   *
   * <p>Adapted from the Lucene demo's IndexHTML so that it can be driven from other
   * classes: set the document root with {@link #setDocsPath(String)}, optionally set
   * the index location with {@link #setIndexFilePath(String)}, then call
   * {@link #run(boolean)}.
   *
   * @author tyrone
   */
  public class IndexHTML {

    /** Name of the index field used to match indexed documents against files. */
    private static final String UID_FIELD = "uid";

    /** Index location used when no index file path has been set. */
    private static final String DEFAULT_INDEX_PATH = "index";

    /** Root directory (or single file) whose documents are indexed. */
    private String docsPath = null;

    /** The path for the index files. */
    private String indexFilePath = null;

    /** True during the deletion pass of an incremental update. */
    private boolean deleting = false;

    /** Existing index; only open while an incremental pass is running. */
    private IndexReader reader;

    /** New index being built. */
    private IndexWriter writer;

    /** Document id iterator over the existing index; null when building from scratch. */
    private TermEnum uidIter;

    /**
     * Tells whether a file has one of the indexed extensions.
     *
     * @param file the file to test
     * @return true if the path ends with ".html", ".htm" or ".txt"
     */
    private static boolean isIndexable(File file) {
      String path = file.getPath();
      return path.endsWith(".html") || path.endsWith(".htm") || path.endsWith(".txt");
    }

    /**
     * Tells whether the uid iterator is currently positioned on a uid term.
     *
     * @return true if the iterator has a current term and that term belongs to the uid field
     */
    private boolean atUidTerm() {
      Term current = uidIter.term();
      // equals() instead of ==, so the check does not depend on string interning.
      return current != null && UID_FIELD.equals(current.field());
    }

    /**
     * Deletes from the existing index the documents matching the iterator's current uid term.
     *
     * @throws Exception if the deletion fails
     */
    private void deleteCurrentUid() throws Exception {
      System.out.println("deleting " + HTMLDocument.uid2url(uidIter.term().text()));
      reader.delete(uidIter.term());
    }

    /**
     * Converts a file to a Lucene document and adds it to the index being written.
     *
     * @param file the file to add
     * @throws Exception if the file cannot be converted or written
     */
    private void addDocument(File file) throws Exception {
      Document doc = HTMLDocument.Document(file);
      System.out.println("adding " + doc.get("url"));
      writer.addDocument(doc);
    }

    /**
     * Recursively walks {@code file} in sorted name order.
     *
     * <p>When a uid iterator is open, it is advanced in step with the walk: uid terms
     * that sort before the current file's uid are stale (deleted while in the deletion
     * pass), a term equal to the file's uid means the document is already indexed, and
     * a file with no matching term is added unless this is the deletion pass. Without
     * a uid iterator every indexable file is added unconditionally.
     *
     * @param file directory or file to process
     * @throws Exception if a directory cannot be listed or an index operation fails
     */
    private void indexDocs(File file) throws Exception {
      if (file.isDirectory()) {
        String[] files = file.list();
        if (files == null) {
          // File.list() returns null on an I/O error; fail with a clear message
          // rather than a NullPointerException from Arrays.sort.
          throw new IOException("cannot list directory: " + file.getPath());
        }
        // Sorted so the walk visits files in the same order as the uid iterator.
        Arrays.sort(files);
        for (int i = 0; i < files.length; i++) {
          indexDocs(new File(file, files[i]));
        }
        return;
      }
      if (!isIndexable(file)) {
        return;
      }
      if (uidIter == null) {
        // Creating a new index: add docs unconditionally.
        addDocument(file);
        return;
      }

      String uid = HTMLDocument.uid(file);
      // Skip over (and, in the deletion pass, delete) stale entries preceding this file.
      while (atUidTerm() && uidIter.term().text().compareTo(uid) < 0) {
        if (deleting) {
          deleteCurrentUid();
        }
        uidIter.next();
      }
      if (atUidTerm() && uidIter.term().text().compareTo(uid) == 0) {
        // Already indexed: keep the matching doc.
        uidIter.next();
      } else if (!deleting) {
        addDocument(file);
      }
    }

    /**
     * Walks the directory hierarchy in uid order while keeping the uid iterator of the
     * existing index in sync. Mismatches indicate one of: (a) old documents to be
     * deleted; (b) unchanged documents, to be left alone; or (c) new documents, to be
     * indexed.
     *
     * @param file root of the documents to index
     * @param index location of the existing index (only read when {@code create} is false)
     * @param create true to index everything without consulting an existing index
     * @throws Exception if the existing index cannot be read or an index operation fails
     */
    private void indexDocs(File file, String index, boolean create) throws Exception {
      if (create) {
        // No existing index to compare against.
        indexDocs(file);
        return;
      }

      reader = IndexReader.open(index);
      try {
        uidIter = reader.terms(new Term(UID_FIELD, ""));
        try {
          indexDocs(file);
          if (deleting) {
            // Whatever uid terms remain after the walk have no file any more.
            while (atUidTerm()) {
              deleteCurrentUid();
              uidIter.next();
            }
          }
        } finally {
          deleting = false;
          TermEnum finished = uidIter;
          // Cleared so a later create-mode run does not consult a closed iterator.
          uidIter = null;
          finished.close();
        }
      } finally {
        IndexReader finished = reader;
        reader = null;
        finished.close();
      }
    }

    /**
     * Builds the index. If {@code create} is true a new index is created; otherwise the
     * existing index is refreshed: stale documents are deleted first, then new
     * documents are added. Failures are reported on standard output rather than thrown.
     *
     * @param create true to create a new index, false to refresh the existing one
     */
    public void run(boolean create) {
      if (docsPath == null) {
        System.out.println("root directory is not set");
        return;
      }
      String index = (indexFilePath != null) ? indexFilePath : DEFAULT_INDEX_PATH;
      File root = new File(docsPath);

      try {
        Date start = new Date();

        if (!create) {
          // Maintenance mode: first pass deletes stale docs.
          deleting = true;
          indexDocs(root, index, create);
        }

        writer = new IndexWriter(index, new StandardAnalyzer(), create);
        try {
          writer.maxFieldLength = 1000000;
          // Second pass (or only pass when creating): add new docs.
          indexDocs(root, index, create);
          System.out.println("Optimizing index...");
          writer.optimize();
        } finally {
          // Always close the writer, even after a failure.
          IndexWriter finished = writer;
          writer = null;
          finished.close();
        }

        Date end = new Date();
        System.out.print(end.getTime() - start.getTime());
        System.out.println(" total milliseconds");
      } catch (Exception e) {
        System.out.println(" caught a " + e.getClass() +
            "\n with message: " + e.getMessage());
      } finally {
        // Never leave the instance stuck in deletion mode after a failed run.
        deleting = false;
      }
    }

    /**
     * @return Returns the index file path.
     */
    public String getIndexFilePath() {
      return indexFilePath;
    }

    /**
     * @param indexFilePath The index file path to set.
     */
    public void setIndexFilePath(String indexFilePath) {
      this.indexFilePath = indexFilePath;
    }

    /**
     * @return Returns the document root path.
     */
    public String getDocsPath() {
      return docsPath;
    }

    /**
     * @param docsPath The document root path to set.
     */
    public void setDocsPath(String docsPath) {
      this.docsPath = docsPath;
    }

    /**
     * Test entry point: builds a fresh index.
     *
     * @param args optional: {@code args[0]} is the document root and {@code args[1]}
     *     the index path; the original hard-coded paths are used for any that are missing
     */
    public static void main(String[] args) {
      IndexHTML ih = new IndexHTML();
      ih.setDocsPath(args.length > 0 ? args[0] : "D:\\MyProject\\colimas\\clms-doc2\\html");
      ih.setIndexFilePath(args.length > 1 ? args[1] : "D:\\MyProject\\colimas\\index");
      ih.run(true);
    }
  }

  运行后生成3个文件_3i8.cfs,deletable,segments

  搜索文件类:

  /*

  * Created on 2005/07/28

  *

  * TODO To change the template for this generated file go to

  * Window - Preferences - Java - Code Style - Code Templates

  */package com.nova.colimas.search.query;

  /** * @author tyrone * * TODO To change the template for this generated type comment go to

  * Window - Preferences - Java - Code Style - Code Templates

  */public class HitsHTMLDoc {

  private String Title;

  priva