lucene3.0已于2009-11-25發布啦,但網上的入門實例都是針對lucene3.0以前的,相對于以前的版本,貌似改動不小。
/*
 * Starting from scratch, I studied the Chinese edition of "Lucene in Action" and, with the
 * help of the Lucene 3.0 documentation, wrote an introductory example. It may serve as a
 * reference for beginners who, like me, start learning directly with Lucene 3.0.
 * (The places that changed significantly are marked in blue.)
 *
 * Introductory example:
 *
 * 1. Preprocessing: take an e-book of "Romance of the Three Kingdoms" downloaded from the web,
 *    "三國演義.txt" (any other text file will do), and split it into many small files.
 */

/**
 * Preprocessing step: splits one large text file into a series of small files.
 *
 * @author ht
 */
public class FilePreprocess {

    /** Size threshold in bytes (platform default charset) at which a chunk is written out. */
    private static final int MAX_SIZE = 10240;

    /** File-name prefix of the generated small files. */
    private static final String OUTPUT_PREFIX = "output";

    /**
     * Splits the hard-coded source file into small files under the hard-coded output directory,
     * creating that directory first when it does not exist.
     *
     * @param arg unused
     */
    public static void main(String[] arg) {
        String outputpath = "D:\\test\\small\\"; // directory that receives the small files
        String filename = "D:\\test\\三國演義.txt"; // location of the original large file
        File outputDir = new File(outputpath);
        if (!outputDir.exists()) {
            outputDir.mkdirs();
        }
        splitToSmallFiles(new File(filename), outputpath);
    }

    /**
     * Splits a large text file into small ones named {@code output0.txt}, {@code output1.txt}, ...
     *
     * <p>Lines are accumulated (each terminated with CRLF) until the accumulated text reaches
     * {@link #MAX_SIZE} bytes; the chunk is then written to the next numbered file. Lines are
     * never split, so a file may exceed the threshold by up to one line. Whatever remains at
     * the end of the input goes into a final file. An empty input still yields an empty
     * {@code output0.txt}, but no empty trailing file is produced when the input happens to
     * end exactly on a chunk boundary.
     *
     * <p>I/O failures are reported on the console and not propagated to the caller.
     *
     * @param file       the large file to split; read with the platform default charset
     * @param outputpath directory prefix for the small files; it is concatenated directly with
     *                   the file name, so it must end with a path separator
     */
    public static void splitToSmallFiles(File file, String outputpath) {
        int filePointer = 0;
        try {
            BufferedReader reader = new BufferedReader(new FileReader(file));
            try {
                StringBuilder buffer = new StringBuilder();
                String line = reader.readLine();
                while (line != null) {
                    buffer.append(line).append("\r\n");
                    String chunk = buffer.toString();
                    if (chunk.getBytes().length >= MAX_SIZE) {
                        writeChunk(outputpath + OUTPUT_PREFIX + filePointer + ".txt", chunk);
                        filePointer++;
                        buffer = new StringBuilder();
                    }
                    line = reader.readLine();
                }
                // Write the remainder. Skip it when it is empty and at least one file already
                // exists, so an input ending on a chunk boundary leaves no empty extra file.
                if (buffer.length() > 0 || filePointer == 0) {
                    writeChunk(outputpath + OUTPUT_PREFIX + filePointer + ".txt", buffer.toString());
                }
            } finally {
                // Release the input file handle even when reading or writing fails.
                reader.close();
            }
            System.out.println("The file hava splited to small files !");
        } catch (FileNotFoundException e) {
            System.out.println("file not found !");
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Writes {@code content} to the file at {@code path}, always closing the writer. */
    private static void writeChunk(String path, String content) throws IOException {
        BufferedWriter writer = new BufferedWriter(new FileWriter(path));
        try {
            writer.write(content);
        } finally {
            writer.close();
        }
    }
}

/*
 * 2. Building the index with Lucene 3.0: the generated small files are indexed with
 *    Lucene 3.0; Chinese tokenization uses the StandardAnalyzer bundled with Lucene 3.0.
 */
/** * @author ht * 索引生成 * */ public class Indexer { private static String INDEX_DIR = "D:\\test\\index";//索引存放目錄 private static String DATA_DIR = "D:\\test\\small\\";//小文件存放的目錄 public static void main(String[] args) throws Exception { long start = new Date().getTime(); int numIndexed = index(new File(INDEX_DIR), new File(DATA_DIR));//調(diào)用index方法 long end = new Date().getTime(); System.out.println("Indexing " + numIndexed + " files took " + (end - start) + " milliseconds"); } /**索引dataDir下的.txt文件,,并儲存在indexDir下,,返回索引的文件數(shù)量 * @param indexDir * @param dataDir * @return int * @throws IOException */ public static int index(File indexDir, File dataDir) throws IOException { if (!dataDir.exists() || !dataDir.isDirectory()) { throw new IOException(dataDir + " does not exist or is not a directory"); } IndexWriter writer = new IndexWriter(FSDirectory.open(indexDir), new StandardAnalyzer(Version.LUCENE_CURRENT), true, IndexWriter.MaxFieldLength.LIMITED); indexDirectory(writer, dataDir);//調(diào)用indexDirectory方法 int numIndexed = writer.numDocs(); writer.optimize(); writer.close(); return numIndexed; } /**循環(huán)遍歷目錄下的所有.txt文件并進(jìn)行索引 * @param writer * @param dir * @throws IOException */ private static void indexDirectory(IndexWriter writer, File dir) throws IOException { File[] files = dir.listFiles(); for (int i = 0; i < files.length; i++) { File f = files[i]; if (f.isDirectory()) { indexDirectory(writer, f); // recurse } else if (f.getName().endsWith(".txt")) { indexFile(writer, f); } } } /**對單個(gè)txt文件進(jìn)行索引 * @param writer * @param f * @throws IOException */ private static void indexFile(IndexWriter writer, File f) throws IOException { if (f.isHidden() || !f.exists() || !f.canRead()) { return; } System.out.println("Indexing " + f.getCanonicalPath()); Document doc = new Document(); doc.add(new Field("contents",new FileReader(f))); doc.add(new Field("filename",f.getCanonicalPath(),Field.Store.YES, Field.Index.ANALYZED)); writer.addDocument(doc); } } 
3.查詢類:查詢“玄德”! /** * @author ht * 查詢 * */ public class Searcher { private static String INDEX_DIR = "D:\\test\\index\\";//索引所在的路徑 private static String KEYWORD = "玄德";//關(guān)鍵詞 private static int TOP_NUM = 100;//顯示前100條結(jié)果 public static void main(String[] args) throws Exception { File indexDir = new File(INDEX_DIR); if (!indexDir.exists() || !indexDir.isDirectory()) { throw new Exception(indexDir + " does not exist or is not a directory."); } search(indexDir, KEYWORD);//調(diào)用search方法進(jìn)行查詢 } /**查詢 * @param indexDir * @param q * @throws Exception */ public static void search(File indexDir, String q) throws Exception { IndexSearcher is = new IndexSearcher(FSDirectory.open(indexDir),true);//read-only String field = "contents"; QueryParser parser = new QueryParser(Version.LUCENE_CURRENT, field, new StandardAnalyzer(Version.LUCENE_CURRENT)); Query query = parser.parse(q); TopScoreDocCollector collector = TopScoreDocCollector.create(TOP_NUM , false); long start = new Date().getTime();// start time is.search(query, collector); ScoreDoc[] hits = collector.topDocs().scoreDocs; System.out.println(hits.length); for (int i = 0; i < hits.length; i++) { Document doc = is.doc(hits[i].doc);//new method is.doc() System.out.println(doc.getField("filename")+" "+hits[i].toString()+" "); } long end = new Date().getTime();//end time System.out.println("Found " + collector.getTotalHits() + " document(s) (in " + (end - start) + " milliseconds) that matched query '" + q + "':"); } } |
|