Lucene 3.0 入門實例

tinglu10 2010-12-18

展開全文

lucene3.0已于2009-11-25發布啦，但網上的入門實例都是針對lucene3.0以前的，相對于以前的版本，貌似改動不小。

本人從零開始學習《lucene in action中文版》，并結合lucene3.0文檔寫了個入門實例，可供像我一樣直接從lucene3.0開始學習的初學者參考！（變化大的地方用藍字標出來了）

入門實例：

1.預處理：先把網上下載的一個《三國演義》電子書“三國演義.txt”（可用其他代替，呵呵）切割成多個小文件。
/**
 * Pre-processing step: splits one large text file into numbered small files
 * ({@code output0.txt}, {@code output1.txt}, ...) so that each piece can be
 * handled as a separate document afterwards.
 *
 * @author ht
 */
public class FilePreprocess {
    /** Default size threshold, in bytes, at which the current small file is flushed. */
    private static final int DEFAULT_MAX_SIZE = 10240;

    /** File-name prefix shared by every generated small file. */
    private static final String OUTPUT_PREFIX = "output";

    /**
     * Splits the sample e-book into small files, creating the output directory first if needed.
     *
     * @param arg ignored
     */
    public static void main(String[] arg) {
        String outputpath = "D:\\test\\small\\"; // directory that receives the small files
        String filename = "D:\\test\\三國演義.txt"; // the large source file
        File outputDir = new File(outputpath);
        if (!outputDir.exists()) {
            outputDir.mkdirs();
        }

        splitToSmallFiles(new File(filename), outputpath);
    }

    /**
     * Splits a large text file into small files using the default size threshold.
     *
     * @param file the text file to split
     * @param outputpath path prefix for the small files; it is concatenated directly with the
     *     generated file name, so a directory must end with a path separator
     */
    public static void splitToSmallFiles(File file, String outputpath) {
        splitToSmallFiles(file, outputpath, DEFAULT_MAX_SIZE);
    }

    /**
     * Splits a large text file into small files.
     *
     * <p>The input is read line by line and every line is re-terminated with {@code "\r\n"}.
     * As soon as the accumulated text reaches {@code maxSize} bytes (platform default
     * encoding) it is written out as the next numbered file. Whatever remains at the end of
     * the input, possibly nothing, is always written as one final file. Lines are never cut
     * in half, so a small file may exceed {@code maxSize} by up to one line.
     *
     * <p>I/O failures are reported on the console and do not propagate to the caller.
     *
     * @param file the text file to split
     * @param outputpath path prefix for the small files; it is concatenated directly with the
     *     generated file name, so a directory must end with a path separator
     * @param maxSize size threshold in bytes; with a value of zero or less every line ends up
     *     in a file of its own
     */
    public static void splitToSmallFiles(File file, String outputpath, int maxSize) {
        try {
            BufferedReader reader = new BufferedReader(new FileReader(file));
            try {
                StringBuilder buffer = new StringBuilder();
                int bufferedBytes = 0; // running size of buffer, avoids re-encoding it per line
                int filePointer = 0;   // sequence number of the next small file
                String line;
                while ((line = reader.readLine()) != null) {
                    String terminated = line + "\r\n";
                    buffer.append(terminated);
                    bufferedBytes += terminated.getBytes().length;
                    if (bufferedBytes >= maxSize) {
                        writeChunk(outputpath, filePointer, buffer.toString());
                        filePointer++;
                        buffer.setLength(0);
                        bufferedBytes = 0;
                    }
                }
                // The remainder is written unconditionally, even when it is empty.
                writeChunk(outputpath, filePointer, buffer.toString());
                System.out.println("The file hava splited to small files !");
            } finally {
                // Release the input file even when reading or writing failed.
                reader.close();
            }
        } catch (FileNotFoundException e) {
            System.out.println("file not found !");
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Writes one small file and closes it even when the write itself fails. */
    private static void writeChunk(String outputpath, int index, String content)
            throws IOException {
        BufferedWriter writer =
                new BufferedWriter(new FileWriter(outputpath + OUTPUT_PREFIX + index + ".txt"));
        try {
            writer.write(content);
        } finally {
            writer.close();
        }
    }
}

2.用lucene3.0生成索引類:用lucene3.0對生成的多個小文件進行索引，中文分詞用的是lucene3.0自帶的StandardAnalyzer.
/**
 * Builds a Lucene index over every {@code .txt} file found under a data directory.
 *
 * @author ht
 */
public class Indexer {
    /** Directory in which the index is stored. */
    private static final String INDEX_DIR = "D:\\test\\index";
    /** Directory that holds the small text files to index. */
    private static final String DATA_DIR = "D:\\test\\small\\";

    /**
     * Indexes {@link #DATA_DIR} into {@link #INDEX_DIR} and prints how long it took.
     *
     * @param args ignored
     * @throws Exception whatever {@link #index(File, File)} propagates
     */
    public static void main(String[] args) throws Exception {

        long start = System.currentTimeMillis();
        int numIndexed = index(new File(INDEX_DIR), new File(DATA_DIR));
        long end = System.currentTimeMillis();
        System.out.println("Indexing " + numIndexed + " files took " + (end - start) + " milliseconds");
    }

    /**
     * Indexes the {@code .txt} files under {@code dataDir} and stores the index in
     * {@code indexDir}.
     *
     * @param indexDir directory that receives the index
     * @param dataDir directory whose {@code .txt} files are indexed, sub-directories included
     * @return the document count reported by the writer after indexing
     * @throws IOException if {@code dataDir} does not exist or is not a directory, or if
     *     reading the files or writing the index fails
     */
    public static int index(File indexDir, File dataDir) throws IOException {

        if (!dataDir.exists() || !dataDir.isDirectory()) {
            throw new IOException(dataDir + " does not exist or is not a directory");
        }

        IndexWriter writer = new IndexWriter(FSDirectory.open(indexDir), new StandardAnalyzer(Version.LUCENE_CURRENT), true,
                IndexWriter.MaxFieldLength.LIMITED);
        try {
            indexDirectory(writer, dataDir);
            int numIndexed = writer.numDocs();
            writer.optimize();
            return numIndexed;
        } finally {
            // Always close the writer, even when indexing failed half-way.
            writer.close();
        }
    }

    /**
     * Recursively walks {@code dir} and indexes every file whose name ends with {@code .txt}.
     *
     * @param writer the open index writer that receives the documents
     * @param dir the directory to walk
     * @throws IOException if the directory cannot be listed or a file cannot be indexed
     */
    private static void indexDirectory(IndexWriter writer, File dir)
            throws IOException {

        File[] files = dir.listFiles();
        if (files == null) {
            // listFiles() yields null on an I/O error or when dir is not a directory.
            throw new IOException("Unable to list the contents of " + dir);
        }

        for (File f : files) {
            if (f.isDirectory()) {
                indexDirectory(writer, f); // recurse
            } else if (f.getName().endsWith(".txt")) {
                indexFile(writer, f);
            }
        }
    }

    /**
     * Adds one text file to the index as a document with a {@code contents} field fed from
     * the file and a stored {@code filename} field holding its canonical path. Hidden,
     * missing or unreadable files are skipped silently.
     *
     * @param writer the open index writer that receives the document
     * @param f the file to index
     * @throws IOException if the file cannot be read or the document cannot be added
     */
    private static void indexFile(IndexWriter writer, File f)
            throws IOException {

        if (f.isHidden() || !f.exists() || !f.canRead()) {
            return;
        }

        System.out.println("Indexing " + f.getCanonicalPath());
        FileReader contents = new FileReader(f);
        try {
            Document doc = new Document();
            doc.add(new Field("contents", contents));
            doc.add(new Field("filename", f.getCanonicalPath(), Field.Store.YES, Field.Index.ANALYZED));

            writer.addDocument(doc);
        } finally {
            // The reader is only needed until addDocument returns; closing it here makes
            // sure the file handle is released on every path.
            contents.close();
        }
    }
}

3.查詢類：查詢“玄德”！
/**
 * Runs a keyword query against the index and prints the best matching documents.
 *
 * @author ht
 */
public class Searcher {
    /** Directory that holds the index. */
    private static final String INDEX_DIR = "D:\\test\\index\\";
    /** Keyword to search for. */
    private static final String KEYWORD = "玄德";
    /** Maximum number of hits to collect and print. */
    private static final int TOP_NUM = 100;

    /**
     * Searches the index in {@link #INDEX_DIR} for {@link #KEYWORD}.
     *
     * @param args ignored
     * @throws Exception if the index directory is missing or is not a directory, or if the
     *     search itself fails
     */
    public static void main(String[] args) throws Exception {
        File indexDir = new File(INDEX_DIR);
        if (!indexDir.exists() || !indexDir.isDirectory()) {
            throw new Exception(indexDir +
                    " does not exist or is not a directory.");
        }
        search(indexDir, KEYWORD);
    }

    /**
     * Parses {@code q} as a query on the {@code contents} field, collects up to
     * {@link #TOP_NUM} hits and prints each hit followed by a summary line with the total
     * hit count and the elapsed time.
     *
     * @param indexDir directory that holds the index
     * @param q the query text
     * @throws Exception propagated from opening the index, parsing the query or searching
     */
    public static void search(File indexDir, String q) throws Exception {
        IndexSearcher is = new IndexSearcher(FSDirectory.open(indexDir), true); // read-only
        try {
            String field = "contents";

            QueryParser parser = new QueryParser(Version.LUCENE_CURRENT, field, new StandardAnalyzer(Version.LUCENE_CURRENT));
            Query query = parser.parse(q);

            TopScoreDocCollector collector = TopScoreDocCollector.create(TOP_NUM, false);

            long start = System.currentTimeMillis();

            is.search(query, collector);
            ScoreDoc[] hits = collector.topDocs().scoreDocs;

            System.out.println(hits.length);
            for (int i = 0; i < hits.length; i++) {
                Document doc = is.doc(hits[i].doc); // load the document behind this hit
                // String concatenation prints the Field's own string form, not just its value.
                System.out.println(doc.getField("filename") + "   " + hits[i].toString() + " ");
            }
            // The measured time covers the search as well as loading and printing the hits.
            long end = System.currentTimeMillis();

            System.out.println("Found " + collector.getTotalHits() +
                    " document(s) (in " + (end - start) +
                    " milliseconds) that matched query '" +
                    q + "':");
        } finally {
            // Release the searcher (and the index reader it opened) on every path.
            is.close();
        }
    }
}

本站是提供個人知識管理的網絡存儲空間，所有內容均由用戶發布，不代表本站觀點。請注意甄別內容中的聯系方式、誘導購買等信息，謹防詐騙。如發現有害或侵權內容，請點擊一鍵舉報。

轉藏 分享

QQ空間 QQ好友新浪微博微信

獻花（0） +1

來自： tinglu10 > 《我的圖書館》

舉報/認領