本文共 5317 字,大约阅读时间需要 17 分钟。
相当于一个百度搜索系统
几个名词解释
Lucene简介
第一步,创建索引和查询索引
# Mirror the HTML pages of www.bjsxt.com into /root/data (presumably the sample corpus that gets indexed below).
#   -o /tmp/wget.log   write wget's messages to this log file
#   -P /root/data      directory prefix under which downloads are saved
#   --no-parent        never ascend to the parent directory while recursing
#   -m                 mirror mode (recursive download)
#   -D www.bjsxt.com   only follow links on this domain
#   -N                 timestamping: skip files that are not newer than the local copy
#   --convert-links    rewrite links in saved pages for local viewing
#   --random-wait      randomize the delay between requests
#   -A html,HTML       keep only files with these suffixes
wget -o /tmp/wget.log -P /root/data --no-parent --no-verbose -m -D www.bjsxt.com -N --convert-links --random-wait -A html,HTML http://www.bjsxt.com
package com.sxt.lucene;import org.apache.commons.io.FileUtils;import org.apache.lucene.analysis.Analyzer;import org.apache.lucene.analysis.standard.StandardAnalyzer;import org.apache.lucene.document.Document;import org.apache.lucene.document.Field;import org.apache.lucene.document.LongField;import org.apache.lucene.document.StringField;import org.apache.lucene.document.TextField;import org.apache.lucene.index.IndexWriter;import org.apache.lucene.index.IndexWriterConfig;import org.apache.lucene.store.Directory;import org.apache.lucene.store.FSDirectory;import org.apache.lucene.util.Version;import org.junit.Test;import java.io.File;import java.io.IOException;/** * @author: ZouTai * @date: 2018/3/28 * @description: 创建索引 */public class CreateIndex { // 静态变量,资源位置 static String dataDir = "E:/JavaEE_IJ_WorkSpace/lucene/Data/data"; static String indexDir = "E:/JavaEE_IJ_WorkSpace/lucene/Data/index"; @Test public void createIndex() { try { // 文件和分析器 Directory dir = FSDirectory.open(new File(indexDir)); Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_4_9); // 写入索引配置 IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_4_9, analyzer); indexWriterConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND); IndexWriter indexWriter = new IndexWriter(dir, indexWriterConfig); // 遍历文件 File file = new File(dataDir); File[] files = file.listFiles(); for(File f : files) { Document document = new Document(); // 文件名、内容、最后修改时间 document.add(new StringField("filename", f.getName(), Field.Store.YES)); document.add(new TextField("content", FileUtils.readFileToString(f), Field.Store.YES)); document.add(new LongField("lastModify", f.lastModified(), Field.Store.YES)); indexWriter.addDocument(document); } indexWriter.close(); } catch (IOException e) { e.printStackTrace(); } }}
package com.sxt.lucene;import org.apache.lucene.analysis.Analyzer;import org.apache.lucene.analysis.standard.StandardAnalyzer;import org.apache.lucene.document.Document;import org.apache.lucene.index.DirectoryReader;import org.apache.lucene.index.IndexReader;import org.apache.lucene.queryparser.classic.QueryParser;import org.apache.lucene.search.IndexSearcher;import org.apache.lucene.search.Query;import org.apache.lucene.search.ScoreDoc;import org.apache.lucene.search.TopDocs;import org.apache.lucene.store.Directory;import org.apache.lucene.store.FSDirectory;import org.apache.lucene.util.Version;import org.junit.Test;import java.io.File;/** * @author: ZouTai * @date: 2018/3/29 * @description: 查询索引 */public class SearchIndex { @Test public void searchIndex() { try { Directory directory = FSDirectory.open(new File(CreateIndex.indexDir)); IndexReader indexReader = DirectoryReader.open(directory); IndexSearcher indexSearcher = new IndexSearcher(indexReader); Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_4_9); QueryParser queryParser = new QueryParser(Version.LUCENE_4_9, "content", analyzer); Query query = queryParser.parse("form"); TopDocs topDocs = indexSearcher.search(query, 10); ScoreDoc[] scoreDocs = topDocs.scoreDocs; for (ScoreDoc sd : scoreDocs) { int docId = sd.doc; Document document = indexReader.document(docId); System.out.println(document.get("filename")); } } catch (Exception e) { e.printStackTrace(); } }}
转载地址:http://oqepi.baihongyu.com/