`
cfan_haifeng
  • 浏览: 120269 次
  • 性别: Icon_minigender_1
  • 来自: 郑州
社区版块
存档分类
最新评论

lucene-segments的文件格式分析

 
阅读更多

Lucene的索引文件格式(2)这篇文章写得非常好,我参考它写了解析segments.gen和segments_1的代码。

代码如下(lucene版本为:lucene-core-3.4.0.jar)

 

 

package format;

import java.io.File;
import java.io.IOException;
import java.util.Collections;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;

import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentInfos;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.IndexInput;

import constant.Constant;

/**
 * Dumps the contents of a Lucene index's {@code segments.gen} and
 * {@code segments_N} files to standard output.
 * <p>
 * The field order follows the index file format analysis at
 * http://www.cnblogs.com/forfuture1978/archive/2009/12/14/1623599.html and
 * the readers in Lucene's own {@code SegmentInfos} / {@code SegmentInfo}
 * classes (written against lucene-core 3.4.0).
 */
public class SegmentFormatAnalysis {

	private SegmentFormatAnalysis() {
	}

	/**
	 * Prints the segment metadata of the index at {@link Constant#INDEX_PATH}
	 * and then opens that folder in the file browser.
	 * 
	 * @param args
	 *            unused
	 * @throws IOException
	 *             if an index file cannot be read or the browser command
	 *             cannot be started
	 */
	public static void main(String[] args) throws IOException {
		getSEGMENTS_GEN();
		// Open the index folder (Windows only: relies on cmd.exe's "start").
		Runtime.getRuntime().exec("cmd.exe /c start " + Constant.INDEX_PATH);

	}

	/**
	 * Reads one per-segment record from {@code input} and prints every field.
	 * The reads must stay in exactly this order because the record is a flat
	 * byte sequence; skipping a field misaligns everything after it.
	 * 
	 * @param input
	 *            stream positioned at the start of a segment record
	 * @param format
	 *            format number read from the head of the segments file; more
	 *            negative means newer
	 * @throws IOException
	 *             if the stream cannot be read
	 */
	private static void readSegmentInfo(IndexInput input, int format)
			throws IOException {
		// The per-segment version string only exists from FORMAT_3_1 onwards.
		if (format <= SegmentInfos.FORMAT_3_1) {
			System.out.println("version:" + input.readString());
		}
		String name = input.readString();
		System.out.println("SegName(段名):" + name);
		int docCount = input.readInt();
		System.out.println("SegSize(此段中包含的文档数):" + docCount);

		System.out.println("format <= SegmentInfos.FORMAT_LOCKLESS:"
				+ (format <= SegmentInfos.FORMAT_LOCKLESS));
		if (format > SegmentInfos.FORMAT_LOCKLESS) {
			// Pre-lockless layouts are not handled by this tool.
			System.err.println("format 有误");
			System.exit(-1);
		}
		long delGen = input.readLong();
		System.out.println("delGen :" + delGen);

		// Shared doc store: when the offset is -1 the segment stores its own
		// stored fields and term vectors.
		int docStoreOffset = -1;
		String docStoreSegment = name;
		boolean docStoreIsCompoundFile = false;
		if (format <= SegmentInfos.FORMAT_SHARED_DOC_STORE) {
			docStoreOffset = input.readInt();
			if (docStoreOffset != -1) {
				docStoreSegment = input.readString();
				docStoreIsCompoundFile = (1 == input.readByte());
			}
		}
		System.out
				.println("docStoreOffset(如果DocStoreOffset为-1,则此段单独存储自己的域(Stored Field)和词向量(Term Vector)) :"
						+ docStoreOffset);
		System.out.println("docStoreSegment (docStoreSegment是域和词向量信息存储的段):"
				+ docStoreSegment);
		System.out.println("docStoreIsCompoundFile :" + docStoreIsCompoundFile);

		boolean hasSingleNormFile = false;
		if (format <= SegmentInfos.FORMAT_SINGLE_NORM_FILE) {
			hasSingleNormFile = (1 == input.readByte());
		}
		System.out.println("hasSingleNormFile :" + hasSingleNormFile);

		// numNormGen is followed by that many longs; they must be consumed or
		// every later field is read from the wrong offset. A value of -1
		// means "none" and the loop is skipped.
		int numNormGen = input.readInt();
		System.out.println("numNormGen :" + numNormGen);
		for (int j = 0; j < numNormGen; j++) {
			System.out.println("normGen[" + j + "] :" + input.readLong());
		}

		byte isCompoundFile = input.readByte();
		System.out.println("isCompoundFile :" + isCompoundFile);

		int delCount = -1;
		if (format <= SegmentInfos.FORMAT_DEL_COUNT) {
			delCount = input.readInt();
			assert delCount <= docCount;
		}
		System.out.println("delCount :" + delCount);

		boolean hasProx = true;
		if (format <= SegmentInfos.FORMAT_HAS_PROX) {
			hasProx = input.readByte() == 1;
		}
		System.out.println("hasProx :" + hasProx);

		Map<String, String> diagnostics;
		if (format <= SegmentInfos.FORMAT_DIAGNOSTICS) {
			diagnostics = input.readStringStringMap();
		} else {
			diagnostics = Collections.<String, String> emptyMap();
		}
		System.out.println("diagnostics :" + diagnostics);

		if (format <= SegmentInfos.FORMAT_HAS_VECTORS) {
			boolean hasVectors = input.readByte() == 1;
			System.out.println("hasVectors :" + hasVectors);
		} else {
			// Older formats do not store this flag; nothing to read.
			System.out.println("hasVectors : (not recorded in this format)");
		}
	}// end of method

	/**
	 * Reads a {@code segments_N} file and prints its header, every segment
	 * record, the user data and the checksum comparison.
	 * 
	 * @param segmentFileName
	 *            name of the segments file inside the index directory
	 * @throws IOException
	 *             if the file cannot be read, or
	 *             {@link CorruptIndexException} if the stored checksum does
	 *             not match the computed one
	 */
	private static void getSEGMENTS_N(String segmentFileName)
			throws IOException {
		System.out.println("---------------------------");
		Directory directory = FSDirectory.open(new File(Constant.INDEX_PATH));
		try {
			ChecksumIndexInput input = new ChecksumIndexInput(directory
					.openInput(segmentFileName));
			try {
				// The value is -3 for Lucene 2.1 and -9 for Lucene 2.9.
				int format = input.readInt();
				System.out.println("Format:" + format);
				System.out.println("Version(估计其实存储的时索引最后修改的时间毫秒数):"
						+ input.readLong());

				// NameCount is the name of the next new segment. All files of
				// one segment share the segment name as prefix (_0.xxx,
				// _1.xxx, ...); a NameCount of 2 means the next segment will
				// be _2.xxx.
				System.out.println("NameCount:" + input.readInt());

				int segCount = input.readInt();
				System.out.println("SegCount(Segment的个数):" + segCount);
				// Number the records in the order they are stored in the file.
				for (int i = 1; i <= segCount; i++) {
					System.out.println("第" + i + "段信息 begin+++++++++++");
					readSegmentInfo(input, format);
					System.out.println("第" + i + "段信息 end +++++++++++");
				}

				if (format >= 0) {
					// In the old format the version number may be at the end
					// of the file, or missing altogether.
					long version;
					if (input.getFilePointer() >= input.length()) {
						version = System.currentTimeMillis();
					} else {
						version = input.readLong();
					}
					System.out.println("version:" + version);
				}

				// User-supplied string-to-string map stored with the commit.
				Map<String, String> userData = Collections
						.<String, String> emptyMap();
				if (format <= SegmentInfos.FORMAT_USER_DATA) {
					if (format <= SegmentInfos.FORMAT_DIAGNOSTICS) {
						userData = input.readStringStringMap();
					} else if (0 != input.readByte()) {
						userData = Collections.singletonMap("userData", input
								.readString());
					}
				}
				System.out.println("遍历userData:");
				for (Map.Entry<String, String> entry : userData.entrySet()) {
					System.out.println(entry.getKey() + "--->"
							+ entry.getValue());
				}

				if (format <= SegmentInfos.FORMAT_CHECKSUM) {
					// The running checksum must be taken before the stored
					// one is read, since reading advances the checksum.
					final long checksumNow = input.getChecksum();
					final long checksumThen = input.readLong();
					System.out.println("checksumNow:" + checksumNow
							+ "\tchecksumThen:" + checksumThen);
					if (checksumNow != checksumThen) {
						throw new CorruptIndexException(
								"checksum mismatch in segments file");
					}
				}
			} finally {
				input.close();
			}
		} finally {
			directory.close();
		}
	}// end of method

	/**
	 * Reads {@code segments.gen}, prints the two generation numbers it
	 * holds and then dumps the {@code segments_N} file of the second one.
	 * Exits the JVM with status -1 if the format number in
	 * {@code segments.gen} is not {@code SegmentInfos.FORMAT_LOCKLESS}.
	 * 
	 * @throws IOException
	 *             if an index file cannot be read
	 */
	public static void getSEGMENTS_GEN() throws IOException {
		int version;
		long gen0 = -1;
		long gen1 = -1;

		Directory dir = FSDirectory.open(new File(Constant.INDEX_PATH));
		try {
			IndexInput genInput = dir.openInput(IndexFileNames.SEGMENTS_GEN);// "segments.gen"
			try {
				version = genInput.readInt();
				System.out.println("version:" + version);
				if (version == SegmentInfos.FORMAT_LOCKLESS) {
					// The generation is stored twice in the file.
					gen0 = genInput.readLong();
					gen1 = genInput.readLong();
				}
			} finally {
				genInput.close();
			}
		} finally {
			dir.close();
		}

		if (version != SegmentInfos.FORMAT_LOCKLESS) {
			System.out.println("version错误:" + version);
			System.exit(-1);
		}

		System.out.println("gen0:" + gen0 + "\tgen1:" + gen1);
		if (gen0 != gen1) {
			// NOTE(review): Lucene itself only trusts segments.gen when both
			// copies agree; we still proceed with gen1 as before.
			System.err.println("segments.gen: gen0 != gen1, file may be stale");
		}

		// Segments file names carry the generation in base 36, not decimal
		// (generation 10 is "segments_a").
		String segmentFileName = IndexFileNames.SEGMENTS + "_"
				+ Long.toString(gen1, Character.MAX_RADIX);
		System.out.println("*************************" + segmentFileName
				+ "分析结果为:" + "*************************");
		getSEGMENTS_N(segmentFileName);
	}

}
 

 

………………

 

分享到:
评论
1 楼 chris开到荼縻 2013-07-13  
有没有打包的代码啊?好长啊

相关推荐

    lucene-analyzers-smartcn-7.7.0-API文档-中文版.zip

    赠送Maven依赖信息文件:lucene-analyzers-smartcn-7.7.0.pom; 包含翻译后的API文档:lucene-analyzers-smartcn-7.7.0-javadoc-API文档-中文(简体)版.zip; Maven坐标:org.apache.lucene:lucene-analyzers-smartcn...

    lucene-core-2.9.4,lucene-core-3.0.2,lucene-core-3.0.3,lucene-core-3.4.0

    lucene-core-2.9.4,lucene-core-3.0.2,lucene-core-3.0.3,lucene-core-3.4.0

    lucene-core-7.7.0-API文档-中文版.zip

    赠送Maven依赖信息文件:lucene-core-7.7.0.pom; 包含翻译后的API文档:lucene-core-7.7.0-javadoc-API文档-中文(简体)版.zip; Maven坐标:org.apache.lucene:lucene-core:7.7.0; 标签:apache、lucene、core、...

    lucene-analyzers-common-6.6.0-API文档-中文版.zip

    赠送Maven依赖信息文件:lucene-analyzers-common-6.6.0.pom; 包含翻译后的API文档:lucene-analyzers-common-6.6.0-javadoc-API文档-中文(简体)版.zip; Maven坐标:org.apache.lucene:lucene-analyzers-common:...

    lucene-core-7.2.1-API文档-中文版.zip

    赠送Maven依赖信息文件:lucene-core-7.2.1.pom; 包含翻译后的API文档:lucene-core-7.2.1-javadoc-API文档-中文(简体)版.zip; Maven坐标:org.apache.lucene:lucene-core:7.2.1; 标签:apache、lucene、core、...

    lucene-suggest-6.6.0-API文档-中文版.zip

    赠送Maven依赖信息文件:lucene-suggest-6.6.0.pom; 包含翻译后的API文档:lucene-suggest-6.6.0-javadoc-API文档-中文(简体)版.zip; Maven坐标:org.apache.lucene:lucene-suggest:6.6.0; 标签:apache、lucene...

    lucene-backward-codecs-7.3.1-API文档-中英对照版.zip

    赠送Maven依赖信息文件:lucene-backward-codecs-7.3.1.pom; 包含翻译后的API文档:lucene-backward-codecs-7.3.1-javadoc-API文档-中文(简体)-英语-对照版.zip; Maven坐标:org.apache.lucene:lucene-backward-...

    lucene-core-6.6.0-API文档-中文版.zip

    赠送Maven依赖信息文件:lucene-core-6.6.0.pom; 包含翻译后的API文档:lucene-core-6.6.0-javadoc-API文档-中文(简体)版.zip; Maven坐标:org.apache.lucene:lucene-core:6.6.0; 标签:core、apache、lucene、...

    lucene-highlighter-6.6.0-API文档-中文版.zip

    赠送Maven依赖信息文件:lucene-highlighter-6.6.0.pom; 包含翻译后的API文档:lucene-highlighter-6.6.0-javadoc-API文档-中文(简体)版.zip; Maven坐标:org.apache.lucene:lucene-highlighter:6.6.0; 标签:...

    lucene-memory-6.6.0-API文档-中文版.zip

    赠送Maven依赖信息文件:lucene-memory-6.6.0.pom; 包含翻译后的API文档:lucene-memory-6.6.0-javadoc-API文档-中文(简体)版.zip; Maven坐标:org.apache.lucene:lucene-memory:6.6.0; 标签:apache、memory、...

    lucene-analyzers-smartcn-7.7.0-API文档-中英对照版.zip

    赠送Maven依赖信息文件:lucene-analyzers-smartcn-7.7.0.pom; 包含翻译后的API文档:lucene-analyzers-smartcn-7.7.0-javadoc-API文档-中文(简体)-英语-对照版.zip; Maven坐标:org.apache.lucene:lucene-...

    lucene-spatial-extras-7.3.1-API文档-中英对照版.zip

    赠送Maven依赖信息文件:lucene-spatial-extras-7.3.1.pom; 包含翻译后的API文档:lucene-spatial-extras-7.3.1-javadoc-API文档-中文(简体)-英语-对照版.zip; Maven坐标:org.apache.lucene:lucene-spatial-...

    lucene-suggest-7.7.0-API文档-中文版.zip

    赠送Maven依赖信息文件:lucene-suggest-7.7.0.pom; 包含翻译后的API文档:lucene-suggest-7.7.0-javadoc-API文档-中文(简体)版.zip; Maven坐标:org.apache.lucene:lucene-suggest:7.7.0; 标签:apache、lucene...

    lucene-sandbox-7.2.1-API文档-中文版.zip

    赠送Maven依赖信息文件:lucene-sandbox-7.2.1.pom; 包含翻译后的API文档:lucene-sandbox-7.2.1-javadoc-API文档-中文(简体)版.zip; Maven坐标:org.apache.lucene:lucene-sandbox:7.2.1; 标签:apache、lucene...

    lucene-spatial-extras-7.2.1-API文档-中英对照版.zip

    赠送Maven依赖信息文件:lucene-spatial-extras-7.2.1.pom; 包含翻译后的API文档:lucene-spatial-extras-7.2.1-javadoc-API文档-中文(简体)-英语-对照版.zip; Maven坐标:org.apache.lucene:lucene-spatial-...

    lucene-spatial-6.6.0-API文档-中文版.zip

    赠送Maven依赖信息文件:lucene-spatial-6.6.0.pom; 包含翻译后的API文档:lucene-spatial-6.6.0-javadoc-API文档-中文(简体)版.zip; Maven坐标:org.apache.lucene:lucene-spatial:6.6.0; 标签:apache、lucene...

    lucene-misc-6.6.0-API文档-中文版.zip

    赠送Maven依赖信息文件:lucene-misc-6.6.0.pom; 包含翻译后的API文档:lucene-misc-6.6.0-javadoc-API文档-中文(简体)版.zip; Maven坐标:org.apache.lucene:lucene-misc:6.6.0; 标签:apache、lucene、misc、...

    lucene-spatial-extras-6.6.0-API文档-中英对照版.zip

    赠送Maven依赖信息文件:lucene-spatial-extras-6.6.0.pom; 包含翻译后的API文档:lucene-spatial-extras-6.6.0-javadoc-API文档-中文(简体)-英语-对照版.zip; Maven坐标:org.apache.lucene:lucene-spatial-...

    lucene-backward-codecs-6.6.0-API文档-中英对照版.zip

    赠送Maven依赖信息文件:lucene-backward-codecs-6.6.0.pom; 包含翻译后的API文档:lucene-backward-codecs-6.6.0-javadoc-API文档-中文(简体)-英语-对照版.zip; Maven坐标:org.apache.lucene:lucene-backward-...

    lucene-backward-codecs-6.6.0-API文档-中文版.zip

    赠送Maven依赖信息文件:lucene-backward-codecs-6.6.0.pom; 包含翻译后的API文档:lucene-backward-codecs-6.6.0-javadoc-API文档-中文(简体)版.zip; Maven坐标:org.apache.lucene:lucene-backward-codecs:6.6.0...

Global site tag (gtag.js) - Google Analytics