├── .gitignore ├── .settings ├── org.eclipse.jdt.core.prefs └── org.eclipse.jdt.ui.prefs ├── HISTORY.md ├── README.md ├── pom.xml └── src ├── main ├── java │ └── org │ │ └── clueweb │ │ ├── clueweb09 │ │ ├── ClueWeb09WarcRecord.java │ │ ├── app │ │ │ ├── CountWarcRecordsNew.java │ │ │ └── CountWarcRecordsOld.java │ │ ├── mapred │ │ │ └── ClueWeb09InputFormat.java │ │ └── mapreduce │ │ │ └── ClueWeb09InputFormat.java │ │ ├── clueweb12 │ │ ├── ClueWeb12WarcRecord.java │ │ ├── app │ │ │ ├── BuildDictionary.java │ │ │ ├── BuildPForDocVectors.java │ │ │ ├── BuildVByteDocVectors.java │ │ │ ├── BuildWarcTrecIdMapping.java │ │ │ ├── ComputeTermStatistics.java │ │ │ ├── CountWarcRecordsNew.java │ │ │ ├── CountWarcRecordsOld.java │ │ │ ├── DumpWarcRecordsToPlainText.java │ │ │ ├── DumpWarcRecordsToTermIds.java │ │ │ ├── LMRetrieval.java │ │ │ ├── LookupWarcTrecIdMapping.java │ │ │ ├── MergeTermStatistics.java │ │ │ ├── ProcessPForDocVectors.java │ │ │ └── ProcessVByteDocVectors.java │ │ ├── mapred │ │ │ └── ClueWeb12InputFormat.java │ │ └── mapreduce │ │ │ └── ClueWeb12InputFormat.java │ │ ├── data │ │ ├── DocVector.java │ │ ├── Indexable.java │ │ ├── PForDocVector.java │ │ ├── TermStatistics.java │ │ ├── VByteDocVector.java │ │ └── WarcTrecIdMapping.java │ │ ├── dictionary │ │ ├── DefaultFrequencySortedDictionary.java │ │ ├── Dictionary.java │ │ ├── DictionaryTransformationStrategy.java │ │ ├── FrequencySortedDictionary.java │ │ ├── FrontCodedDictionary.java │ │ ├── LexicographicallySortedDictionary.java │ │ └── PorterAnalyzer.java │ │ └── util │ │ ├── AnalyzerFactory.java │ │ └── QuickSort.java └── resources │ └── log4j.properties └── test └── java └── org └── clueweb ├── data ├── PForDocVectorTest.java └── VByteDocVectorTest.java └── dictionary └── PorterAnalyzerTest.java /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .classpath 3 | .project 4 | target/ 5 | -------------------------------------------------------------------------------- /.settings/org.eclipse.jdt.core.prefs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/clueweb/HEAD/.settings/org.eclipse.jdt.core.prefs -------------------------------------------------------------------------------- /.settings/org.eclipse.jdt.ui.prefs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/clueweb/HEAD/.settings/org.eclipse.jdt.ui.prefs -------------------------------------------------------------------------------- /HISTORY.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/clueweb/HEAD/HISTORY.md -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/clueweb/HEAD/README.md -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/clueweb/HEAD/pom.xml -------------------------------------------------------------------------------- /src/main/java/org/clueweb/clueweb09/ClueWeb09WarcRecord.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/clueweb/HEAD/src/main/java/org/clueweb/clueweb09/ClueWeb09WarcRecord.java -------------------------------------------------------------------------------- /src/main/java/org/clueweb/clueweb09/app/CountWarcRecordsNew.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/clueweb/HEAD/src/main/java/org/clueweb/clueweb09/app/CountWarcRecordsNew.java -------------------------------------------------------------------------------- /src/main/java/org/clueweb/clueweb09/app/CountWarcRecordsOld.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/clueweb/HEAD/src/main/java/org/clueweb/clueweb09/app/CountWarcRecordsOld.java -------------------------------------------------------------------------------- /src/main/java/org/clueweb/clueweb09/mapred/ClueWeb09InputFormat.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/clueweb/HEAD/src/main/java/org/clueweb/clueweb09/mapred/ClueWeb09InputFormat.java -------------------------------------------------------------------------------- /src/main/java/org/clueweb/clueweb09/mapreduce/ClueWeb09InputFormat.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/clueweb/HEAD/src/main/java/org/clueweb/clueweb09/mapreduce/ClueWeb09InputFormat.java -------------------------------------------------------------------------------- /src/main/java/org/clueweb/clueweb12/ClueWeb12WarcRecord.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/clueweb/HEAD/src/main/java/org/clueweb/clueweb12/ClueWeb12WarcRecord.java -------------------------------------------------------------------------------- /src/main/java/org/clueweb/clueweb12/app/BuildDictionary.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/clueweb/HEAD/src/main/java/org/clueweb/clueweb12/app/BuildDictionary.java -------------------------------------------------------------------------------- /src/main/java/org/clueweb/clueweb12/app/BuildPForDocVectors.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/clueweb/HEAD/src/main/java/org/clueweb/clueweb12/app/BuildPForDocVectors.java -------------------------------------------------------------------------------- /src/main/java/org/clueweb/clueweb12/app/BuildVByteDocVectors.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/clueweb/HEAD/src/main/java/org/clueweb/clueweb12/app/BuildVByteDocVectors.java -------------------------------------------------------------------------------- /src/main/java/org/clueweb/clueweb12/app/BuildWarcTrecIdMapping.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/clueweb/HEAD/src/main/java/org/clueweb/clueweb12/app/BuildWarcTrecIdMapping.java -------------------------------------------------------------------------------- /src/main/java/org/clueweb/clueweb12/app/ComputeTermStatistics.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/clueweb/HEAD/src/main/java/org/clueweb/clueweb12/app/ComputeTermStatistics.java -------------------------------------------------------------------------------- /src/main/java/org/clueweb/clueweb12/app/CountWarcRecordsNew.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/clueweb/HEAD/src/main/java/org/clueweb/clueweb12/app/CountWarcRecordsNew.java -------------------------------------------------------------------------------- /src/main/java/org/clueweb/clueweb12/app/CountWarcRecordsOld.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/clueweb/HEAD/src/main/java/org/clueweb/clueweb12/app/CountWarcRecordsOld.java -------------------------------------------------------------------------------- /src/main/java/org/clueweb/clueweb12/app/DumpWarcRecordsToPlainText.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/clueweb/HEAD/src/main/java/org/clueweb/clueweb12/app/DumpWarcRecordsToPlainText.java -------------------------------------------------------------------------------- /src/main/java/org/clueweb/clueweb12/app/DumpWarcRecordsToTermIds.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/clueweb/HEAD/src/main/java/org/clueweb/clueweb12/app/DumpWarcRecordsToTermIds.java -------------------------------------------------------------------------------- /src/main/java/org/clueweb/clueweb12/app/LMRetrieval.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/clueweb/HEAD/src/main/java/org/clueweb/clueweb12/app/LMRetrieval.java -------------------------------------------------------------------------------- /src/main/java/org/clueweb/clueweb12/app/LookupWarcTrecIdMapping.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/clueweb/HEAD/src/main/java/org/clueweb/clueweb12/app/LookupWarcTrecIdMapping.java -------------------------------------------------------------------------------- /src/main/java/org/clueweb/clueweb12/app/MergeTermStatistics.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/clueweb/HEAD/src/main/java/org/clueweb/clueweb12/app/MergeTermStatistics.java -------------------------------------------------------------------------------- /src/main/java/org/clueweb/clueweb12/app/ProcessPForDocVectors.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/clueweb/HEAD/src/main/java/org/clueweb/clueweb12/app/ProcessPForDocVectors.java -------------------------------------------------------------------------------- /src/main/java/org/clueweb/clueweb12/app/ProcessVByteDocVectors.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/clueweb/HEAD/src/main/java/org/clueweb/clueweb12/app/ProcessVByteDocVectors.java -------------------------------------------------------------------------------- /src/main/java/org/clueweb/clueweb12/mapred/ClueWeb12InputFormat.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/clueweb/HEAD/src/main/java/org/clueweb/clueweb12/mapred/ClueWeb12InputFormat.java -------------------------------------------------------------------------------- /src/main/java/org/clueweb/clueweb12/mapreduce/ClueWeb12InputFormat.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/clueweb/HEAD/src/main/java/org/clueweb/clueweb12/mapreduce/ClueWeb12InputFormat.java -------------------------------------------------------------------------------- /src/main/java/org/clueweb/data/DocVector.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/clueweb/HEAD/src/main/java/org/clueweb/data/DocVector.java -------------------------------------------------------------------------------- /src/main/java/org/clueweb/data/Indexable.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/clueweb/HEAD/src/main/java/org/clueweb/data/Indexable.java -------------------------------------------------------------------------------- /src/main/java/org/clueweb/data/PForDocVector.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/clueweb/HEAD/src/main/java/org/clueweb/data/PForDocVector.java -------------------------------------------------------------------------------- /src/main/java/org/clueweb/data/TermStatistics.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/clueweb/HEAD/src/main/java/org/clueweb/data/TermStatistics.java -------------------------------------------------------------------------------- /src/main/java/org/clueweb/data/VByteDocVector.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/clueweb/HEAD/src/main/java/org/clueweb/data/VByteDocVector.java -------------------------------------------------------------------------------- /src/main/java/org/clueweb/data/WarcTrecIdMapping.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/clueweb/HEAD/src/main/java/org/clueweb/data/WarcTrecIdMapping.java -------------------------------------------------------------------------------- /src/main/java/org/clueweb/dictionary/DefaultFrequencySortedDictionary.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/clueweb/HEAD/src/main/java/org/clueweb/dictionary/DefaultFrequencySortedDictionary.java -------------------------------------------------------------------------------- /src/main/java/org/clueweb/dictionary/Dictionary.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/clueweb/HEAD/src/main/java/org/clueweb/dictionary/Dictionary.java -------------------------------------------------------------------------------- /src/main/java/org/clueweb/dictionary/DictionaryTransformationStrategy.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/clueweb/HEAD/src/main/java/org/clueweb/dictionary/DictionaryTransformationStrategy.java -------------------------------------------------------------------------------- /src/main/java/org/clueweb/dictionary/FrequencySortedDictionary.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/clueweb/HEAD/src/main/java/org/clueweb/dictionary/FrequencySortedDictionary.java -------------------------------------------------------------------------------- /src/main/java/org/clueweb/dictionary/FrontCodedDictionary.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/clueweb/HEAD/src/main/java/org/clueweb/dictionary/FrontCodedDictionary.java -------------------------------------------------------------------------------- /src/main/java/org/clueweb/dictionary/LexicographicallySortedDictionary.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/clueweb/HEAD/src/main/java/org/clueweb/dictionary/LexicographicallySortedDictionary.java -------------------------------------------------------------------------------- /src/main/java/org/clueweb/dictionary/PorterAnalyzer.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/clueweb/HEAD/src/main/java/org/clueweb/dictionary/PorterAnalyzer.java -------------------------------------------------------------------------------- /src/main/java/org/clueweb/util/AnalyzerFactory.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/clueweb/HEAD/src/main/java/org/clueweb/util/AnalyzerFactory.java -------------------------------------------------------------------------------- /src/main/java/org/clueweb/util/QuickSort.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/clueweb/HEAD/src/main/java/org/clueweb/util/QuickSort.java -------------------------------------------------------------------------------- /src/main/resources/log4j.properties: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/clueweb/HEAD/src/main/resources/log4j.properties -------------------------------------------------------------------------------- /src/test/java/org/clueweb/data/PForDocVectorTest.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/clueweb/HEAD/src/test/java/org/clueweb/data/PForDocVectorTest.java -------------------------------------------------------------------------------- /src/test/java/org/clueweb/data/VByteDocVectorTest.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/clueweb/HEAD/src/test/java/org/clueweb/data/VByteDocVectorTest.java -------------------------------------------------------------------------------- /src/test/java/org/clueweb/dictionary/PorterAnalyzerTest.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/clueweb/HEAD/src/test/java/org/clueweb/dictionary/PorterAnalyzerTest.java --------------------------------------------------------------------------------