├── .gitignore ├── .settings ├── org.eclipse.jdt.core.prefs └── org.eclipse.jdt.ui.prefs ├── HISTORY.md ├── README.md ├── pom.xml └── src ├── main ├── java │ └── org │ │ └── clueweb │ │ ├── clueweb09 │ │ ├── ClueWeb09WarcRecord.java │ │ ├── app │ │ │ ├── CountWarcRecordsNew.java │ │ │ └── CountWarcRecordsOld.java │ │ ├── mapred │ │ │ └── ClueWeb09InputFormat.java │ │ └── mapreduce │ │ │ └── ClueWeb09InputFormat.java │ │ ├── clueweb12 │ │ ├── ClueWeb12WarcRecord.java │ │ ├── app │ │ │ ├── BuildDictionary.java │ │ │ ├── BuildPForDocVectors.java │ │ │ ├── BuildVByteDocVectors.java │ │ │ ├── BuildWarcTrecIdMapping.java │ │ │ ├── ComputeTermStatistics.java │ │ │ ├── CountWarcRecordsNew.java │ │ │ ├── CountWarcRecordsOld.java │ │ │ ├── DumpWarcRecordsToPlainText.java │ │ │ ├── DumpWarcRecordsToTermIds.java │ │ │ ├── LMRetrieval.java │ │ │ ├── LookupWarcTrecIdMapping.java │ │ │ ├── MergeTermStatistics.java │ │ │ ├── ProcessPForDocVectors.java │ │ │ └── ProcessVByteDocVectors.java │ │ ├── mapred │ │ │ └── ClueWeb12InputFormat.java │ │ └── mapreduce │ │ │ └── ClueWeb12InputFormat.java │ │ ├── data │ │ ├── DocVector.java │ │ ├── Indexable.java │ │ ├── PForDocVector.java │ │ ├── TermStatistics.java │ │ ├── VByteDocVector.java │ │ └── WarcTrecIdMapping.java │ │ ├── dictionary │ │ ├── DefaultFrequencySortedDictionary.java │ │ ├── Dictionary.java │ │ ├── DictionaryTransformationStrategy.java │ │ ├── FrequencySortedDictionary.java │ │ ├── FrontCodedDictionary.java │ │ ├── LexicographicallySortedDictionary.java │ │ └── PorterAnalyzer.java │ │ └── util │ │ ├── AnalyzerFactory.java │ │ └── QuickSort.java └── resources │ └── log4j.properties └── test └── java └── org └── clueweb ├── data ├── PForDocVectorTest.java └── VByteDocVectorTest.java └── dictionary └── PorterAnalyzerTest.java /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .classpath 3 | .project 4 | target/ 5 | -------------------------------------------------------------------------------- /.settings/org.eclipse.jdt.ui.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | formatter_profile=_clueweb 3 | formatter_settings_version=12 4 | org.eclipse.jdt.ui.exception.name=e 5 | org.eclipse.jdt.ui.gettersetter.use.is=true 6 | org.eclipse.jdt.ui.keywordthis=false 7 | org.eclipse.jdt.ui.overrideannotation=true 8 | -------------------------------------------------------------------------------- /HISTORY.md: -------------------------------------------------------------------------------- 1 | Version 0.3 2 | =========== 3 | July 28, 2013 4 | 5 | + Added incomplete/untested support for ClueWeb09 (refactored package structure) 6 | + Added Basic LM Retrieval (currently hard-coded for PForDocVectors) 7 | 8 | Version 0.2 9 | =========== 10 | July 14, 2013 11 | 12 | + Refactored package layout, separating ClueWeb12-specific classes 13 | + Added PFor-compressed DocVector, in addition to the previous VByte-compressed version 14 | 15 | Version 0.1 16 | =========== 17 | July 10, 2013 18 | 19 | + Initial release 20 | + Ability to build global dictionary on ClueWeb12 21 | + Ability to convert ClueWeb12 into document vectors (with termid representation) 22 | 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ClueWeb Tools 2 | ============= 3 | 4 | Hadoop tools for manipulating ClueWeb collections, the most 
recent of which is the [ClueWeb12 collection](http://lemurproject.org/clueweb12/). 5 | 6 | Sign up for the mailing list at [the clueweb-list@cwi.nl mailman page](https://lists.cwi.nl/mailman/listinfo/clueweb-list). 7 | 8 | Getting Started 9 | -------------- 10 | 11 | You can clone the repo with the following command: 12 | 13 | ``` 14 | $ git clone git://github.com/lintool/clueweb.git 15 | ``` 16 | 17 | Once you've cloned the repository, build the package with Maven: 18 | 19 | ``` 20 | $ mvn clean package appassembler:assemble 21 | ``` 22 | 23 | Two notes: 24 | 25 | + `appassembler:assemble` automatically generates a few launch scripts for you. 26 | + In addition to the normal jar (`clueweb-tools-0.X-SNAPSHOT.jar`), this package uses the [Maven Shade plugin](http://maven.apache.org/plugins/maven-shade-plugin/) to create a "fat jar" (`clueweb-tools-0.X-SNAPSHOT-fatjar.jar`) that includes all dependencies except for Hadoop, so that the jar can be submitted directly via `hadoop jar ...`. 27 | 28 | To automatically generate project files for Eclipse: 29 | 30 | ``` 31 | $ mvn eclipse:clean 32 | $ mvn eclipse:eclipse 33 | ``` 34 | 35 | You can then use Eclipse's Import "Existing Projects into Workspace" functionality to import the project. 36 | 37 | Counting Records 38 | ---------------- 39 | 40 | For sanity checking and as a "template" for other Hadoop jobs, the package provides a simple program to count WARC records in ClueWeb12 (use `CountWarcRecordsOld` instead for the older `mapred` API): 41 | 42 | ``` 43 | hadoop jar target/clueweb-tools-0.X-SNAPSHOT-fatjar.jar \ 44 | org.clueweb.clueweb12.app.CountWarcRecordsNew -input /path/to/warc/files/ 45 | ``` 46 | 47 | Examples of `/path/to/warc/files/` are: 48 | 49 | + `/data/private/clueweb12/Disk1/ClueWeb12_00/*/*.warc.gz`: for a single ClueWeb12 segment 50 | + `/data/private/clueweb12/Disk1/ClueWeb12_*/*/*.warc.gz`: for an entire ClueWeb12 disk 51 | + `/data/private/clueweb12/Disk[1234]/ClueWeb12_*/*/*.warc.gz`: for all of ClueWeb12 52 | 53 | Building a Dictionary 54 | --------------------- 55 | 56 | The next step is to build a dictionary that provides three capabilities: 57 | 58 | + a bidirectional mapping from terms (strings) to termids (integers) 59 | + lookup of document frequency (*df*) by term or termid 60 | + lookup of collection frequency (*cf*) by term or termid 61 | 62 | To build the dictionary, we must first compute the term statistics: 63 | 64 | ``` 65 | hadoop jar target/clueweb-tools-0.X-SNAPSHOT-fatjar.jar \ 66 | org.clueweb.clueweb12.app.ComputeTermStatistics \ 67 | -input /data/private/clueweb12/Disk1/ClueWeb12_00/*/*.warc.gz \ 68 | -output term-stats/segment00 69 | ``` 70 | 71 | By default, the program throws away all terms with *df* less than 100, but this parameter can be set on the command line. The above command computes term statistics for a single segment of ClueWeb12. It's easier to compute term statistics segment by segment, which yields smaller and more manageable Hadoop jobs. 72 | 73 | Compute term statistics for all the other segments in the same manner, e.g., with a loop like the sketch below.
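For example, here is a minimal sketch that sweeps the segments on Disk1. It assumes, for illustration, that Disk1 holds segments `ClueWeb12_00` through `ClueWeb12_04`; adjust the segment list, disks, and paths to your copy of the collection:

```
for i in 00 01 02 03 04; do
  hadoop jar target/clueweb-tools-0.X-SNAPSHOT-fatjar.jar \
    org.clueweb.clueweb12.app.ComputeTermStatistics \
    -input /data/private/clueweb12/Disk1/ClueWeb12_${i}/*/*.warc.gz \
    -output term-stats/segment${i}
done
```

(As in the commands above, the wildcard is assumed to name HDFS paths, so the shell passes it through unexpanded for Hadoop to glob. The `term-stats/segment*` output naming matches the input pattern used by `MergeTermStatistics` in the next step.)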
74 | 75 | Next, merge all the segment statistics together: 76 | 77 | ``` 78 | hadoop jar target/clueweb-tools-0.X-SNAPSHOT-fatjar.jar \ 79 | org.clueweb.clueweb12.app.MergeTermStatistics \ 80 | -input term-stats/segment* -output term-stats-all 81 | ``` 82 | 83 | Finally, build the dictionary: 84 | 85 | ``` 86 | hadoop jar target/clueweb-tools-0.X-SNAPSHOT-fatjar.jar \ 87 | org.clueweb.clueweb12.app.BuildDictionary \ 88 | -input term-stats-all -output dictionary -count 7160086 89 | ``` 90 | 91 | You need to provide the number of terms in the dictionary via the `-count` option. That value is simply the number of records output by `MergeTermStatistics`. 92 | 93 | To explore the contents of the dictionary, use this little interactive program: 94 | 95 | ``` 96 | hadoop jar target/clueweb-tools-0.X-SNAPSHOT-fatjar.jar \ 97 | org.clueweb.dictionary.DefaultFrequencySortedDictionary dictionary 98 | ``` 99 | 100 | On ClueWeb12, following the above instructions will create a dictionary with 7,160,086 terms. 101 | 102 | 103 | **Implementation details:** Tokenization is performed by first using Jsoup to throw away all markup and then passing the resulting text through Lucene's `StandardAnalyzer`. 104 | 105 | The dictionary has two components: the terms are stored as a front-coded list (which necessarily means that the terms must be sorted), and a monotone minimal perfect hash function is used to hash terms (strings) into their lexicographic positions. Term-to-termid lookup is accomplished via the hash function (to avoid binary searching through the front-coded data structure, which is expensive). Termid-to-term lookup is accomplished by direct access into the front-coded list. An additional mapping table converts the lexicographic position into the (*df*-sorted) termid. 106 | 107 | Building Document Vectors 108 | ------------------------- 109 | 110 | With the dictionary, we can now convert the entire collection into a sequence of document vectors, where each document vector is represented by a sequence of termids; the termids map to the sequence of terms that comprise the document. These document vectors are much more compact and much faster to scan for processing purposes. 111 | 112 | The document vector is represented by the interface `org.clueweb.data.DocVector`. Currently, there are two concrete implementations: 113 | 114 | + `VByteDocVector`, which uses Hadoop's built-in utilities for writing variable-length integers (what Hadoop calls VInt). 115 | + `PForDocVector`, which uses PFor compression from Daniel Lemire's [JavaFastPFOR](https://github.com/lemire/JavaFastPFOR/) package. 116 | 117 | To build document vectors, use either `BuildVByteDocVectors` or `BuildPForDocVectors`: 118 | 119 | ``` 120 | hadoop jar target/clueweb-tools-0.X-SNAPSHOT-fatjar.jar \ 121 | org.clueweb.clueweb12.app.Build{VByte,PFor}DocVectors \ 122 | -input /data/private/clueweb12/Disk1/ClueWeb12_00/*/*.warc.gz \ 123 | -output /data/private/clueweb12/derived/docvectors/segment00 \ 124 | -dictionary /data/private/clueweb12/derived/dictionary \ 125 | -reducers 100 126 | ``` 127 | 128 | Note that the current code also requires a `-preprocessing` option that selects the analyzer (see `org.clueweb.util.AnalyzerFactory` for the accepted values). Once again, it's advisable to run on a segment at a time in order to keep the Hadoop job sizes manageable. The program runs identity reducers to repartition the document vectors into 100 parts (to avoid the small-files problem). 129 | 130 | The output directory will contain `SequenceFile`s, with a `Text` containing the WARC-TREC-ID as the key.
For VByte, the value will be a `BytesWritable` object; for PFor, the value will be an `IntArrayWritable` object. 131 | 132 | To process these document vectors, either use `ProcessVByteDocVectors` or `ProcessPForDocVectors` in the `org.clueweb.clueweb12.app` package, which provides sample code for consuming these document vectors and converting the termids back into terms. 133 | 134 | Size comparisons, on the entire ClueWeb12 collection: 135 | 136 | + 5.54 TB: original compressed WARC files 137 | + 1.08 TB: repackaged as `VByteDocVector`s 138 | + 0.86 TB: repackaged as `PForDocVector`s 139 | + ~1.6 TB: uncompressed termids (collection size is ~400 billion terms) 140 | 141 | License 142 | ------- 143 | 144 | Licensed under the Apache License, Version 2.0: http://www.apache.org/licenses/LICENSE-2.0 145 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | org.clueweb 4 | clueweb-tools 5 | jar 6 | 0.4-SNAPSHOT 7 | clueweb-tools 8 | Hadoop tools for working with the ClueWeb 2012 collection 9 | http://clueweb.org/ 10 | 11 | 12 | 13 | The Apache Software License, Version 2.0 14 | http://www.apache.org/licenses/LICENSE-2.0.txt 15 | repo 16 | 17 | 18 | 19 | 20 | scm:git:git@github.com:lintool/clueweb.git 21 | scm:git:git@github.com:lintool/clueweb.git 22 | git@github.com:lintool/clueweb.git 23 | 24 | 25 | 26 | 27 | lintool 28 | Jimmy Lin 29 | jimmylin@umd.edu 30 | 31 | 32 | 33 | 34 | org.sonatype.oss 35 | oss-parent 36 | 7 37 | 38 | 39 | 40 | 41 | default 42 | 43 | true 44 | 45 | 46 | 47 | 48 | org.codehaus.mojo 49 | appassembler-maven-plugin 50 | 1.3.1 51 | 52 | 53 | 54 | org.clueweb.clueweb12.app.BuildWarcTrecIdMapping 55 | BuildWarcTrecIdMapping 56 | 57 | 4g 58 | 59 | 60 | 61 | 62 | 63 | 64 | org.apache.maven.plugins 65 | maven-compiler-plugin 66 | 3.1 67 | 68 | 1.6 69 | 1.6 70 | 71 | 72 | 73 | org.apache.maven.plugins 74 | maven-shade-plugin 75 | 2.1 76 | 77 | 78 | package 79 | 80 | shade 81 | 82 | 83 | 84 | true 85 | fatjar 86 | 87 | 88 | org.apache.hadoop:* 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | deploy 100 | 101 | 102 | 103 | 104 | 105 | UTF-8 106 | UTF-8 107 | 108 | 109 | 110 | 111 | junit 112 | junit 113 | 4.11 114 | test 115 | 116 | 117 | org.apache.ant 118 | ant 119 | 1.9.1 120 | 121 | 122 | commons-cli 123 | commons-cli 124 | 1.2 125 | 126 | 127 | commons-io 128 | commons-io 129 | 2.4 130 | 131 | 132 | commons-lang 133 | commons-lang 134 | 2.6 135 | 136 | 137 | org.apache.lucene 138 | lucene-core 139 | 4.3.1 140 | 141 | 142 | org.apache.lucene 143 | lucene-queryparser 144 | 4.3.1 145 | 146 | 147 | org.apache.lucene 148 | lucene-analyzers-common 149 | 4.3.1 150 | 151 | 152 | log4j 153 | log4j 154 | 1.2.17 155 | 156 | 157 | org.apache.hadoop 158 | hadoop-core 159 | 1.1.2 160 | 161 | 162 | org.apache.hadoop 163 | hadoop-client 164 | 1.1.2 165 | 166 | 167 | org.jsoup 168 | jsoup 169 | 1.7.2 170 | 171 | 172 | com.google.guava 173 | guava 174 | 14.0.1 175 | 176 | 177 | it.unimi.dsi 178 | dsiutils 179 | 2.0.15 180 | true 181 | 182 | 183 | it.unimi.dsi 184 | sux4j 185 | 3.0.8 186 | true 187 | 188 | 189 | it.unimi.dsi 190 | fastutil 191 | 6.5.4 192 | true 193 | 194 | 195 | tl.lin 196 | lintools-datatypes 197 | 0.9.2 198 | 199 | 200 | tl.lin 201 | lintools-lucene 202 | 0.1.0 203 | 204 | 205 | me.lemire.integercompression 206 | JavaFastPFOR 207 | 0.0.3 208 | 209 | 210 | 211 | 
-------------------------------------------------------------------------------- /src/main/java/org/clueweb/clueweb09/app/CountWarcRecordsNew.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ClueWeb Tools: Hadoop tools for manipulating ClueWeb collections 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you 5 | * may not use this file except in compliance with the License. You may 6 | * obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing 14 | * permissions and limitations under the License. 15 | */ 16 | 17 | package org.clueweb.clueweb09.app; 18 | 19 | import java.io.IOException; 20 | import java.util.Arrays; 21 | 22 | import org.apache.commons.cli.CommandLine; 23 | import org.apache.commons.cli.CommandLineParser; 24 | import org.apache.commons.cli.GnuParser; 25 | import org.apache.commons.cli.HelpFormatter; 26 | import org.apache.commons.cli.OptionBuilder; 27 | import org.apache.commons.cli.Options; 28 | import org.apache.commons.cli.ParseException; 29 | import org.apache.hadoop.conf.Configured; 30 | import org.apache.hadoop.io.LongWritable; 31 | import org.apache.hadoop.io.NullWritable; 32 | import org.apache.hadoop.mapreduce.Counters; 33 | import org.apache.hadoop.mapreduce.Job; 34 | import org.apache.hadoop.mapreduce.Mapper; 35 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 36 | import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat; 37 | import org.apache.hadoop.util.Tool; 38 | import org.apache.hadoop.util.ToolRunner; 39 | import org.apache.log4j.Logger; 40 | import org.clueweb.clueweb09.ClueWeb09WarcRecord; 41 | import org.clueweb.clueweb09.mapreduce.ClueWeb09InputFormat; 42 | 43 | public class CountWarcRecordsNew extends Configured implements Tool { 44 | private static final Logger LOG = Logger.getLogger(CountWarcRecordsNew.class); 45 | 46 | private static enum Records { TOTAL, PAGES }; 47 | 48 | private static class MyMapper 49 | extends Mapper { 50 | @Override 51 | public void map(LongWritable key, ClueWeb09WarcRecord doc, Context context) 52 | throws IOException, InterruptedException { 53 | context.getCounter(Records.TOTAL).increment(1); 54 | 55 | String docid = doc.getHeaderMetadataItem("WARC-TREC-ID"); 56 | if (docid != null) { 57 | context.getCounter(Records.PAGES).increment(1); 58 | } 59 | } 60 | } 61 | 62 | public CountWarcRecordsNew() {} 63 | 64 | public static final String INPUT_OPTION = "input"; 65 | 66 | /** 67 | * Runs this tool. 
68 | */ 69 | @SuppressWarnings("static-access") 70 | public int run(String[] args) throws Exception { 71 | Options options = new Options(); 72 | 73 | options.addOption(OptionBuilder.withArgName("path").hasArg() 74 | .withDescription("input path").create(INPUT_OPTION)); 75 | 76 | CommandLine cmdline; 77 | CommandLineParser parser = new GnuParser(); 78 | try { 79 | cmdline = parser.parse(options, args); 80 | } catch (ParseException exp) { 81 | HelpFormatter formatter = new HelpFormatter(); 82 | formatter.printHelp(this.getClass().getName(), options); 83 | ToolRunner.printGenericCommandUsage(System.out); 84 | System.err.println("Error parsing command line: " + exp.getMessage()); 85 | return -1; 86 | } 87 | 88 | if (!cmdline.hasOption(INPUT_OPTION)) { 89 | HelpFormatter formatter = new HelpFormatter(); 90 | formatter.printHelp(this.getClass().getName(), options); 91 | ToolRunner.printGenericCommandUsage(System.out); 92 | return -1; 93 | } 94 | 95 | String input = cmdline.getOptionValue(INPUT_OPTION); 96 | 97 | LOG.info("Tool name: " + CountWarcRecordsNew.class.getSimpleName()); 98 | LOG.info(" - input: " + input); 99 | 100 | Job job = new Job(getConf(), CountWarcRecordsNew.class.getSimpleName() + ":" + input); 101 | job.setJarByClass(CountWarcRecordsNew.class); 102 | job.setNumReduceTasks(0); 103 | 104 | FileInputFormat.addInputPaths(job, input); 105 | 106 | job.setInputFormatClass(ClueWeb09InputFormat.class); 107 | job.setOutputFormatClass(NullOutputFormat.class); 108 | job.setMapperClass(MyMapper.class); 109 | 110 | job.waitForCompletion(true); 111 | 112 | Counters counters = job.getCounters(); 113 | int numDocs = (int) counters.findCounter(Records.PAGES).getValue(); 114 | LOG.info("Read " + numDocs + " docs."); 115 | 116 | return 0; 117 | } 118 | 119 | /** 120 | * Dispatches command-line arguments to the tool via the ToolRunner. 121 | */ 122 | public static void main(String[] args) throws Exception { 123 | LOG.info("Running " + CountWarcRecordsNew.class.getCanonicalName() + " with args " 124 | + Arrays.toString(args)); 125 | ToolRunner.run(new CountWarcRecordsNew(), args); 126 | } 127 | } 128 | -------------------------------------------------------------------------------- /src/main/java/org/clueweb/clueweb09/app/CountWarcRecordsOld.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ClueWeb Tools: Hadoop tools for manipulating ClueWeb collections 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you 5 | * may not use this file except in compliance with the License. You may 6 | * obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing 14 | * permissions and limitations under the License. 
15 | */ 16 | 17 | package org.clueweb.clueweb09.app; 18 | 19 | import java.io.IOException; 20 | import java.util.Arrays; 21 | 22 | import org.apache.commons.cli.CommandLine; 23 | import org.apache.commons.cli.CommandLineParser; 24 | import org.apache.commons.cli.GnuParser; 25 | import org.apache.commons.cli.HelpFormatter; 26 | import org.apache.commons.cli.OptionBuilder; 27 | import org.apache.commons.cli.Options; 28 | import org.apache.commons.cli.ParseException; 29 | import org.apache.hadoop.conf.Configured; 30 | import org.apache.hadoop.io.NullWritable; 31 | import org.apache.hadoop.io.Writable; 32 | import org.apache.hadoop.mapred.Counters; 33 | import org.apache.hadoop.mapred.FileInputFormat; 34 | import org.apache.hadoop.mapred.JobClient; 35 | import org.apache.hadoop.mapred.JobConf; 36 | import org.apache.hadoop.mapred.MapReduceBase; 37 | import org.apache.hadoop.mapred.Mapper; 38 | import org.apache.hadoop.mapred.OutputCollector; 39 | import org.apache.hadoop.mapred.Reporter; 40 | import org.apache.hadoop.mapred.RunningJob; 41 | import org.apache.hadoop.mapred.lib.NullOutputFormat; 42 | import org.apache.hadoop.util.Tool; 43 | import org.apache.hadoop.util.ToolRunner; 44 | import org.apache.log4j.Logger; 45 | import org.clueweb.clueweb09.ClueWeb09WarcRecord; 46 | import org.clueweb.clueweb09.mapred.ClueWeb09InputFormat; 47 | 48 | public class CountWarcRecordsOld extends Configured implements Tool { 49 | private static final Logger LOG = Logger.getLogger(CountWarcRecordsOld.class); 50 | 51 | private static enum Records { TOTAL, PAGES }; 52 | 53 | private static class MyMapper extends MapReduceBase implements 54 | Mapper { 55 | 56 | public void configure(JobConf job) {} 57 | 58 | public void map(Writable key, ClueWeb09WarcRecord doc, 59 | OutputCollector output, Reporter reporter) throws IOException { 60 | reporter.incrCounter(Records.TOTAL, 1); 61 | 62 | String docid = doc.getHeaderMetadataItem("WARC-TREC-ID"); 63 | if (docid != null) { 64 | reporter.incrCounter(Records.PAGES, 1); 65 | } 66 | } 67 | } 68 | 69 | public CountWarcRecordsOld() { 70 | } 71 | 72 | public static final String INPUT_OPTION = "input"; 73 | 74 | /** 75 | * Runs this tool. 
76 | */ 77 | @SuppressWarnings("static-access") 78 | public int run(String[] args) throws Exception { 79 | Options options = new Options(); 80 | 81 | options.addOption(OptionBuilder.withArgName("path").hasArg() 82 | .withDescription("input path").create(INPUT_OPTION)); 83 | 84 | CommandLine cmdline; 85 | CommandLineParser parser = new GnuParser(); 86 | try { 87 | cmdline = parser.parse(options, args); 88 | } catch (ParseException exp) { 89 | HelpFormatter formatter = new HelpFormatter(); 90 | formatter.printHelp(this.getClass().getName(), options); 91 | ToolRunner.printGenericCommandUsage(System.out); 92 | System.err.println("Error parsing command line: " + exp.getMessage()); 93 | return -1; 94 | } 95 | 96 | if (!cmdline.hasOption(INPUT_OPTION)) { 97 | HelpFormatter formatter = new HelpFormatter(); 98 | formatter.printHelp(this.getClass().getName(), options); 99 | ToolRunner.printGenericCommandUsage(System.out); 100 | return -1; 101 | } 102 | 103 | String input = cmdline.getOptionValue(INPUT_OPTION); 104 | 105 | LOG.info("Tool name: " + CountWarcRecordsOld.class.getSimpleName()); 106 | LOG.info(" - input: " + input); 107 | 108 | JobConf conf = new JobConf(getConf(), CountWarcRecordsOld.class); 109 | conf.setJobName(CountWarcRecordsOld.class.getSimpleName() + ":" + input); 110 | 111 | conf.setNumReduceTasks(0); 112 | 113 | FileInputFormat.addInputPaths(conf, input); 114 | 115 | conf.setInputFormat(ClueWeb09InputFormat.class); 116 | conf.setOutputFormat(NullOutputFormat.class); 117 | conf.setMapperClass(MyMapper.class); 118 | 119 | RunningJob job = JobClient.runJob(conf); 120 | Counters counters = job.getCounters(); 121 | int numDocs = (int) counters.findCounter(Records.PAGES).getCounter(); 122 | 123 | LOG.info("Read " + numDocs + " docs."); 124 | 125 | return 0; 126 | } 127 | 128 | /** 129 | * Dispatches command-line arguments to the tool via the ToolRunner. 130 | */ 131 | public static void main(String[] args) throws Exception { 132 | LOG.info("Running " + CountWarcRecordsOld.class.getCanonicalName() + " with args " 133 | + Arrays.toString(args)); 134 | ToolRunner.run(new CountWarcRecordsOld(), args); 135 | } 136 | } 137 | -------------------------------------------------------------------------------- /src/main/java/org/clueweb/clueweb09/mapred/ClueWeb09InputFormat.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ClueWeb Tools: Hadoop tools for manipulating ClueWeb collections 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you 5 | * may not use this file except in compliance with the License. You may 6 | * obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing 14 | * permissions and limitations under the License. 15 | */ 16 | 17 | /* 18 | * Hadoop FileInputFormat for reading WARC files 19 | * 20 | * (C) 2009 - Carnegie Mellon University 21 | * 22 | * 1. Redistributions of this source code must retain the above copyright 23 | * notice, this list of conditions and the following disclaimer. 24 | * 2. 
The names "Lemur", "Indri", "University of Massachusetts", 25 | * "Carnegie Mellon", and "lemurproject" must not be used to 26 | * endorse or promote products derived from this software without 27 | * prior written permission. To obtain permission, contact 28 | * license@lemurproject.org. 29 | * 30 | * 4. Products derived from this software may not be called "Lemur" or "Indri" 31 | * nor may "Lemur" or "Indri" appear in their names without prior written 32 | * permission of The Lemur Project. To obtain permission, 33 | * contact license@lemurproject.org. 34 | * 35 | * THIS SOFTWARE IS PROVIDED BY THE LEMUR PROJECT AS PART OF THE CLUEWEB09 36 | * PROJECT AND OTHER CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED 37 | * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 38 | * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN 39 | * NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY 40 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 41 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 42 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 43 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 44 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING 45 | * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 46 | * POSSIBILITY OF SUCH DAMAGE. 47 | * 48 | * @author mhoy@cs.cmu.edu (Mark J. Hoy) 49 | */ 50 | 51 | package org.clueweb.clueweb09.mapred; 52 | 53 | import java.io.DataInputStream; 54 | import java.io.IOException; 55 | 56 | import org.apache.hadoop.conf.Configuration; 57 | import org.apache.hadoop.fs.FileSystem; 58 | import org.apache.hadoop.fs.Path; 59 | import org.apache.hadoop.io.LongWritable; 60 | import org.apache.hadoop.io.compress.CompressionCodec; 61 | import org.apache.hadoop.io.compress.CompressionCodecFactory; 62 | import org.apache.hadoop.mapred.FileInputFormat; 63 | import org.apache.hadoop.mapred.FileSplit; 64 | import org.apache.hadoop.mapred.InputSplit; 65 | import org.apache.hadoop.mapred.JobConf; 66 | import org.apache.hadoop.mapred.RecordReader; 67 | import org.apache.hadoop.mapred.Reporter; 68 | import org.clueweb.clueweb09.ClueWeb09WarcRecord; 69 | 70 | public class ClueWeb09InputFormat extends FileInputFormat { 71 | 72 | /** 73 | * Don't allow the files to be split! 74 | */ 75 | @Override 76 | protected boolean isSplitable(FileSystem fs, Path filename) { 77 | // ensure the input files are not splittable! 
78 | return false; 79 | } 80 | 81 | /** 82 | * Just return the record reader 83 | */ 84 | public RecordReader getRecordReader(InputSplit split, JobConf conf, 85 | Reporter reporter) throws IOException { 86 | return new ClueWarcRecordReader(conf, (FileSplit) split); 87 | } 88 | 89 | public static class ClueWarcRecordReader implements RecordReader { 90 | private long recordCount = 1; 91 | private Path path = null; 92 | private DataInputStream input = null; 93 | 94 | private long totalNumBytesRead = 0; 95 | 96 | public ClueWarcRecordReader(Configuration conf, FileSplit split) throws IOException { 97 | FileSystem fs = FileSystem.get(conf); 98 | path = split.getPath(); 99 | 100 | CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(conf); 101 | CompressionCodec compressionCodec = compressionCodecs.getCodec(path); 102 | input = new DataInputStream(compressionCodec.createInputStream(fs.open(path))); 103 | } 104 | 105 | @Override 106 | public boolean next(LongWritable key, ClueWeb09WarcRecord value) throws IOException { 107 | DataInputStream whichStream = input; 108 | 109 | ClueWeb09WarcRecord newRecord = ClueWeb09WarcRecord.readNextWarcRecord(whichStream); 110 | if (newRecord == null) { 111 | return false; 112 | } 113 | 114 | totalNumBytesRead += (long) newRecord.getTotalRecordLength(); 115 | newRecord.setWarcFilePath(path.toString()); 116 | 117 | value.set(newRecord); 118 | key.set(recordCount); 119 | 120 | recordCount++; 121 | return true; 122 | } 123 | 124 | @Override 125 | public LongWritable createKey() { 126 | return new LongWritable(); 127 | } 128 | 129 | @Override 130 | public ClueWeb09WarcRecord createValue() { 131 | return new ClueWeb09WarcRecord(); 132 | } 133 | 134 | @Override 135 | public long getPos() throws IOException { 136 | return totalNumBytesRead; 137 | } 138 | 139 | @Override 140 | public void close() throws IOException { 141 | input.close(); 142 | } 143 | 144 | @Override 145 | public float getProgress() throws IOException { 146 | return (float) recordCount / 40000f; 147 | } 148 | } 149 | } 150 | -------------------------------------------------------------------------------- /src/main/java/org/clueweb/clueweb09/mapreduce/ClueWeb09InputFormat.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ClueWeb Tools: Hadoop tools for manipulating ClueWeb collections 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you 5 | * may not use this file except in compliance with the License. You may 6 | * obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing 14 | * permissions and limitations under the License. 
15 | */ 16 | 17 | package org.clueweb.clueweb09.mapreduce; 18 | 19 | import java.io.DataInputStream; 20 | import java.io.IOException; 21 | 22 | import org.apache.hadoop.conf.Configuration; 23 | import org.apache.hadoop.fs.FSDataInputStream; 24 | import org.apache.hadoop.fs.FileSystem; 25 | import org.apache.hadoop.fs.Path; 26 | import org.apache.hadoop.fs.Seekable; 27 | import org.apache.hadoop.io.LongWritable; 28 | import org.apache.hadoop.io.compress.CodecPool; 29 | import org.apache.hadoop.io.compress.CompressionCodec; 30 | import org.apache.hadoop.io.compress.CompressionCodecFactory; 31 | import org.apache.hadoop.io.compress.Decompressor; 32 | import org.apache.hadoop.mapreduce.InputSplit; 33 | import org.apache.hadoop.mapreduce.JobContext; 34 | import org.apache.hadoop.mapreduce.RecordReader; 35 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 36 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 37 | import org.apache.hadoop.mapreduce.lib.input.FileSplit; 38 | import org.clueweb.clueweb09.ClueWeb09WarcRecord; 39 | 40 | public class ClueWeb09InputFormat extends FileInputFormat { 41 | @Override 42 | public RecordReader createRecordReader(InputSplit split, 43 | TaskAttemptContext context) throws IOException, InterruptedException { 44 | return new ClueWarcRecordReader(); 45 | } 46 | 47 | @Override 48 | protected boolean isSplitable(JobContext context, Path filename) { 49 | return false; 50 | } 51 | 52 | public class ClueWarcRecordReader extends RecordReader { 53 | private CompressionCodecFactory compressionCodecs = null; 54 | private long start; 55 | private long pos; 56 | private long end; 57 | private LongWritable key = null; 58 | private ClueWeb09WarcRecord value = null; 59 | private Seekable filePosition; 60 | private CompressionCodec codec; 61 | private Decompressor decompressor; 62 | private DataInputStream in; 63 | 64 | public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException { 65 | FileSplit split = (FileSplit) genericSplit; 66 | Configuration job = context.getConfiguration(); 67 | start = split.getStart(); 68 | end = start + split.getLength(); 69 | final Path file = split.getPath(); 70 | compressionCodecs = new CompressionCodecFactory(job); 71 | codec = compressionCodecs.getCodec(file); 72 | 73 | // open the file and seek to the start of the split 74 | FileSystem fs = file.getFileSystem(job); 75 | FSDataInputStream fileIn = fs.open(split.getPath()); 76 | 77 | if (isCompressedInput()) { 78 | in = new DataInputStream(codec.createInputStream(fileIn, decompressor)); 79 | filePosition = fileIn; 80 | } else { 81 | fileIn.seek(start); 82 | in = fileIn; 83 | filePosition = fileIn; 84 | } 85 | 86 | this.pos = start; 87 | } 88 | 89 | private boolean isCompressedInput() { 90 | return (codec != null); 91 | } 92 | 93 | private long getFilePosition() throws IOException { 94 | long retVal; 95 | if (isCompressedInput() && null != filePosition) { 96 | retVal = filePosition.getPos(); 97 | } else { 98 | retVal = pos; 99 | } 100 | return retVal; 101 | } 102 | 103 | public boolean nextKeyValue() throws IOException { 104 | if (key == null) { 105 | key = new LongWritable(); 106 | } 107 | key.set(pos); 108 | 109 | value = ClueWeb09WarcRecord.readNextWarcRecord(in); 110 | if (value == null) { 111 | return false; 112 | } 113 | return true; 114 | } 115 | 116 | @Override 117 | public LongWritable getCurrentKey() { 118 | return key; 119 | } 120 | 121 | @Override 122 | public ClueWeb09WarcRecord getCurrentValue() { 123 | return value; 124 | } 125 | 
126 | /** 127 | * Get the progress within the split 128 | */ 129 | public float getProgress() throws IOException { 130 | if (start == end) { 131 | return 0.0f; 132 | } else { 133 | return Math.min(1.0f, (getFilePosition() - start) / (float) (end - start)); 134 | } 135 | } 136 | 137 | public synchronized void close() throws IOException { 138 | try { 139 | if (in != null) { 140 | in.close(); 141 | } 142 | } finally { 143 | if (decompressor != null) { 144 | CodecPool.returnDecompressor(decompressor); 145 | } 146 | } 147 | } 148 | } 149 | } 150 | -------------------------------------------------------------------------------- /src/main/java/org/clueweb/clueweb12/app/BuildDictionary.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ClueWeb Tools: Hadoop tools for manipulating ClueWeb collections 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you 5 | * may not use this file except in compliance with the License. You may 6 | * obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing 14 | * permissions and limitations under the License. 15 | */ 16 | 17 | package org.clueweb.clueweb12.app; 18 | 19 | import it.unimi.dsi.sux4j.mph.TwoStepsLcpMonotoneMinimalPerfectHashFunction; 20 | import it.unimi.dsi.util.FrontCodedStringList; 21 | import it.unimi.dsi.util.ShiftAddXorSignedStringMap; 22 | 23 | import java.io.ByteArrayOutputStream; 24 | import java.io.IOException; 25 | import java.io.ObjectOutputStream; 26 | import java.util.Arrays; 27 | import java.util.Iterator; 28 | import java.util.List; 29 | 30 | import org.apache.commons.cli.CommandLine; 31 | import org.apache.commons.cli.CommandLineParser; 32 | import org.apache.commons.cli.GnuParser; 33 | import org.apache.commons.cli.HelpFormatter; 34 | import org.apache.commons.cli.OptionBuilder; 35 | import org.apache.commons.cli.Options; 36 | import org.apache.commons.cli.ParseException; 37 | import org.apache.hadoop.conf.Configuration; 38 | import org.apache.hadoop.conf.Configured; 39 | import org.apache.hadoop.fs.FSDataOutputStream; 40 | import org.apache.hadoop.fs.FileSystem; 41 | import org.apache.hadoop.fs.Path; 42 | import org.apache.hadoop.io.NullWritable; 43 | import org.apache.hadoop.io.Text; 44 | import org.apache.hadoop.io.WritableUtils; 45 | import org.apache.hadoop.mapreduce.Job; 46 | import org.apache.hadoop.mapreduce.Mapper; 47 | import org.apache.hadoop.mapreduce.Reducer; 48 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 49 | import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; 50 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 51 | import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat; 52 | import org.apache.hadoop.util.Tool; 53 | import org.apache.hadoop.util.ToolRunner; 54 | import org.apache.log4j.Logger; 55 | import org.clueweb.dictionary.DictionaryTransformationStrategy; 56 | import org.clueweb.util.QuickSort; 57 | 58 | import tl.lin.data.pair.PairOfIntLong; 59 | 60 | import com.google.common.collect.Lists; 61 | 62 | public class BuildDictionary extends Configured implements Tool { 63 | private static final Logger LOG = Logger.getLogger(BuildDictionary.class); 64 | 65 | private 
static final String HADOOP_OUTPUT_OPTION = "dictionary.path"; 66 | private static final String HADOOP_TERMS_COUNT_OPTION = "terms.count"; 67 | 68 | protected static enum Terms { Total } 69 | 70 | public static final String TERMS_DATA = "dictionary.terms"; 71 | public static final String TERMS_ID_DATA = "dictionary.ids"; 72 | public static final String TERMS_ID_MAPPING_DATA = "dictionary.mapping"; 73 | 74 | public static final String DF_BY_TERM_DATA = "df.terms"; 75 | public static final String DF_BY_ID_DATA = "df.ids"; 76 | 77 | public static final String CF_BY_TERM_DATA = "cf.terms"; 78 | public static final String CF_BY_ID_DATA = "cf.ids"; 79 | 80 | private static class MyReducer 81 | extends Reducer { 82 | private FSDataOutputStream termsOut, idsOut, idsToTermOut, 83 | dfByTermOut, cfByTermOut, dfByIntOut, cfByIntOut; 84 | private int numTerms; 85 | private int[] seqNums = null; 86 | private int[] dfs = null; 87 | private long[] cfs = null; 88 | private int curKeyIndex = 0; 89 | 90 | private String[] terms; 91 | 92 | @Override 93 | public void setup(Reducer.Context context) 94 | throws IOException { 95 | LOG.info("Starting setup."); 96 | Configuration conf = context.getConfiguration(); 97 | FileSystem fs = FileSystem.get(conf); 98 | 99 | numTerms = conf.getInt(HADOOP_TERMS_COUNT_OPTION, 0); 100 | LOG.info(HADOOP_TERMS_COUNT_OPTION + ": " + numTerms); 101 | String basePath = conf.get(HADOOP_OUTPUT_OPTION); 102 | LOG.info(HADOOP_OUTPUT_OPTION + ": " + basePath); 103 | 104 | terms = new String[numTerms]; 105 | seqNums = new int[numTerms]; 106 | dfs = new int[numTerms]; 107 | cfs = new long[numTerms]; 108 | 109 | termsOut = fs.create(new Path(basePath, TERMS_DATA), true); 110 | 111 | idsOut = fs.create(new Path(basePath, TERMS_ID_DATA), true); 112 | idsOut.writeInt(numTerms); 113 | 114 | idsToTermOut = fs.create(new Path(basePath, TERMS_ID_MAPPING_DATA), true); 115 | idsToTermOut.writeInt(numTerms); 116 | 117 | dfByTermOut = fs.create(new Path(basePath, DF_BY_TERM_DATA), true); 118 | dfByTermOut.writeInt(numTerms); 119 | 120 | cfByTermOut = fs.create(new Path(basePath, CF_BY_TERM_DATA), true); 121 | cfByTermOut.writeInt(numTerms); 122 | 123 | dfByIntOut = fs.create(new Path(basePath, DF_BY_ID_DATA), true); 124 | dfByIntOut.writeInt(numTerms); 125 | 126 | cfByIntOut = fs.create(new Path(basePath, CF_BY_ID_DATA), true); 127 | cfByIntOut.writeInt(numTerms); 128 | LOG.info("Finished setup."); 129 | } 130 | 131 | @Override 132 | public void reduce(Text key, Iterable values, Context context) 133 | throws IOException, InterruptedException { 134 | String term = key.toString(); 135 | Iterator iter = values.iterator(); 136 | PairOfIntLong p = iter.next(); 137 | int df = p.getLeftElement(); 138 | long cf = p.getRightElement(); 139 | WritableUtils.writeVInt(dfByTermOut, df); 140 | WritableUtils.writeVLong(cfByTermOut, cf); 141 | 142 | if (iter.hasNext()) { 143 | throw new RuntimeException("More than one record for term: " + term); 144 | } 145 | 146 | terms[curKeyIndex] = term; 147 | seqNums[curKeyIndex] = curKeyIndex; 148 | dfs[curKeyIndex] = -df; 149 | cfs[curKeyIndex] = cf; 150 | curKeyIndex++; 151 | 152 | context.getCounter(Terms.Total).increment(1); 153 | } 154 | 155 | @Override 156 | public void cleanup( 157 | Reducer.Context context) 158 | throws IOException { 159 | LOG.info("Starting cleanup."); 160 | if (curKeyIndex != numTerms) { 161 | throw new RuntimeException("Total expected Terms: " + numTerms + 162 | ", Total observed terms: " + curKeyIndex + "!"); 163 | } 164 | // Sort based on df and 
change seqNums accordingly. 165 | QuickSort.quicksortWithSecondary(seqNums, dfs, cfs, 0, numTerms - 1); 166 | 167 | // Write sorted dfs and cfs by int here. 168 | for (int i = 0; i < numTerms; i++) { 169 | WritableUtils.writeVInt(dfByIntOut, -dfs[i]); 170 | WritableUtils.writeVLong(cfByIntOut, cfs[i]); 171 | } 172 | cfs = null; 173 | 174 | // Encode the sorted dfs into ids ==> df values erased and become ids instead. Note that first 175 | // term id is 1. 176 | for (int i = 0; i < numTerms; i++) { 177 | dfs[i] = i + 1; 178 | } 179 | 180 | // Write current seq nums to be index into the term array. 181 | for (int i = 0; i < numTerms; i++) 182 | idsToTermOut.writeInt(seqNums[i]); 183 | 184 | // Sort on seqNums to get the right writing order. 185 | QuickSort.quicksort(dfs, seqNums, 0, numTerms - 1); 186 | for (int i = 0; i < numTerms; i++) { 187 | idsOut.writeInt(dfs[i]); 188 | } 189 | 190 | ByteArrayOutputStream bytesOut; 191 | ObjectOutputStream objOut; 192 | byte[] bytes; 193 | 194 | List termList = Lists.newArrayList(terms); 195 | FrontCodedStringList frontcodedList = new FrontCodedStringList(termList, 8, true); 196 | 197 | bytesOut = new ByteArrayOutputStream(); 198 | objOut = new ObjectOutputStream(bytesOut); 199 | objOut.writeObject(frontcodedList); 200 | objOut.close(); 201 | 202 | bytes = bytesOut.toByteArray(); 203 | termsOut.writeInt(bytes.length); 204 | termsOut.write(bytes); 205 | 206 | ShiftAddXorSignedStringMap dict = new ShiftAddXorSignedStringMap(termList.iterator(), 207 | new TwoStepsLcpMonotoneMinimalPerfectHashFunction(termList, 208 | DictionaryTransformationStrategy.getStrategy())); 209 | 210 | bytesOut = new ByteArrayOutputStream(); 211 | objOut = new ObjectOutputStream(bytesOut); 212 | objOut.writeObject(dict); 213 | objOut.close(); 214 | 215 | bytes = bytesOut.toByteArray(); 216 | termsOut.writeInt(bytes.length); 217 | termsOut.write(bytes); 218 | 219 | termsOut.close(); 220 | idsOut.close(); 221 | idsToTermOut.close(); 222 | dfByTermOut.close(); 223 | cfByTermOut.close(); 224 | dfByIntOut.close(); 225 | cfByIntOut.close(); 226 | LOG.info("Finished cleanup."); 227 | } 228 | } 229 | 230 | public static final String INPUT_OPTION = "input"; 231 | public static final String OUTPUT_OPTION = "output"; 232 | public static final String COUNT_OPTION = "count"; 233 | 234 | /** 235 | * Runs this tool. 
236 | */ 237 | @SuppressWarnings("static-access") 238 | public int run(String[] args) throws Exception { 239 | Options options = new Options(); 240 | 241 | options.addOption(OptionBuilder.withArgName("path").hasArg() 242 | .withDescription("input path").create(INPUT_OPTION)); 243 | options.addOption(OptionBuilder.withArgName("path").hasArg() 244 | .withDescription("output path").create(OUTPUT_OPTION)); 245 | options.addOption(OptionBuilder.withArgName("num").hasArg() 246 | .withDescription("number of terms").create(COUNT_OPTION)); 247 | 248 | CommandLine cmdline; 249 | CommandLineParser parser = new GnuParser(); 250 | try { 251 | cmdline = parser.parse(options, args); 252 | } catch (ParseException exp) { 253 | HelpFormatter formatter = new HelpFormatter(); 254 | formatter.printHelp(this.getClass().getName(), options); 255 | ToolRunner.printGenericCommandUsage(System.out); 256 | System.err.println("Error parsing command line: " + exp.getMessage()); 257 | return -1; 258 | } 259 | 260 | if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION) || 261 | !cmdline.hasOption(COUNT_OPTION)) { 262 | HelpFormatter formatter = new HelpFormatter(); 263 | formatter.printHelp(this.getClass().getName(), options); 264 | ToolRunner.printGenericCommandUsage(System.out); 265 | return -1; 266 | } 267 | 268 | String input = cmdline.getOptionValue(INPUT_OPTION); 269 | String output = cmdline.getOptionValue(OUTPUT_OPTION); 270 | 271 | LOG.info("Tool name: " + ComputeTermStatistics.class.getSimpleName()); 272 | LOG.info(" - input: " + input); 273 | LOG.info(" - output: " + output); 274 | 275 | Configuration conf = getConf(); 276 | 277 | conf.set(HADOOP_OUTPUT_OPTION, output); 278 | conf.setInt(HADOOP_TERMS_COUNT_OPTION, 279 | Integer.parseInt(cmdline.getOptionValue(COUNT_OPTION))); 280 | conf.set("mapreduce.map.memory.mb", "2048"); 281 | conf.set("mapreduce.map.java.opts", "-Xmx2048m"); 282 | conf.set("mapreduce.reduce.memory.mb", "2048"); 283 | conf.set("mapreduce.reduce.java.opts", "-Xmx2048m"); 284 | 285 | Job job = new Job(conf, BuildDictionary.class.getSimpleName() + ":" + input); 286 | 287 | job.setJarByClass(BuildDictionary.class); 288 | job.setNumReduceTasks(1); 289 | 290 | FileInputFormat.setInputPaths(job, new Path(input)); 291 | FileOutputFormat.setOutputPath(job, new Path(output)); 292 | 293 | job.setInputFormatClass(SequenceFileInputFormat.class); 294 | job.setOutputFormatClass(NullOutputFormat.class); 295 | 296 | job.setMapOutputKeyClass(Text.class); 297 | job.setMapOutputValueClass(PairOfIntLong.class); 298 | job.setOutputKeyClass(Text.class); 299 | job.setSortComparatorClass(DictionaryTransformationStrategy.WritableComparator.class); 300 | 301 | job.setMapperClass(Mapper.class); 302 | job.setReducerClass(MyReducer.class); 303 | 304 | FileSystem.get(getConf()).delete(new Path(output), true); 305 | long startTime = System.currentTimeMillis(); 306 | job.waitForCompletion(true); 307 | LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); 308 | 309 | return 0; 310 | } 311 | 312 | /** 313 | * Dispatches command-line arguments to the tool via the ToolRunner. 
314 | */ 315 | public static void main(String[] args) throws Exception { 316 | LOG.info("Running " + BuildDictionary.class.getCanonicalName() + " with args " 317 | + Arrays.toString(args)); 318 | ToolRunner.run(new BuildDictionary(), args); 319 | } 320 | } 321 | -------------------------------------------------------------------------------- /src/main/java/org/clueweb/clueweb12/app/BuildPForDocVectors.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ClueWeb Tools: Hadoop tools for manipulating ClueWeb collections 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you 5 | * may not use this file except in compliance with the License. You may 6 | * obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing 14 | * permissions and limitations under the License. 15 | */ 16 | 17 | package org.clueweb.clueweb12.app; 18 | 19 | import java.io.IOException; 20 | import java.util.Arrays; 21 | import java.util.List; 22 | 23 | import org.apache.commons.cli.CommandLine; 24 | import org.apache.commons.cli.CommandLineParser; 25 | import org.apache.commons.cli.GnuParser; 26 | import org.apache.commons.cli.HelpFormatter; 27 | import org.apache.commons.cli.OptionBuilder; 28 | import org.apache.commons.cli.Options; 29 | import org.apache.commons.cli.ParseException; 30 | import org.apache.hadoop.conf.Configured; 31 | import org.apache.hadoop.fs.FileSystem; 32 | import org.apache.hadoop.fs.Path; 33 | import org.apache.hadoop.io.LongWritable; 34 | import org.apache.hadoop.io.Text; 35 | import org.apache.hadoop.mapreduce.Job; 36 | import org.apache.hadoop.mapreduce.Mapper; 37 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 38 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 39 | import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; 40 | import org.apache.hadoop.util.Tool; 41 | import org.apache.hadoop.util.ToolRunner; 42 | import org.apache.log4j.Logger; 43 | import org.apache.lucene.analysis.Analyzer; 44 | import org.clueweb.clueweb12.ClueWeb12WarcRecord; 45 | import org.clueweb.clueweb12.mapreduce.ClueWeb12InputFormat; 46 | import org.clueweb.data.PForDocVector; 47 | import org.clueweb.dictionary.DefaultFrequencySortedDictionary; 48 | import org.clueweb.util.AnalyzerFactory; 49 | import org.jsoup.Jsoup; 50 | 51 | import tl.lin.data.array.IntArrayWritable; 52 | import tl.lin.lucene.AnalyzerUtils; 53 | 54 | public class BuildPForDocVectors extends Configured implements Tool { 55 | private static final Logger LOG = Logger.getLogger(BuildPForDocVectors.class); 56 | 57 | private static enum Records { 58 | TOTAL, PAGES, ERRORS, TOO_LONG 59 | }; 60 | 61 | private static Analyzer ANALYZER; 62 | 63 | private static final int MAX_DOC_LENGTH = 512 * 1024; // Skip document if long than this. 
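// MyMapper (below) emits one output record per page that has a WARC-TREC-ID: the key is the docid
// (Text) and the value is the PFor-compressed termid sequence (IntArrayWritable) obtained by cleaning
// the content with Jsoup, tokenizing with the configured Analyzer, and mapping tokens to termids via
// the dictionary (out-of-dictionary tokens are dropped); overly long or unparseable documents are
// emitted as empty vectors and tallied in the TOO_LONG/ERRORS counters.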
64 | 65 | private static class MyMapper extends 66 | Mapper { 67 | private static final Text DOCID = new Text(); 68 | private static final IntArrayWritable DOC = new IntArrayWritable(); 69 | 70 | private DefaultFrequencySortedDictionary dictionary; 71 | 72 | @Override 73 | public void setup(Context context) throws IOException { 74 | FileSystem fs = FileSystem.get(context.getConfiguration()); 75 | String path = context.getConfiguration().get(DICTIONARY_OPTION); 76 | dictionary = new DefaultFrequencySortedDictionary(path, fs); 77 | 78 | String analyzerType = context.getConfiguration().get(PREPROCESSING); 79 | ANALYZER = AnalyzerFactory.getAnalyzer(analyzerType); 80 | if (ANALYZER == null) { 81 | LOG.error("Error: proprocessing type not recognized. Abort " + this.getClass().getName()); 82 | System.exit(1); 83 | } 84 | } 85 | 86 | @Override 87 | public void map(LongWritable key, ClueWeb12WarcRecord doc, Context context) 88 | throws IOException, InterruptedException { 89 | context.getCounter(Records.TOTAL).increment(1); 90 | 91 | String docid = doc.getHeaderMetadataItem("WARC-TREC-ID"); 92 | if (docid != null) { 93 | DOCID.set(docid); 94 | 95 | context.getCounter(Records.PAGES).increment(1); 96 | try { 97 | String content = doc.getContent(); 98 | 99 | // If the document is excessively long, it usually means that something is wrong (e.g., a 100 | // binary object). Skip so the parsing doesn't choke. 101 | // As an alternative, we might want to consider putting in a timeout, e.g., 102 | // http://stackoverflow.com/questions/2275443/how-to-timeout-a-thread 103 | if (content.length() > MAX_DOC_LENGTH) { 104 | LOG.info("Skipping " + docid + " due to excessive length: " + content.length()); 105 | context.getCounter(Records.TOO_LONG).increment(1); 106 | PForDocVector.toIntArrayWritable(DOC, new int[] {}, 0); 107 | context.write(DOCID, DOC); 108 | return; 109 | } 110 | 111 | String cleaned = Jsoup.parse(content).text(); 112 | List tokens = AnalyzerUtils.parse(ANALYZER, cleaned); 113 | 114 | int len = 0; 115 | int[] termids = new int[tokens.size()]; 116 | for (String token : tokens) { 117 | int id = dictionary.getId(token); 118 | if (id != -1) { 119 | termids[len] = id; 120 | len++; 121 | } 122 | } 123 | 124 | PForDocVector.toIntArrayWritable(DOC, termids, len); 125 | context.write(DOCID, DOC); 126 | } catch (Exception e) { 127 | // If Jsoup throws any exceptions, catch and move on, but emit empty doc. 128 | LOG.info("Error caught processing " + docid); 129 | DOC.setArray(new int[] {}); // Clean up possible corrupted data 130 | context.getCounter(Records.ERRORS).increment(1); 131 | PForDocVector.toIntArrayWritable(DOC, new int[] {}, 0); 132 | context.write(DOCID, DOC); 133 | } 134 | } 135 | } 136 | } 137 | 138 | public static final String INPUT_OPTION = "input"; 139 | public static final String OUTPUT_OPTION = "output"; 140 | public static final String DICTIONARY_OPTION = "dictionary"; 141 | public static final String REDUCERS_OPTION = "reducers"; 142 | public static final String PREPROCESSING = "preprocessing"; 143 | 144 | /** 145 | * Runs this tool. 
146 | */ 147 | @SuppressWarnings("static-access") 148 | public int run(String[] args) throws Exception { 149 | Options options = new Options(); 150 | 151 | options.addOption(OptionBuilder.withArgName("path").hasArg() 152 | .withDescription("input path").create(INPUT_OPTION)); 153 | options.addOption(OptionBuilder.withArgName("path").hasArg() 154 | .withDescription("output path").create(OUTPUT_OPTION)); 155 | options.addOption(OptionBuilder.withArgName("path").hasArg() 156 | .withDescription("dictionary").create(DICTIONARY_OPTION)); 157 | options.addOption(OptionBuilder.withArgName("num").hasArg() 158 | .withDescription("number of reducers").create(REDUCERS_OPTION)); 159 | options.addOption(OptionBuilder.withArgName("string " + AnalyzerFactory.getOptions()).hasArg() 160 | .withDescription("preprocessing").create(PREPROCESSING)); 161 | 162 | CommandLine cmdline; 163 | CommandLineParser parser = new GnuParser(); 164 | try { 165 | cmdline = parser.parse(options, args); 166 | } catch (ParseException exp) { 167 | HelpFormatter formatter = new HelpFormatter(); 168 | formatter.printHelp(this.getClass().getName(), options); 169 | ToolRunner.printGenericCommandUsage(System.out); 170 | System.err.println("Error parsing command line: " + exp.getMessage()); 171 | return -1; 172 | } 173 | 174 | if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION) 175 | || !cmdline.hasOption(DICTIONARY_OPTION) || !cmdline.hasOption(PREPROCESSING)) { 176 | HelpFormatter formatter = new HelpFormatter(); 177 | formatter.printHelp(this.getClass().getName(), options); 178 | ToolRunner.printGenericCommandUsage(System.out); 179 | return -1; 180 | } 181 | 182 | String input = cmdline.getOptionValue(INPUT_OPTION); 183 | String output = cmdline.getOptionValue(OUTPUT_OPTION); 184 | String dictionary = cmdline.getOptionValue(DICTIONARY_OPTION); 185 | String preprocessing = cmdline.getOptionValue(PREPROCESSING); 186 | 187 | Job job = new Job(getConf(), BuildPForDocVectors.class.getSimpleName() + ":" + input); 188 | job.setJarByClass(BuildPForDocVectors.class); 189 | 190 | LOG.info("Tool name: " + BuildPForDocVectors.class.getSimpleName()); 191 | LOG.info(" - input: " + input); 192 | LOG.info(" - output: " + output); 193 | LOG.info(" - dictionary: " + dictionary); 194 | LOG.info(" - preprocessing: " + preprocessing); 195 | 196 | if (cmdline.hasOption(REDUCERS_OPTION)) { 197 | int numReducers = Integer.parseInt(cmdline.getOptionValue(REDUCERS_OPTION)); 198 | LOG.info(" - reducers: " + numReducers); 199 | job.setNumReduceTasks(numReducers); 200 | } else { 201 | job.setNumReduceTasks(0); 202 | } 203 | 204 | FileInputFormat.setInputPaths(job, input); 205 | FileOutputFormat.setOutputPath(job, new Path(output)); 206 | 207 | job.getConfiguration().set(DICTIONARY_OPTION, dictionary); 208 | job.getConfiguration().set(PREPROCESSING, preprocessing); 209 | 210 | job.setInputFormatClass(ClueWeb12InputFormat.class); 211 | job.setOutputFormatClass(SequenceFileOutputFormat.class); 212 | 213 | job.setMapOutputKeyClass(Text.class); 214 | job.setMapOutputValueClass(IntArrayWritable.class); 215 | job.setOutputKeyClass(Text.class); 216 | job.setOutputValueClass(IntArrayWritable.class); 217 | 218 | job.setMapperClass(MyMapper.class); 219 | 220 | FileSystem.get(getConf()).delete(new Path(output), true); 221 | 222 | long startTime = System.currentTimeMillis(); 223 | job.waitForCompletion(true); 224 | LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); 225 | 226 | return 0; 227 | } 228 | 229 | 
/** 230 | * Dispatches command-line arguments to the tool via the ToolRunner. 231 | */ 232 | public static void main(String[] args) throws Exception { 233 | LOG.info("Running " + BuildPForDocVectors.class.getCanonicalName() + " with args " 234 | + Arrays.toString(args)); 235 | ToolRunner.run(new BuildPForDocVectors(), args); 236 | } 237 | } 238 | -------------------------------------------------------------------------------- /src/main/java/org/clueweb/clueweb12/app/BuildVByteDocVectors.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ClueWeb Tools: Hadoop tools for manipulating ClueWeb collections 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you 5 | * may not use this file except in compliance with the License. You may 6 | * obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing 14 | * permissions and limitations under the License. 15 | */ 16 | 17 | package org.clueweb.clueweb12.app; 18 | 19 | import java.io.IOException; 20 | import java.util.Arrays; 21 | import java.util.List; 22 | 23 | import org.apache.commons.cli.CommandLine; 24 | import org.apache.commons.cli.CommandLineParser; 25 | import org.apache.commons.cli.GnuParser; 26 | import org.apache.commons.cli.HelpFormatter; 27 | import org.apache.commons.cli.OptionBuilder; 28 | import org.apache.commons.cli.Options; 29 | import org.apache.commons.cli.ParseException; 30 | import org.apache.hadoop.conf.Configured; 31 | import org.apache.hadoop.fs.FileSystem; 32 | import org.apache.hadoop.fs.Path; 33 | import org.apache.hadoop.io.BytesWritable; 34 | import org.apache.hadoop.io.LongWritable; 35 | import org.apache.hadoop.io.Text; 36 | import org.apache.hadoop.mapreduce.Job; 37 | import org.apache.hadoop.mapreduce.Mapper; 38 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 39 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 40 | import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; 41 | import org.apache.hadoop.util.Tool; 42 | import org.apache.hadoop.util.ToolRunner; 43 | import org.apache.log4j.Logger; 44 | import org.apache.lucene.analysis.Analyzer; 45 | import org.clueweb.clueweb12.ClueWeb12WarcRecord; 46 | import org.clueweb.clueweb12.mapreduce.ClueWeb12InputFormat; 47 | import org.clueweb.data.VByteDocVector; 48 | import org.clueweb.dictionary.DefaultFrequencySortedDictionary; 49 | import org.clueweb.util.AnalyzerFactory; 50 | import org.jsoup.Jsoup; 51 | 52 | import tl.lin.lucene.AnalyzerUtils; 53 | 54 | public class BuildVByteDocVectors extends Configured implements Tool { 55 | private static final Logger LOG = Logger.getLogger(BuildVByteDocVectors.class); 56 | 57 | private static enum Records { 58 | TOTAL, PAGES, ERRORS, TOO_LONG 59 | }; 60 | 61 | private static Analyzer ANALYZER; 62 | 63 | private static final int MAX_DOC_LENGTH = 512 * 1024; // Skip document if long than this. 
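// MyMapper (below) mirrors the PFor version: for each page with a WARC-TREC-ID it emits the docid
// (Text) as key and the VByte-encoded termid sequence (BytesWritable) as value, using Jsoup for
// cleanup, the configured Analyzer for tokenization, and the dictionary for term-to-termid mapping;
// overly long or unparseable documents become empty vectors (TOO_LONG/ERRORS counters).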
64 | 65 | private static class MyMapper extends 66 | Mapper<LongWritable, ClueWeb12WarcRecord, Text, BytesWritable> { 67 | private static final Text DOCID = new Text(); 68 | private static final BytesWritable DOC = new BytesWritable(); 69 | 70 | private DefaultFrequencySortedDictionary dictionary; 71 | 72 | @Override 73 | public void setup(Context context) throws IOException { 74 | FileSystem fs = FileSystem.get(context.getConfiguration()); 75 | String path = context.getConfiguration().get(DICTIONARY_OPTION); 76 | dictionary = new DefaultFrequencySortedDictionary(path, fs); 77 | 78 | String analyzerType = context.getConfiguration().get(PREPROCESSING); 79 | ANALYZER = AnalyzerFactory.getAnalyzer(analyzerType); 80 | if (ANALYZER == null) { 81 | LOG.error("Error: preprocessing type not recognized. Abort " + this.getClass().getName()); 82 | System.exit(1); 83 | } 84 | } 85 | 86 | @Override 87 | public void map(LongWritable key, ClueWeb12WarcRecord doc, Context context) 88 | throws IOException, InterruptedException { 89 | context.getCounter(Records.TOTAL).increment(1); 90 | 91 | String docid = doc.getHeaderMetadataItem("WARC-TREC-ID"); 92 | if (docid != null) { 93 | DOCID.set(docid); 94 | 95 | context.getCounter(Records.PAGES).increment(1); 96 | try { 97 | String content = doc.getContent(); 98 | 99 | // If the document is excessively long, it usually means that something is wrong (e.g., a 100 | // binary object). Skip so the parsing doesn't choke. 101 | // As an alternative, we might want to consider putting in a timeout, e.g., 102 | // http://stackoverflow.com/questions/2275443/how-to-timeout-a-thread 103 | if (content.length() > MAX_DOC_LENGTH) { 104 | DOC.set(new byte[] {}, 0, 0); // Clean up possible corrupted data 105 | context.getCounter(Records.TOO_LONG).increment(1); 106 | VByteDocVector.toBytesWritable(DOC, new int[] {}, 0); 107 | context.write(DOCID, DOC); 108 | return; 109 | } 110 | 111 | String cleaned = Jsoup.parse(content).text(); 112 | List<String> tokens = AnalyzerUtils.parse(ANALYZER, cleaned); 113 | 114 | int len = 0; 115 | int[] termids = new int[tokens.size()]; 116 | for (String token : tokens) { 117 | int id = dictionary.getId(token); 118 | if (id != -1) { 119 | termids[len] = id; 120 | len++; 121 | } 122 | } 123 | 124 | VByteDocVector.toBytesWritable(DOC, termids, len); 125 | context.write(DOCID, DOC); 126 | } catch (Exception e) { 127 | // If Jsoup throws any exceptions, catch and move on, but emit empty doc. 128 | LOG.info("Error caught processing " + docid); 129 | context.getCounter(Records.ERRORS).increment(1); 130 | VByteDocVector.toBytesWritable(DOC, new int[] {}, 0); 131 | context.write(DOCID, DOC); 132 | } 133 | } 134 | } 135 | } 136 | 137 | public static final String INPUT_OPTION = "input"; 138 | public static final String OUTPUT_OPTION = "output"; 139 | public static final String DICTIONARY_OPTION = "dictionary"; 140 | public static final String REDUCERS_OPTION = "reducers"; 141 | public static final String PREPROCESSING = "preprocessing"; 142 | 143 | /** 144 | * Runs this tool.
145 | */ 146 | @SuppressWarnings("static-access") 147 | public int run(String[] args) throws Exception { 148 | Options options = new Options(); 149 | 150 | options.addOption(OptionBuilder.withArgName("path").hasArg() 151 | .withDescription("input path").create(INPUT_OPTION)); 152 | options.addOption(OptionBuilder.withArgName("path").hasArg() 153 | .withDescription("output path").create(OUTPUT_OPTION)); 154 | options.addOption(OptionBuilder.withArgName("path").hasArg() 155 | .withDescription("dictionary").create(DICTIONARY_OPTION)); 156 | options.addOption(OptionBuilder.withArgName("num").hasArg() 157 | .withDescription("number of reducers").create(REDUCERS_OPTION)); 158 | options.addOption(OptionBuilder.withArgName("string " + AnalyzerFactory.getOptions()).hasArg() 159 | .withDescription("preprocessing").create(PREPROCESSING)); 160 | 161 | CommandLine cmdline; 162 | CommandLineParser parser = new GnuParser(); 163 | try { 164 | cmdline = parser.parse(options, args); 165 | } catch (ParseException exp) { 166 | HelpFormatter formatter = new HelpFormatter(); 167 | formatter.printHelp(this.getClass().getName(), options); 168 | ToolRunner.printGenericCommandUsage(System.out); 169 | System.err.println("Error parsing command line: " + exp.getMessage()); 170 | return -1; 171 | } 172 | 173 | if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION) 174 | || !cmdline.hasOption(DICTIONARY_OPTION) || !cmdline.hasOption(PREPROCESSING)) { 175 | HelpFormatter formatter = new HelpFormatter(); 176 | formatter.printHelp(this.getClass().getName(), options); 177 | ToolRunner.printGenericCommandUsage(System.out); 178 | return -1; 179 | } 180 | 181 | String input = cmdline.getOptionValue(INPUT_OPTION); 182 | String output = cmdline.getOptionValue(OUTPUT_OPTION); 183 | String dictionary = cmdline.getOptionValue(DICTIONARY_OPTION); 184 | String preprocessing = cmdline.getOptionValue(PREPROCESSING); 185 | 186 | Job job = new Job(getConf(), BuildVByteDocVectors.class.getSimpleName() + ":" + input); 187 | job.setJarByClass(BuildVByteDocVectors.class); 188 | 189 | LOG.info("Tool name: " + BuildVByteDocVectors.class.getSimpleName()); 190 | LOG.info(" - input: " + input); 191 | LOG.info(" - output: " + output); 192 | LOG.info(" - dictionary: " + dictionary); 193 | LOG.info(" - preprocessing: " + preprocessing); 194 | 195 | if (cmdline.hasOption(REDUCERS_OPTION)) { 196 | int numReducers = Integer.parseInt(cmdline.getOptionValue(REDUCERS_OPTION)); 197 | LOG.info(" - reducers: " + numReducers); 198 | job.setNumReduceTasks(numReducers); 199 | } else { 200 | job.setNumReduceTasks(0); 201 | } 202 | 203 | FileInputFormat.setInputPaths(job, input); 204 | FileOutputFormat.setOutputPath(job, new Path(output)); 205 | 206 | job.getConfiguration().set(DICTIONARY_OPTION, dictionary); 207 | job.getConfiguration().set(PREPROCESSING, preprocessing); 208 | 209 | job.setInputFormatClass(ClueWeb12InputFormat.class); 210 | job.setOutputFormatClass(SequenceFileOutputFormat.class); 211 | 212 | job.setMapOutputKeyClass(Text.class); 213 | job.setMapOutputValueClass(BytesWritable.class); 214 | job.setOutputKeyClass(Text.class); 215 | job.setOutputValueClass(BytesWritable.class); 216 | 217 | job.setMapperClass(MyMapper.class); 218 | 219 | FileSystem.get(getConf()).delete(new Path(output), true); 220 | 221 | long startTime = System.currentTimeMillis(); 222 | job.waitForCompletion(true); 223 | LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); 224 | 225 | return 0; 226 | } 227 | 228 | /** 
229 | * Dispatches command-line arguments to the tool via the ToolRunner. 230 | */ 231 | public static void main(String[] args) throws Exception { 232 | LOG.info("Running " + BuildVByteDocVectors.class.getCanonicalName() + " with args " 233 | + Arrays.toString(args)); 234 | ToolRunner.run(new BuildVByteDocVectors(), args); 235 | } 236 | } 237 | -------------------------------------------------------------------------------- /src/main/java/org/clueweb/clueweb12/app/BuildWarcTrecIdMapping.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ClueWeb Tools: Hadoop tools for manipulating ClueWeb collections 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you 5 | * may not use this file except in compliance with the License. You may 6 | * obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing 14 | * permissions and limitations under the License. 15 | */ 16 | 17 | package org.clueweb.clueweb12.app; 18 | 19 | import java.io.BufferedReader; 20 | import java.io.File; 21 | import java.io.FileInputStream; 22 | import java.io.IOException; 23 | import java.io.InputStreamReader; 24 | import java.io.PrintStream; 25 | import java.util.concurrent.ExecutorService; 26 | import java.util.concurrent.Executors; 27 | 28 | import org.apache.commons.cli.CommandLine; 29 | import org.apache.commons.cli.CommandLineParser; 30 | import org.apache.commons.cli.GnuParser; 31 | import org.apache.commons.cli.HelpFormatter; 32 | import org.apache.commons.cli.Option; 33 | import org.apache.commons.cli.OptionBuilder; 34 | import org.apache.commons.cli.Options; 35 | import org.apache.commons.cli.ParseException; 36 | import org.apache.log4j.Logger; 37 | import org.apache.lucene.analysis.Analyzer; 38 | import org.apache.lucene.analysis.standard.StandardAnalyzer; 39 | import org.apache.lucene.document.Document; 40 | import org.apache.lucene.document.Field; 41 | import org.apache.lucene.document.FieldType; 42 | import org.apache.lucene.index.FieldInfo.IndexOptions; 43 | import org.apache.lucene.index.IndexWriter; 44 | import org.apache.lucene.index.IndexWriterConfig; 45 | import org.apache.lucene.index.IndexWriterConfig.OpenMode; 46 | import org.apache.lucene.store.Directory; 47 | import org.apache.lucene.store.FSDirectory; 48 | import org.apache.lucene.util.Version; 49 | import org.apache.tools.bzip2.CBZip2InputStream; 50 | import org.clueweb.data.WarcTrecIdMapping; 51 | 52 | public class BuildWarcTrecIdMapping { 53 | private static final Logger LOG = Logger.getLogger(BuildWarcTrecIdMapping.class); 54 | 55 | public static final Analyzer ANALYZER = new StandardAnalyzer(Version.LUCENE_43); 56 | 57 | static final FieldType FIELD_OPTIONS = new FieldType(); 58 | 59 | static { 60 | FIELD_OPTIONS.setIndexed(true); 61 | FIELD_OPTIONS.setIndexOptions(IndexOptions.DOCS_ONLY); 62 | FIELD_OPTIONS.setStored(true); 63 | FIELD_OPTIONS.setTokenized(false); 64 | } 65 | 66 | private static final int DEFAULT_NUM_THREADS = 4; 67 | 68 | private static final String INPUT_OPTION = "input"; 69 | private static final String INDEX_OPTION = "index"; 70 | private static final String MAX_OPTION = "max"; 71 | private static final String OPTIMIZE_OPTION = "optimize"; 72 
| private static final String THREADS_OPTION = "threads"; 73 | 74 | @SuppressWarnings("static-access") 75 | public static void main(String[] args) throws Exception { 76 | Options options = new Options(); 77 | options.addOption(OptionBuilder.withArgName("path").hasArg() 78 | .withDescription("bz2-compressed WARC-TREC-ID mapping data file").create(INPUT_OPTION)); 79 | options.addOption(OptionBuilder.withArgName("dir").hasArg() 80 | .withDescription("index location").create(INDEX_OPTION)); 81 | options.addOption(OptionBuilder.withArgName("num").hasArg() 82 | .withDescription("maximum number of documents to index").create(MAX_OPTION)); 83 | options.addOption(OptionBuilder.withArgName("num").hasArg() 84 | .withDescription("number of indexing threads").create(THREADS_OPTION)); 85 | 86 | options.addOption(new Option(OPTIMIZE_OPTION, "merge indexes into a single segment")); 87 | 88 | CommandLine cmdline = null; 89 | CommandLineParser parser = new GnuParser(); 90 | try { 91 | cmdline = parser.parse(options, args); 92 | } catch (ParseException exp) { 93 | System.err.println("Error parsing command line: " + exp.getMessage()); 94 | System.exit(-1); 95 | } 96 | 97 | if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(INDEX_OPTION)) { 98 | HelpFormatter formatter = new HelpFormatter(); 99 | formatter.printHelp(BuildWarcTrecIdMapping.class.getCanonicalName(), options); 100 | System.exit(-1); 101 | } 102 | 103 | String indexPath = cmdline.getOptionValue(INDEX_OPTION); 104 | int maxdocs = cmdline.hasOption(MAX_OPTION) ? 105 | Integer.parseInt(cmdline.getOptionValue(MAX_OPTION)) : Integer.MAX_VALUE; 106 | int threads = cmdline.hasOption(THREADS_OPTION) ? 107 | Integer.parseInt(cmdline.getOptionValue(THREADS_OPTION)) : DEFAULT_NUM_THREADS; 108 | 109 | long startTime = System.currentTimeMillis(); 110 | 111 | String path = cmdline.getOptionValue(INPUT_OPTION); 112 | PrintStream out = new PrintStream(System.out, true, "UTF-8"); 113 | 114 | Directory dir = FSDirectory.open(new File(indexPath)); 115 | IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_43, ANALYZER); 116 | config.setOpenMode(OpenMode.CREATE); 117 | 118 | IndexWriter writer = new IndexWriter(dir, config); 119 | LOG.info("Creating index at " + indexPath); 120 | LOG.info("Indexing with " + threads + " threads"); 121 | 122 | FileInputStream fis = null; 123 | BufferedReader br = null; 124 | 125 | try { 126 | fis = new FileInputStream(new File(path)); 127 | byte[] ignoreBytes = new byte[2]; 128 | fis.read(ignoreBytes); // Skip the "BZ" magic bytes, which CBZip2InputStream does not expect 129 | br = new BufferedReader(new InputStreamReader(new CBZip2InputStream(fis), "UTF8")); 130 | 131 | ExecutorService executor = Executors.newFixedThreadPool(threads); 132 | int cnt = 0; 133 | String s; 134 | while ((s = br.readLine()) != null) { 135 | Runnable worker = new AddDocumentRunnable(writer, s); 136 | executor.execute(worker); 137 | 138 | cnt++; 139 | if (cnt % 1000000 == 0) { 140 | LOG.info(cnt + " records added"); 141 | } 142 | if (cnt >= maxdocs) { 143 | break; 144 | } 145 | } 146 | 147 | executor.shutdown(); 148 | // Wait until all threads are finished 149 | while (!executor.isTerminated()) {} 150 | 151 | LOG.info("Total of " + cnt + " records indexed."); 152 | 153 | if (cmdline.hasOption(OPTIMIZE_OPTION)) { 154 | LOG.info("Merging segments..."); 155 | writer.forceMerge(1); 156 | LOG.info("Done!"); 157 | } 158 | 159 | LOG.info("Total elapsed time: " + (System.currentTimeMillis() - startTime) + "ms"); 160 | } catch (Exception e) { 161 | e.printStackTrace(); 162 | } finally { 163 |
writer.close(); 164 | dir.close(); 165 | out.close(); 166 | br.close(); 167 | fis.close(); 168 | } 169 | } 170 | 171 | private static class AddDocumentRunnable implements Runnable { 172 | private final IndexWriter writer; 173 | private final String s; 174 | 175 | AddDocumentRunnable(IndexWriter writer, String s) { 176 | this.writer = writer; 177 | this.s = s.split(",")[0]; 178 | } 179 | 180 | @Override 181 | public void run() { 182 | Document doc = new Document(); 183 | doc.add(new Field(WarcTrecIdMapping.IndexField.WARC_TREC_ID.name, s, FIELD_OPTIONS)); 184 | 185 | try { 186 | writer.addDocument(doc); 187 | } catch (IOException e) { 188 | e.printStackTrace(); 189 | } 190 | } 191 | } 192 | } 193 | -------------------------------------------------------------------------------- /src/main/java/org/clueweb/clueweb12/app/ComputeTermStatistics.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ClueWeb Tools: Hadoop tools for manipulating ClueWeb collections 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you 5 | * may not use this file except in compliance with the License. You may 6 | * obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing 14 | * permissions and limitations under the License. 15 | */ 16 | 17 | package org.clueweb.clueweb12.app; 18 | 19 | import java.io.IOException; 20 | import java.util.Arrays; 21 | import java.util.Map; 22 | 23 | import org.apache.commons.cli.CommandLine; 24 | import org.apache.commons.cli.CommandLineParser; 25 | import org.apache.commons.cli.GnuParser; 26 | import org.apache.commons.cli.HelpFormatter; 27 | import org.apache.commons.cli.OptionBuilder; 28 | import org.apache.commons.cli.Options; 29 | import org.apache.commons.cli.ParseException; 30 | import org.apache.hadoop.conf.Configured; 31 | import org.apache.hadoop.fs.FileSystem; 32 | import org.apache.hadoop.fs.Path; 33 | import org.apache.hadoop.io.LongWritable; 34 | import org.apache.hadoop.io.Text; 35 | import org.apache.hadoop.mapreduce.Job; 36 | import org.apache.hadoop.mapreduce.Mapper; 37 | import org.apache.hadoop.mapreduce.Reducer; 38 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 39 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 40 | import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; 41 | import org.apache.hadoop.util.Tool; 42 | import org.apache.hadoop.util.ToolRunner; 43 | import org.apache.log4j.Logger; 44 | import org.apache.lucene.analysis.Analyzer; 45 | import org.clueweb.clueweb12.ClueWeb12WarcRecord; 46 | import org.clueweb.clueweb12.mapreduce.ClueWeb12InputFormat; 47 | import org.clueweb.util.AnalyzerFactory; 48 | import org.jsoup.Jsoup; 49 | 50 | import tl.lin.data.pair.PairOfIntLong; 51 | import tl.lin.lucene.AnalyzerUtils; 52 | 53 | import com.google.common.collect.Maps; 54 | 55 | public class ComputeTermStatistics extends Configured implements Tool { 56 | private static final Logger LOG = Logger.getLogger(ComputeTermStatistics.class); 57 | 58 | private static enum Records { 59 | TOTAL, PAGES, ERRORS, SKIPPED 60 | }; 61 | 62 | private static Analyzer ANALYZER; 63 | 64 | private static final String HADOOP_DF_MIN_OPTION = "df.min"; 
65 | private static final String HADOOP_DF_MAX_OPTION = "df.max"; 66 | 67 | private static final int MAX_TOKEN_LENGTH = 64; // Throw away tokens longer than this. 68 | private static final int MIN_DF_DEFAULT = 100; // Throw away terms with df less than this. 69 | private static final int MAX_DOC_LENGTH = 512 * 1024; // Skip document if longer than this. 70 | 71 | private static class MyMapper extends 72 | Mapper<LongWritable, ClueWeb12WarcRecord, Text, PairOfIntLong> { 73 | private static final Text term = new Text(); 74 | private static final PairOfIntLong pair = new PairOfIntLong(); 75 | 76 | @Override 77 | public void setup(Context context) throws IOException { 78 | 79 | String analyzerType = context.getConfiguration().get(PREPROCESSING); 80 | ANALYZER = AnalyzerFactory.getAnalyzer(analyzerType); 81 | if (ANALYZER == null) { 82 | LOG.error("Error: preprocessing type not recognized. Abort " + this.getClass().getName()); 83 | System.exit(1); 84 | } 85 | } 86 | 87 | @Override 88 | public void map(LongWritable key, ClueWeb12WarcRecord doc, Context context) throws IOException, 89 | InterruptedException { 90 | 91 | context.getCounter(Records.TOTAL).increment(1); 92 | 93 | String docid = doc.getHeaderMetadataItem("WARC-TREC-ID"); 94 | if (docid != null) { 95 | context.getCounter(Records.PAGES).increment(1); 96 | try { 97 | String content = doc.getContent(); 98 | 99 | // If the document is excessively long, it usually means that something is wrong (e.g., a 100 | // binary object). Skip so the parsing doesn't choke. 101 | // As an alternative, we might want to consider putting in a timeout, e.g., 102 | // http://stackoverflow.com/questions/2275443/how-to-timeout-a-thread 103 | if (content.length() > MAX_DOC_LENGTH) { 104 | LOG.info("Skipping " + docid + " due to excessive length: " + content.length()); 105 | context.getCounter(Records.SKIPPED).increment(1); 106 | return; 107 | } 108 | 109 | String cleaned = Jsoup.parse(content).text(); 110 | Map<String, Integer> map = Maps.newHashMap(); 111 | for (String term : AnalyzerUtils.parse(ANALYZER, cleaned)) { 112 | if (term.length() > MAX_TOKEN_LENGTH) { 113 | continue; 114 | } 115 | 116 | if (map.containsKey(term)) { 117 | map.put(term, map.get(term) + 1); 118 | } else { 119 | map.put(term, 1); 120 | } 121 | } 122 | 123 | for (Map.Entry<String, Integer> entry : map.entrySet()) { 124 | term.set(entry.getKey()); 125 | pair.set(1, entry.getValue()); 126 | context.write(term, pair); 127 | } 128 | } catch (Exception e) { 129 | // If Jsoup throws any exceptions, catch and move on.
130 | LOG.info("Error caught processing " + docid); 131 | context.getCounter(Records.ERRORS).increment(1); 132 | } 133 | } 134 | } 135 | } 136 | 137 | private static class MyCombiner extends Reducer { 138 | private static final PairOfIntLong output = new PairOfIntLong(); 139 | 140 | @Override 141 | public void reduce(Text key, Iterable values, Context context) 142 | throws IOException, InterruptedException { 143 | int df = 0; 144 | long cf = 0; 145 | for (PairOfIntLong pair : values) { 146 | df += pair.getLeftElement(); 147 | cf += pair.getRightElement(); 148 | } 149 | 150 | output.set(df, cf); 151 | context.write(key, output); 152 | } 153 | } 154 | 155 | private static class MyReducer extends Reducer { 156 | private static final PairOfIntLong output = new PairOfIntLong(); 157 | private int dfMin, dfMax; 158 | 159 | @Override 160 | public void setup(Reducer.Context context) { 161 | dfMin = context.getConfiguration().getInt(HADOOP_DF_MIN_OPTION, MIN_DF_DEFAULT); 162 | dfMax = context.getConfiguration().getInt(HADOOP_DF_MAX_OPTION, Integer.MAX_VALUE); 163 | LOG.info("dfMin = " + dfMin); 164 | } 165 | 166 | @Override 167 | public void reduce(Text key, Iterable values, Context context) 168 | throws IOException, InterruptedException { 169 | int df = 0; 170 | long cf = 0; 171 | for (PairOfIntLong pair : values) { 172 | df += pair.getLeftElement(); 173 | cf += pair.getRightElement(); 174 | } 175 | if (df < dfMin || df > dfMax) { 176 | return; 177 | } 178 | output.set(df, cf); 179 | context.write(key, output); 180 | } 181 | } 182 | 183 | public static final String INPUT_OPTION = "input"; 184 | public static final String OUTPUT_OPTION = "output"; 185 | public static final String DF_MIN_OPTION = "dfMin"; 186 | public static final String PREPROCESSING = "preprocessing"; 187 | 188 | /** 189 | * Runs this tool. 
190 | */ 191 | @SuppressWarnings("static-access") 192 | public int run(String[] args) throws Exception { 193 | Options options = new Options(); 194 | 195 | options.addOption(OptionBuilder.withArgName("path").hasArg() 196 | .withDescription("input path").create(INPUT_OPTION)); 197 | options.addOption(OptionBuilder.withArgName("path").hasArg() 198 | .withDescription("output path").create(OUTPUT_OPTION)); 199 | options.addOption(OptionBuilder.withArgName("num").hasArg() 200 | .withDescription("minimum df").create(DF_MIN_OPTION)); 201 | options.addOption(OptionBuilder.withArgName("string " + AnalyzerFactory.getOptions()).hasArg() 202 | .withDescription("preprocessing").create(PREPROCESSING)); 203 | 204 | CommandLine cmdline; 205 | CommandLineParser parser = new GnuParser(); 206 | try { 207 | cmdline = parser.parse(options, args); 208 | } catch (ParseException exp) { 209 | HelpFormatter formatter = new HelpFormatter(); 210 | formatter.printHelp(this.getClass().getName(), options); 211 | ToolRunner.printGenericCommandUsage(System.out); 212 | System.err.println("Error parsing command line: " + exp.getMessage()); 213 | return -1; 214 | } 215 | 216 | if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION) 217 | || !cmdline.hasOption(PREPROCESSING)) { 218 | HelpFormatter formatter = new HelpFormatter(); 219 | formatter.printHelp(this.getClass().getName(), options); 220 | ToolRunner.printGenericCommandUsage(System.out); 221 | return -1; 222 | } 223 | 224 | String input = cmdline.getOptionValue(INPUT_OPTION); 225 | String output = cmdline.getOptionValue(OUTPUT_OPTION); 226 | String preprocessing = cmdline.getOptionValue(PREPROCESSING); 227 | 228 | LOG.info("Tool name: " + ComputeTermStatistics.class.getSimpleName()); 229 | LOG.info(" - input: " + input); 230 | LOG.info(" - output: " + output); 231 | LOG.info(" - preprocessing: " + preprocessing); 232 | 233 | getConf().set(PREPROCESSING, preprocessing); 234 | 235 | Job job = new Job(getConf(), ComputeTermStatistics.class.getSimpleName() + ":" + input); 236 | job.setJarByClass(ComputeTermStatistics.class); 237 | 238 | job.setNumReduceTasks(100); 239 | 240 | if (cmdline.hasOption(DF_MIN_OPTION)) { 241 | int dfMin = Integer.parseInt(cmdline.getOptionValue(DF_MIN_OPTION)); 242 | LOG.info(" - dfMin: " + dfMin); 243 | job.getConfiguration().setInt(HADOOP_DF_MIN_OPTION, dfMin); 244 | } 245 | 246 | FileInputFormat.setInputPaths(job, input); 247 | FileOutputFormat.setOutputPath(job, new Path(output)); 248 | 249 | job.setInputFormatClass(ClueWeb12InputFormat.class); 250 | job.setOutputFormatClass(SequenceFileOutputFormat.class); 251 | 252 | job.setMapOutputKeyClass(Text.class); 253 | job.setMapOutputValueClass(PairOfIntLong.class); 254 | job.setOutputKeyClass(Text.class); 255 | job.setOutputValueClass(PairOfIntLong.class); 256 | 257 | job.setMapperClass(MyMapper.class); 258 | job.setCombinerClass(MyCombiner.class); 259 | job.setReducerClass(MyReducer.class); 260 | 261 | FileSystem.get(getConf()).delete(new Path(output), true); 262 | 263 | long startTime = System.currentTimeMillis(); 264 | job.waitForCompletion(true); 265 | LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); 266 | 267 | return 0; 268 | } 269 | 270 | /** 271 | * Dispatches command-line arguments to the tool via the ToolRunner. 
272 | */ 273 | public static void main(String[] args) throws Exception { 274 | LOG.info("Running " + ComputeTermStatistics.class.getCanonicalName() + " with args " 275 | + Arrays.toString(args)); 276 | ToolRunner.run(new ComputeTermStatistics(), args); 277 | } 278 | } 279 | -------------------------------------------------------------------------------- /src/main/java/org/clueweb/clueweb12/app/CountWarcRecordsNew.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ClueWeb Tools: Hadoop tools for manipulating ClueWeb collections 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you 5 | * may not use this file except in compliance with the License. You may 6 | * obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing 14 | * permissions and limitations under the License. 15 | */ 16 | 17 | package org.clueweb.clueweb12.app; 18 | 19 | import java.io.IOException; 20 | import java.util.Arrays; 21 | 22 | import org.apache.commons.cli.CommandLine; 23 | import org.apache.commons.cli.CommandLineParser; 24 | import org.apache.commons.cli.GnuParser; 25 | import org.apache.commons.cli.HelpFormatter; 26 | import org.apache.commons.cli.OptionBuilder; 27 | import org.apache.commons.cli.Options; 28 | import org.apache.commons.cli.ParseException; 29 | import org.apache.hadoop.conf.Configured; 30 | import org.apache.hadoop.io.LongWritable; 31 | import org.apache.hadoop.io.NullWritable; 32 | import org.apache.hadoop.mapreduce.Counters; 33 | import org.apache.hadoop.mapreduce.Job; 34 | import org.apache.hadoop.mapreduce.Mapper; 35 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 36 | import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat; 37 | import org.apache.hadoop.util.Tool; 38 | import org.apache.hadoop.util.ToolRunner; 39 | import org.apache.log4j.Logger; 40 | import org.clueweb.clueweb12.ClueWeb12WarcRecord; 41 | import org.clueweb.clueweb12.mapreduce.ClueWeb12InputFormat; 42 | 43 | public class CountWarcRecordsNew extends Configured implements Tool { 44 | private static final Logger LOG = Logger.getLogger(CountWarcRecordsNew.class); 45 | 46 | private static enum Records { TOTAL, PAGES }; 47 | 48 | private static class MyMapper 49 | extends Mapper { 50 | @Override 51 | public void map(LongWritable key, ClueWeb12WarcRecord doc, Context context) 52 | throws IOException, InterruptedException { 53 | context.getCounter(Records.TOTAL).increment(1); 54 | 55 | String docid = doc.getHeaderMetadataItem("WARC-TREC-ID"); 56 | if (docid != null) { 57 | context.getCounter(Records.PAGES).increment(1); 58 | } 59 | } 60 | } 61 | 62 | public CountWarcRecordsNew() {} 63 | 64 | public static final String INPUT_OPTION = "input"; 65 | 66 | /** 67 | * Runs this tool. 
68 | */ 69 | @SuppressWarnings("static-access") 70 | public int run(String[] args) throws Exception { 71 | Options options = new Options(); 72 | 73 | options.addOption(OptionBuilder.withArgName("path").hasArg() 74 | .withDescription("input path").create(INPUT_OPTION)); 75 | 76 | CommandLine cmdline; 77 | CommandLineParser parser = new GnuParser(); 78 | try { 79 | cmdline = parser.parse(options, args); 80 | } catch (ParseException exp) { 81 | HelpFormatter formatter = new HelpFormatter(); 82 | formatter.printHelp(this.getClass().getName(), options); 83 | ToolRunner.printGenericCommandUsage(System.out); 84 | System.err.println("Error parsing command line: " + exp.getMessage()); 85 | return -1; 86 | } 87 | 88 | if (!cmdline.hasOption(INPUT_OPTION)) { 89 | HelpFormatter formatter = new HelpFormatter(); 90 | formatter.printHelp(this.getClass().getName(), options); 91 | ToolRunner.printGenericCommandUsage(System.out); 92 | return -1; 93 | } 94 | 95 | String input = cmdline.getOptionValue(INPUT_OPTION); 96 | 97 | LOG.info("Tool name: " + CountWarcRecordsNew.class.getSimpleName()); 98 | LOG.info(" - input: " + input); 99 | 100 | Job job = new Job(getConf(), CountWarcRecordsNew.class.getSimpleName() + ":" + input); 101 | job.setJarByClass(CountWarcRecordsNew.class); 102 | job.setNumReduceTasks(0); 103 | 104 | FileInputFormat.addInputPaths(job, input); 105 | 106 | job.setInputFormatClass(ClueWeb12InputFormat.class); 107 | job.setOutputFormatClass(NullOutputFormat.class); 108 | job.setMapperClass(MyMapper.class); 109 | 110 | job.waitForCompletion(true); 111 | 112 | Counters counters = job.getCounters(); 113 | int numDocs = (int) counters.findCounter(Records.PAGES).getValue(); 114 | LOG.info("Read " + numDocs + " docs."); 115 | 116 | return 0; 117 | } 118 | 119 | /** 120 | * Dispatches command-line arguments to the tool via the ToolRunner. 121 | */ 122 | public static void main(String[] args) throws Exception { 123 | LOG.info("Running " + CountWarcRecordsNew.class.getCanonicalName() + " with args " 124 | + Arrays.toString(args)); 125 | ToolRunner.run(new CountWarcRecordsNew(), args); 126 | } 127 | } 128 | -------------------------------------------------------------------------------- /src/main/java/org/clueweb/clueweb12/app/CountWarcRecordsOld.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ClueWeb Tools: Hadoop tools for manipulating ClueWeb collections 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you 5 | * may not use this file except in compliance with the License. You may 6 | * obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing 14 | * permissions and limitations under the License. 
15 | */ 16 | 17 | package org.clueweb.clueweb12.app; 18 | 19 | import java.io.IOException; 20 | import java.util.Arrays; 21 | 22 | import org.apache.commons.cli.CommandLine; 23 | import org.apache.commons.cli.CommandLineParser; 24 | import org.apache.commons.cli.GnuParser; 25 | import org.apache.commons.cli.HelpFormatter; 26 | import org.apache.commons.cli.OptionBuilder; 27 | import org.apache.commons.cli.Options; 28 | import org.apache.commons.cli.ParseException; 29 | import org.apache.hadoop.conf.Configured; 30 | import org.apache.hadoop.io.NullWritable; 31 | import org.apache.hadoop.io.Writable; 32 | import org.apache.hadoop.mapred.Counters; 33 | import org.apache.hadoop.mapred.FileInputFormat; 34 | import org.apache.hadoop.mapred.JobClient; 35 | import org.apache.hadoop.mapred.JobConf; 36 | import org.apache.hadoop.mapred.MapReduceBase; 37 | import org.apache.hadoop.mapred.Mapper; 38 | import org.apache.hadoop.mapred.OutputCollector; 39 | import org.apache.hadoop.mapred.Reporter; 40 | import org.apache.hadoop.mapred.RunningJob; 41 | import org.apache.hadoop.mapred.lib.NullOutputFormat; 42 | import org.apache.hadoop.util.Tool; 43 | import org.apache.hadoop.util.ToolRunner; 44 | import org.apache.log4j.Logger; 45 | import org.clueweb.clueweb12.ClueWeb12WarcRecord; 46 | import org.clueweb.clueweb12.mapred.ClueWeb12InputFormat; 47 | 48 | public class CountWarcRecordsOld extends Configured implements Tool { 49 | private static final Logger LOG = Logger.getLogger(CountWarcRecordsOld.class); 50 | 51 | private static enum Records { TOTAL, PAGES }; 52 | 53 | private static class MyMapper extends MapReduceBase implements 54 | Mapper { 55 | 56 | public void configure(JobConf job) {} 57 | 58 | public void map(Writable key, ClueWeb12WarcRecord doc, 59 | OutputCollector output, Reporter reporter) throws IOException { 60 | reporter.incrCounter(Records.TOTAL, 1); 61 | 62 | String docid = doc.getHeaderMetadataItem("WARC-TREC-ID"); 63 | if (docid != null) { 64 | reporter.incrCounter(Records.PAGES, 1); 65 | } 66 | } 67 | } 68 | 69 | public CountWarcRecordsOld() { 70 | } 71 | 72 | public static final String INPUT_OPTION = "input"; 73 | 74 | /** 75 | * Runs this tool. 
76 | */ 77 | @SuppressWarnings("static-access") 78 | public int run(String[] args) throws Exception { 79 | Options options = new Options(); 80 | 81 | options.addOption(OptionBuilder.withArgName("path").hasArg() 82 | .withDescription("input path").create(INPUT_OPTION)); 83 | 84 | CommandLine cmdline; 85 | CommandLineParser parser = new GnuParser(); 86 | try { 87 | cmdline = parser.parse(options, args); 88 | } catch (ParseException exp) { 89 | HelpFormatter formatter = new HelpFormatter(); 90 | formatter.printHelp(this.getClass().getName(), options); 91 | ToolRunner.printGenericCommandUsage(System.out); 92 | System.err.println("Error parsing command line: " + exp.getMessage()); 93 | return -1; 94 | } 95 | 96 | if (!cmdline.hasOption(INPUT_OPTION)) { 97 | HelpFormatter formatter = new HelpFormatter(); 98 | formatter.printHelp(this.getClass().getName(), options); 99 | ToolRunner.printGenericCommandUsage(System.out); 100 | return -1; 101 | } 102 | 103 | String input = cmdline.getOptionValue(INPUT_OPTION); 104 | 105 | LOG.info("Tool name: " + CountWarcRecordsOld.class.getSimpleName()); 106 | LOG.info(" - input: " + input); 107 | 108 | JobConf conf = new JobConf(getConf(), CountWarcRecordsOld.class); 109 | conf.setJobName(CountWarcRecordsOld.class.getSimpleName() + ":" + input); 110 | 111 | conf.setNumReduceTasks(0); 112 | 113 | FileInputFormat.addInputPaths(conf, input); 114 | 115 | conf.setInputFormat(ClueWeb12InputFormat.class); 116 | conf.setOutputFormat(NullOutputFormat.class); 117 | conf.setMapperClass(MyMapper.class); 118 | 119 | RunningJob job = JobClient.runJob(conf); 120 | Counters counters = job.getCounters(); 121 | int numDocs = (int) counters.findCounter(Records.PAGES).getCounter(); 122 | 123 | LOG.info("Read " + numDocs + " docs."); 124 | 125 | return 0; 126 | } 127 | 128 | /** 129 | * Dispatches command-line arguments to the tool via the ToolRunner. 130 | */ 131 | public static void main(String[] args) throws Exception { 132 | LOG.info("Running " + CountWarcRecordsOld.class.getCanonicalName() + " with args " 133 | + Arrays.toString(args)); 134 | ToolRunner.run(new CountWarcRecordsOld(), args); 135 | } 136 | } 137 | -------------------------------------------------------------------------------- /src/main/java/org/clueweb/clueweb12/app/DumpWarcRecordsToPlainText.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ClueWeb Tools: Hadoop tools for manipulating ClueWeb collections 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you 5 | * may not use this file except in compliance with the License. You may 6 | * obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing 14 | * permissions and limitations under the License. 
15 | */ 16 | 17 | package org.clueweb.clueweb12.app; 18 | 19 | import java.io.IOException; 20 | import java.util.Arrays; 21 | 22 | import org.apache.commons.cli.CommandLine; 23 | import org.apache.commons.cli.CommandLineParser; 24 | import org.apache.commons.cli.GnuParser; 25 | import org.apache.commons.cli.HelpFormatter; 26 | import org.apache.commons.cli.OptionBuilder; 27 | import org.apache.commons.cli.Options; 28 | import org.apache.commons.cli.ParseException; 29 | import org.apache.hadoop.conf.Configured; 30 | import org.apache.hadoop.fs.Path; 31 | import org.apache.hadoop.io.Text; 32 | import org.apache.hadoop.io.Writable; 33 | import org.apache.hadoop.mapred.Counters; 34 | import org.apache.hadoop.mapred.FileInputFormat; 35 | import org.apache.hadoop.mapred.FileOutputFormat; 36 | import org.apache.hadoop.mapred.JobClient; 37 | import org.apache.hadoop.mapred.JobConf; 38 | import org.apache.hadoop.mapred.MapReduceBase; 39 | import org.apache.hadoop.mapred.Mapper; 40 | import org.apache.hadoop.mapred.OutputCollector; 41 | import org.apache.hadoop.mapred.Reporter; 42 | import org.apache.hadoop.mapred.RunningJob; 43 | import org.apache.hadoop.mapred.TextOutputFormat; 44 | import org.apache.hadoop.util.Tool; 45 | import org.apache.hadoop.util.ToolRunner; 46 | import org.apache.log4j.Logger; 47 | import org.apache.lucene.analysis.Analyzer; 48 | import org.apache.lucene.analysis.standard.StandardAnalyzer; 49 | import org.apache.lucene.util.Version; 50 | import org.clueweb.clueweb12.ClueWeb12WarcRecord; 51 | import org.clueweb.clueweb12.mapred.ClueWeb12InputFormat; 52 | import org.jsoup.Jsoup; 53 | 54 | import tl.lin.lucene.AnalyzerUtils; 55 | 56 | import com.google.common.base.Joiner; 57 | 58 | public class DumpWarcRecordsToPlainText extends Configured implements Tool { 59 | private static final Logger LOG = Logger.getLogger(DumpWarcRecordsToPlainText.class); 60 | 61 | private static enum Records { TOTAL, PAGES, ERRORS }; 62 | private static final Analyzer ANALYZER = new StandardAnalyzer(Version.LUCENE_43); 63 | private static final Joiner JOINER = Joiner.on("|"); 64 | 65 | private static class MyMapper extends MapReduceBase implements 66 | Mapper { 67 | private static final Text KEY = new Text(); 68 | private static final Text VALUE = new Text(); 69 | 70 | public void configure(JobConf job) {} 71 | 72 | public void map(Writable key, ClueWeb12WarcRecord doc, OutputCollector output, 73 | Reporter reporter) throws IOException { 74 | reporter.incrCounter(Records.TOTAL, 1); 75 | 76 | String docid = doc.getHeaderMetadataItem("WARC-TREC-ID"); 77 | if (docid != null) { 78 | reporter.incrCounter(Records.PAGES, 1); 79 | try { 80 | KEY.set(docid); 81 | String cleaned = Jsoup.parse(doc.getContent()).text().replaceAll("[\\r\\n]+", " "); 82 | cleaned = JOINER.join(AnalyzerUtils.parse(ANALYZER, cleaned)); 83 | VALUE.set(cleaned); 84 | output.collect(KEY, VALUE); 85 | } catch (Exception e) { 86 | // If Jsoup throws any exceptions, catch and move on. 87 | reporter.incrCounter(Records.ERRORS, 1); 88 | } 89 | } 90 | } 91 | } 92 | 93 | public DumpWarcRecordsToPlainText() {} 94 | 95 | public static final String INPUT_OPTION = "input"; 96 | public static final String OUTPUT_OPTION = "output"; 97 | 98 | /** 99 | * Runs this tool. 
100 | */ 101 | @SuppressWarnings("static-access") 102 | public int run(String[] args) throws Exception { 103 | Options options = new Options(); 104 | 105 | options.addOption(OptionBuilder.withArgName("path").hasArg() 106 | .withDescription("input path").create(INPUT_OPTION)); 107 | options.addOption(OptionBuilder.withArgName("path").hasArg() 108 | .withDescription("output path").create(OUTPUT_OPTION)); 109 | 110 | CommandLine cmdline; 111 | CommandLineParser parser = new GnuParser(); 112 | try { 113 | cmdline = parser.parse(options, args); 114 | } catch (ParseException exp) { 115 | HelpFormatter formatter = new HelpFormatter(); 116 | formatter.printHelp(this.getClass().getName(), options); 117 | ToolRunner.printGenericCommandUsage(System.out); 118 | System.err.println("Error parsing command line: " + exp.getMessage()); 119 | return -1; 120 | } 121 | 122 | if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION)) { 123 | HelpFormatter formatter = new HelpFormatter(); 124 | formatter.printHelp(this.getClass().getName(), options); 125 | ToolRunner.printGenericCommandUsage(System.out); 126 | return -1; 127 | } 128 | 129 | String input = cmdline.getOptionValue(INPUT_OPTION); 130 | String output = cmdline.getOptionValue(OUTPUT_OPTION); 131 | 132 | LOG.info("Tool name: " + DumpWarcRecordsToPlainText.class.getSimpleName()); 133 | LOG.info(" - input: " + input); 134 | LOG.info(" - output: " + output); 135 | 136 | JobConf conf = new JobConf(getConf(), DumpWarcRecordsToPlainText.class); 137 | conf.setJobName(DumpWarcRecordsToPlainText.class.getSimpleName() + ":" + input); 138 | 139 | conf.setNumReduceTasks(0); 140 | 141 | FileInputFormat.addInputPaths(conf, input); 142 | FileOutputFormat.setOutputPath(conf, new Path(output)); 143 | 144 | conf.setInputFormat(ClueWeb12InputFormat.class); 145 | conf.setOutputFormat(TextOutputFormat.class); 146 | conf.setMapperClass(MyMapper.class); 147 | 148 | RunningJob job = JobClient.runJob(conf); 149 | Counters counters = job.getCounters(); 150 | int numDocs = (int) counters.findCounter(Records.PAGES).getCounter(); 151 | 152 | LOG.info("Read " + numDocs + " docs."); 153 | 154 | return 0; 155 | } 156 | 157 | /** 158 | * Dispatches command-line arguments to the tool via the ToolRunner. 159 | */ 160 | public static void main(String[] args) throws Exception { 161 | LOG.info("Running " + DumpWarcRecordsToPlainText.class.getCanonicalName() + " with args " 162 | + Arrays.toString(args)); 163 | ToolRunner.run(new DumpWarcRecordsToPlainText(), args); 164 | } 165 | } -------------------------------------------------------------------------------- /src/main/java/org/clueweb/clueweb12/app/DumpWarcRecordsToTermIds.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ClueWeb Tools: Hadoop tools for manipulating ClueWeb collections 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you 5 | * may not use this file except in compliance with the License. You may 6 | * obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing 14 | * permissions and limitations under the License. 
15 | */ 16 | 17 | package org.clueweb.clueweb12.app; 18 | 19 | import java.io.IOException; 20 | import java.util.Arrays; 21 | import java.util.List; 22 | 23 | import org.apache.commons.cli.CommandLine; 24 | import org.apache.commons.cli.CommandLineParser; 25 | import org.apache.commons.cli.GnuParser; 26 | import org.apache.commons.cli.HelpFormatter; 27 | import org.apache.commons.cli.OptionBuilder; 28 | import org.apache.commons.cli.Options; 29 | import org.apache.commons.cli.ParseException; 30 | import org.apache.hadoop.conf.Configured; 31 | import org.apache.hadoop.fs.FileSystem; 32 | import org.apache.hadoop.fs.Path; 33 | import org.apache.hadoop.io.LongWritable; 34 | import org.apache.hadoop.io.Text; 35 | import org.apache.hadoop.mapreduce.Job; 36 | import org.apache.hadoop.mapreduce.Mapper; 37 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 38 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 39 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 40 | import org.apache.hadoop.util.Tool; 41 | import org.apache.hadoop.util.ToolRunner; 42 | import org.apache.log4j.Logger; 43 | import org.apache.lucene.analysis.Analyzer; 44 | import org.apache.lucene.analysis.standard.StandardAnalyzer; 45 | import org.apache.lucene.util.Version; 46 | import org.clueweb.clueweb12.ClueWeb12WarcRecord; 47 | import org.clueweb.clueweb12.mapreduce.ClueWeb12InputFormat; 48 | import org.clueweb.dictionary.DefaultFrequencySortedDictionary; 49 | import org.jsoup.Jsoup; 50 | 51 | import tl.lin.lucene.AnalyzerUtils; 52 | 53 | public class DumpWarcRecordsToTermIds extends Configured implements Tool { 54 | private static final Logger LOG = Logger.getLogger(DumpWarcRecordsToTermIds.class); 55 | 56 | private static enum Records { TOTAL, PAGES, ERRORS, TOO_LONG }; 57 | 58 | private static final Analyzer ANALYZER = new StandardAnalyzer(Version.LUCENE_43); 59 | 60 | private static final int MAX_DOC_LENGTH = 512 * 1024; // Skip document if longer than this. 61 | 62 | private static class MyMapper extends Mapper<LongWritable, ClueWeb12WarcRecord, Text, Text> { 63 | private static final Text DOCID = new Text(); 64 | private static final Text DOC = new Text(); 65 | private static final Text EMPTY = new Text(); 66 | 67 | private DefaultFrequencySortedDictionary dictionary; 68 | 69 | @Override 70 | public void setup(Context context) throws IOException { 71 | FileSystem fs = FileSystem.get(context.getConfiguration()); 72 | String path = context.getConfiguration().get(DICTIONARY_OPTION); 73 | dictionary = new DefaultFrequencySortedDictionary(path, fs); 74 | } 75 | 76 | @Override 77 | public void map(LongWritable key, ClueWeb12WarcRecord doc, Context context) 78 | throws IOException, InterruptedException { 79 | 80 | context.getCounter(Records.TOTAL).increment(1); 81 | 82 | String docid = doc.getHeaderMetadataItem("WARC-TREC-ID"); 83 | if (docid != null) { 84 | DOCID.set(docid); 85 | 86 | context.getCounter(Records.PAGES).increment(1); 87 | try { 88 | String content = doc.getContent(); 89 | 90 | // If the document is excessively long, it usually means that something is wrong (e.g., a 91 | // binary object). Skip so the parsing doesn't choke.
92 | // As an alternative, we might want to consider putting in a timeout, e.g., 93 | // http://stackoverflow.com/questions/2275443/how-to-timeout-a-thread 94 | if ( content.length() > MAX_DOC_LENGTH ) { 95 | LOG.info("Skipping " + docid + " due to excessive length: " + content.length()); 96 | context.getCounter(Records.TOO_LONG).increment(1); 97 | context.write(DOCID, EMPTY); 98 | return; 99 | } 100 | 101 | String cleaned = Jsoup.parse(content).text(); 102 | List tokens = AnalyzerUtils.parse(ANALYZER, cleaned); 103 | 104 | int len = 0; 105 | int[] termids = new int[tokens.size()]; 106 | for (String token : tokens) { 107 | int id = dictionary.getId(token); 108 | if (id != -1) { 109 | termids[len] = id; 110 | len++; 111 | } 112 | } 113 | 114 | int[] copy = new int[len]; 115 | System.arraycopy(termids, 0, copy, 0, len); 116 | DOC.set(Arrays.toString(copy)); 117 | context.write(DOCID, DOC); 118 | } 119 | catch (Exception e) { 120 | // If Jsoup throws any exceptions, catch and move on, but emit empty doc. 121 | LOG.info("Error caught processing " + docid); 122 | context.getCounter(Records.ERRORS).increment(1); 123 | context.write(DOCID, EMPTY); 124 | } 125 | } 126 | } 127 | } 128 | 129 | public static final String INPUT_OPTION = "input"; 130 | public static final String OUTPUT_OPTION = "output"; 131 | public static final String DICTIONARY_OPTION = "dictionary"; 132 | public static final String REDUCERS_OPTION = "reducers"; 133 | 134 | /** 135 | * Runs this tool. 136 | */ 137 | @SuppressWarnings("static-access") 138 | public int run(String[] args) throws Exception { 139 | Options options = new Options(); 140 | 141 | options.addOption(OptionBuilder.withArgName("path").hasArg() 142 | .withDescription("input path").create(INPUT_OPTION)); 143 | options.addOption(OptionBuilder.withArgName("path").hasArg() 144 | .withDescription("output path").create(OUTPUT_OPTION)); 145 | options.addOption(OptionBuilder.withArgName("path").hasArg() 146 | .withDescription("dictionary").create(DICTIONARY_OPTION)); 147 | options.addOption(OptionBuilder.withArgName("num").hasArg() 148 | .withDescription("number of reducers").create(REDUCERS_OPTION)); 149 | 150 | CommandLine cmdline; 151 | CommandLineParser parser = new GnuParser(); 152 | try { 153 | cmdline = parser.parse(options, args); 154 | } catch (ParseException exp) { 155 | HelpFormatter formatter = new HelpFormatter(); 156 | formatter.printHelp(this.getClass().getName(), options); 157 | ToolRunner.printGenericCommandUsage(System.out); 158 | System.err.println("Error parsing command line: " + exp.getMessage()); 159 | return -1; 160 | } 161 | 162 | if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION) || 163 | !cmdline.hasOption(DICTIONARY_OPTION)) { 164 | HelpFormatter formatter = new HelpFormatter(); 165 | formatter.printHelp(this.getClass().getName(), options); 166 | ToolRunner.printGenericCommandUsage(System.out); 167 | return -1; 168 | } 169 | 170 | String input = cmdline.getOptionValue(INPUT_OPTION); 171 | String output = cmdline.getOptionValue(OUTPUT_OPTION); 172 | String dictionary = cmdline.getOptionValue(DICTIONARY_OPTION); 173 | 174 | Job job = new Job(getConf(), DumpWarcRecordsToTermIds.class.getSimpleName() + ":" + input); 175 | job.setJarByClass(DumpWarcRecordsToTermIds.class); 176 | 177 | LOG.info("Tool name: " + DumpWarcRecordsToTermIds.class.getSimpleName()); 178 | LOG.info(" - input: " + input); 179 | LOG.info(" - output: " + output); 180 | LOG.info(" - dictionary: " + dictionary); 181 | 182 | if 
(cmdline.hasOption(REDUCERS_OPTION)) { 183 | int numReducers = Integer.parseInt(cmdline.getOptionValue(REDUCERS_OPTION)); 184 | LOG.info(" - reducers: " + numReducers); 185 | job.setNumReduceTasks(numReducers); 186 | } else { 187 | job.setNumReduceTasks(0); 188 | } 189 | 190 | FileInputFormat.setInputPaths(job, input); 191 | FileOutputFormat.setOutputPath(job, new Path(output)); 192 | 193 | job.getConfiguration().set(DICTIONARY_OPTION, dictionary); 194 | 195 | job.setInputFormatClass(ClueWeb12InputFormat.class); 196 | job.setOutputFormatClass(TextOutputFormat.class); 197 | 198 | job.setMapOutputKeyClass(Text.class); 199 | job.setMapOutputValueClass(Text.class); 200 | job.setOutputKeyClass(Text.class); 201 | job.setOutputValueClass(Text.class); 202 | 203 | job.setMapperClass(MyMapper.class); 204 | 205 | FileSystem.get(getConf()).delete(new Path(output), true); 206 | 207 | long startTime = System.currentTimeMillis(); 208 | job.waitForCompletion(true); 209 | LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); 210 | 211 | return 0; 212 | } 213 | 214 | /** 215 | * Dispatches command-line arguments to the tool via the ToolRunner. 216 | */ 217 | public static void main(String[] args) throws Exception { 218 | LOG.info("Running " + DumpWarcRecordsToTermIds.class.getCanonicalName() + " with args " 219 | + Arrays.toString(args)); 220 | ToolRunner.run(new DumpWarcRecordsToTermIds(), args); 221 | } 222 | } 223 | -------------------------------------------------------------------------------- /src/main/java/org/clueweb/clueweb12/app/LookupWarcTrecIdMapping.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ClueWeb Tools: Hadoop tools for manipulating ClueWeb collections 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you 5 | * may not use this file except in compliance with the License. You may 6 | * obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing 14 | * permissions and limitations under the License. 
15 | */ 16 | 17 | package org.clueweb.clueweb12.app; 18 | 19 | import org.apache.commons.cli.CommandLine; 20 | import org.apache.commons.cli.CommandLineParser; 21 | import org.apache.commons.cli.GnuParser; 22 | import org.apache.commons.cli.HelpFormatter; 23 | import org.apache.commons.cli.OptionBuilder; 24 | import org.apache.commons.cli.Options; 25 | import org.apache.commons.cli.ParseException; 26 | import org.apache.hadoop.conf.Configured; 27 | import org.apache.hadoop.fs.Path; 28 | import org.apache.hadoop.util.Tool; 29 | import org.apache.hadoop.util.ToolRunner; 30 | import org.clueweb.data.WarcTrecIdMapping; 31 | 32 | public class LookupWarcTrecIdMapping extends Configured implements Tool { 33 | private static final String INDEX_OPTION = "index"; 34 | private static final String DOCID_OPTION = "docid"; 35 | private static final String DOCNO_OPTION = "docno"; 36 | 37 | @SuppressWarnings("static-access") 38 | public int run(String[] args) throws Exception { 39 | Options options = new Options(); 40 | options.addOption(OptionBuilder.withArgName("dir").hasArg() 41 | .withDescription("index location").create(INDEX_OPTION)); 42 | options.addOption(OptionBuilder.withArgName("id").hasArg() 43 | .withDescription("WARC-TREC-ID").create(DOCID_OPTION)); 44 | options.addOption(OptionBuilder.withArgName("num").hasArg() 45 | .withDescription("docno").create(DOCNO_OPTION)); 46 | 47 | CommandLine cmdline = null; 48 | CommandLineParser parser = new GnuParser(); 49 | try { 50 | cmdline = parser.parse(options, args); 51 | } catch (ParseException exp) { 52 | System.err.println("Error parsing command line: " + exp.getMessage()); 53 | System.exit(-1); 54 | } 55 | 56 | if (!cmdline.hasOption(INDEX_OPTION) || 57 | !(cmdline.hasOption(DOCID_OPTION) || cmdline.hasOption(DOCNO_OPTION))) { 58 | HelpFormatter formatter = new HelpFormatter(); 59 | formatter.printHelp(LookupWarcTrecIdMapping.class.getCanonicalName(), options); 60 | System.exit(-1); 61 | } 62 | 63 | String indexPath = cmdline.getOptionValue(INDEX_OPTION); 64 | 65 | WarcTrecIdMapping mapping = new WarcTrecIdMapping(new Path(indexPath), getConf()); 66 | if (cmdline.hasOption(DOCID_OPTION)) { 67 | System.out.println(mapping.getDocno(cmdline.getOptionValue(DOCID_OPTION))); 68 | } 69 | 70 | if (cmdline.hasOption(DOCNO_OPTION)) { 71 | System.out.println(mapping.getDocid(Integer.parseInt(cmdline.getOptionValue(DOCNO_OPTION)))); 72 | } 73 | 74 | return 0; 75 | } 76 | 77 | public static void main(String[] args) throws Exception { 78 | ToolRunner.run(new LookupWarcTrecIdMapping(), args); 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /src/main/java/org/clueweb/clueweb12/app/MergeTermStatistics.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ClueWeb Tools: Hadoop tools for manipulating ClueWeb collections 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you 5 | * may not use this file except in compliance with the License. You may 6 | * obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing 14 | * permissions and limitations under the License. 
15 | */ 16 | 17 | package org.clueweb.clueweb12.app; 18 | 19 | import java.io.IOException; 20 | import java.util.Arrays; 21 | 22 | import org.apache.commons.cli.CommandLine; 23 | import org.apache.commons.cli.CommandLineParser; 24 | import org.apache.commons.cli.GnuParser; 25 | import org.apache.commons.cli.HelpFormatter; 26 | import org.apache.commons.cli.OptionBuilder; 27 | import org.apache.commons.cli.Options; 28 | import org.apache.commons.cli.ParseException; 29 | import org.apache.hadoop.conf.Configured; 30 | import org.apache.hadoop.fs.FileSystem; 31 | import org.apache.hadoop.fs.Path; 32 | import org.apache.hadoop.io.Text; 33 | import org.apache.hadoop.mapreduce.Job; 34 | import org.apache.hadoop.mapreduce.Reducer; 35 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 36 | import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; 37 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 38 | import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; 39 | import org.apache.hadoop.util.Tool; 40 | import org.apache.hadoop.util.ToolRunner; 41 | import org.apache.log4j.Logger; 42 | 43 | import tl.lin.data.pair.PairOfIntLong; 44 | 45 | public class MergeTermStatistics extends Configured implements Tool { 46 | private static final Logger LOG = Logger.getLogger(MergeTermStatistics.class); 47 | 48 | private static final String HADOOP_DF_MIN_OPTION = "df.min"; 49 | private static final String HADOOP_DF_MAX_OPTION = "df.max"; 50 | 51 | private static final int MIN_DF_DEFAULT = 100; // Throw away terms with df less than this. 52 | 53 | private static class MyCombiner extends Reducer { 54 | private static final PairOfIntLong output = new PairOfIntLong(); 55 | 56 | @Override 57 | public void reduce(Text key, Iterable values, Context context) 58 | throws IOException, InterruptedException { 59 | int df = 0; 60 | long cf = 0; 61 | for (PairOfIntLong pair : values) { 62 | df += pair.getLeftElement(); 63 | cf += pair.getRightElement(); 64 | } 65 | 66 | output.set(df, cf); 67 | context.write(key, output); 68 | } 69 | } 70 | 71 | private static class MyReducer extends Reducer { 72 | private static final PairOfIntLong output = new PairOfIntLong(); 73 | private int dfMin, dfMax; 74 | 75 | @Override 76 | public void setup(Reducer.Context context) { 77 | dfMin = context.getConfiguration().getInt(HADOOP_DF_MIN_OPTION, MIN_DF_DEFAULT); 78 | dfMax = context.getConfiguration().getInt(HADOOP_DF_MAX_OPTION, Integer.MAX_VALUE); 79 | LOG.info("dfMin = " + dfMin); 80 | } 81 | 82 | @Override 83 | public void reduce(Text key, Iterable values, Context context) 84 | throws IOException, InterruptedException { 85 | int df = 0; 86 | long cf = 0; 87 | for (PairOfIntLong pair : values) { 88 | df += pair.getLeftElement(); 89 | cf += pair.getRightElement(); 90 | } 91 | if (df < dfMin || df > dfMax) { 92 | return; 93 | } 94 | output.set(df, cf); 95 | context.write(key, output); 96 | } 97 | } 98 | 99 | public static final String INPUT_OPTION = "input"; 100 | public static final String OUTPUT_OPTION = "output"; 101 | public static final String DF_MIN_OPTION = "dfMin"; 102 | 103 | /** 104 | * Runs this tool. 
105 | */ 106 | @SuppressWarnings("static-access") 107 | public int run(String[] args) throws Exception { 108 | Options options = new Options(); 109 | 110 | options.addOption(OptionBuilder.withArgName("path").hasArg() 111 | .withDescription("input path").create(INPUT_OPTION)); 112 | options.addOption(OptionBuilder.withArgName("path").hasArg() 113 | .withDescription("output path").create(OUTPUT_OPTION)); 114 | options.addOption(OptionBuilder.withArgName("num").hasArg() 115 | .withDescription("minimum df").create(DF_MIN_OPTION)); 116 | 117 | CommandLine cmdline; 118 | CommandLineParser parser = new GnuParser(); 119 | try { 120 | cmdline = parser.parse(options, args); 121 | } catch (ParseException exp) { 122 | HelpFormatter formatter = new HelpFormatter(); 123 | formatter.printHelp(this.getClass().getName(), options); 124 | ToolRunner.printGenericCommandUsage(System.out); 125 | System.err.println("Error parsing command line: " + exp.getMessage()); 126 | return -1; 127 | } 128 | 129 | if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION)) { 130 | HelpFormatter formatter = new HelpFormatter(); 131 | formatter.printHelp(this.getClass().getName(), options); 132 | ToolRunner.printGenericCommandUsage(System.out); 133 | return -1; 134 | } 135 | 136 | String input = cmdline.getOptionValue(INPUT_OPTION); 137 | String output = cmdline.getOptionValue(OUTPUT_OPTION); 138 | 139 | LOG.info("Tool name: " + MergeTermStatistics.class.getSimpleName()); 140 | LOG.info(" - input: " + input); 141 | LOG.info(" - output: " + output); 142 | 143 | Job job = new Job(getConf(), MergeTermStatistics.class.getSimpleName() + ":" + input); 144 | job.setJarByClass(MergeTermStatistics.class); 145 | 146 | job.setNumReduceTasks(100); 147 | 148 | if (cmdline.hasOption(DF_MIN_OPTION)) { 149 | int dfMin = Integer.parseInt(cmdline.getOptionValue(DF_MIN_OPTION)); 150 | LOG.info(" - dfMin: " + dfMin); 151 | job.getConfiguration().setInt(HADOOP_DF_MIN_OPTION, dfMin); 152 | } 153 | 154 | FileInputFormat.setInputPaths(job, input); 155 | FileOutputFormat.setOutputPath(job, new Path(output)); 156 | 157 | job.setInputFormatClass(SequenceFileInputFormat.class); 158 | job.setOutputFormatClass(SequenceFileOutputFormat.class); 159 | 160 | job.setMapOutputKeyClass(Text.class); 161 | job.setMapOutputValueClass(PairOfIntLong.class); 162 | job.setOutputKeyClass(Text.class); 163 | job.setOutputValueClass(PairOfIntLong.class); 164 | 165 | job.setCombinerClass(MyCombiner.class); 166 | job.setReducerClass(MyReducer.class); 167 | 168 | FileSystem.get(getConf()).delete(new Path(output), true); 169 | 170 | long startTime = System.currentTimeMillis(); 171 | job.waitForCompletion(true); 172 | LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); 173 | 174 | return 0; 175 | } 176 | 177 | /** 178 | * Dispatches command-line arguments to the tool via the ToolRunner. 
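 *
 * A hypothetical invocation (the jar name and HDFS paths below are placeholders,
 * not taken from this project):
 *
 *   hadoop jar clueweb-tools.jar org.clueweb.clueweb12.app.MergeTermStatistics \
 *     -input /data/term-stats/segment-* -output /data/term-stats-merged -dfMin 100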
179 | */ 180 | public static void main(String[] args) throws Exception { 181 | LOG.info("Running " + MergeTermStatistics.class.getCanonicalName() + " with args " 182 | + Arrays.toString(args)); 183 | ToolRunner.run(new MergeTermStatistics(), args); 184 | } 185 | } 186 | -------------------------------------------------------------------------------- /src/main/java/org/clueweb/clueweb12/app/ProcessPForDocVectors.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ClueWeb Tools: Hadoop tools for manipulating ClueWeb collections 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you 5 | * may not use this file except in compliance with the License. You may 6 | * obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing 14 | * permissions and limitations under the License. 15 | */ 16 | 17 | package org.clueweb.clueweb12.app; 18 | 19 | import java.io.IOException; 20 | import java.util.Arrays; 21 | import java.util.List; 22 | 23 | import org.apache.commons.cli.CommandLine; 24 | import org.apache.commons.cli.CommandLineParser; 25 | import org.apache.commons.cli.GnuParser; 26 | import org.apache.commons.cli.HelpFormatter; 27 | import org.apache.commons.cli.OptionBuilder; 28 | import org.apache.commons.cli.Options; 29 | import org.apache.commons.cli.ParseException; 30 | import org.apache.hadoop.conf.Configured; 31 | import org.apache.hadoop.fs.FileSystem; 32 | import org.apache.hadoop.fs.Path; 33 | import org.apache.hadoop.io.Text; 34 | import org.apache.hadoop.mapreduce.Job; 35 | import org.apache.hadoop.mapreduce.Mapper; 36 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 37 | import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; 38 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 39 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 40 | import org.apache.hadoop.util.Tool; 41 | import org.apache.hadoop.util.ToolRunner; 42 | import org.apache.log4j.Logger; 43 | import org.clueweb.data.PForDocVector; 44 | import org.clueweb.dictionary.DefaultFrequencySortedDictionary; 45 | 46 | import tl.lin.data.array.IntArrayWritable; 47 | 48 | import com.google.common.base.Joiner; 49 | import com.google.common.collect.Lists; 50 | 51 | public class ProcessPForDocVectors extends Configured implements Tool { 52 | private static final Logger LOG = Logger.getLogger(ProcessPForDocVectors.class); 53 | 54 | private static final Joiner JOINER = Joiner.on("|"); 55 | 56 | private static class MyMapper extends Mapper { 57 | private static final PForDocVector DOC = new PForDocVector(); 58 | 59 | private DefaultFrequencySortedDictionary dictionary; 60 | 61 | @Override 62 | public void setup(Context context) throws IOException { 63 | FileSystem fs = FileSystem.get(context.getConfiguration()); 64 | String path = context.getConfiguration().get(DICTIONARY_OPTION); 65 | dictionary = new DefaultFrequencySortedDictionary(path, fs); 66 | } 67 | 68 | @Override 69 | public void map(Text key, IntArrayWritable ints, Context context) 70 | throws IOException, InterruptedException { 71 | PForDocVector.fromIntArrayWritable(ints, DOC); 72 | 73 | List terms = Lists.newArrayList(); 74 | for 
(int termid : DOC.getTermIds()) { 75 | terms.add(dictionary.getTerm(termid)); 76 | } 77 | 78 | context.write(key, new Text(JOINER.join(terms))); 79 | } 80 | } 81 | 82 | public static final String INPUT_OPTION = "input"; 83 | public static final String OUTPUT_OPTION = "output"; 84 | public static final String DICTIONARY_OPTION = "dictionary"; 85 | 86 | /** 87 | * Runs this tool. 88 | */ 89 | @SuppressWarnings("static-access") 90 | public int run(String[] args) throws Exception { 91 | Options options = new Options(); 92 | 93 | options.addOption(OptionBuilder.withArgName("path").hasArg() 94 | .withDescription("input path").create(INPUT_OPTION)); 95 | options.addOption(OptionBuilder.withArgName("path").hasArg() 96 | .withDescription("output path").create(OUTPUT_OPTION)); 97 | options.addOption(OptionBuilder.withArgName("path").hasArg() 98 | .withDescription("dictionary").create(DICTIONARY_OPTION)); 99 | 100 | CommandLine cmdline; 101 | CommandLineParser parser = new GnuParser(); 102 | try { 103 | cmdline = parser.parse(options, args); 104 | } catch (ParseException exp) { 105 | HelpFormatter formatter = new HelpFormatter(); 106 | formatter.printHelp(this.getClass().getName(), options); 107 | ToolRunner.printGenericCommandUsage(System.out); 108 | System.err.println("Error parsing command line: " + exp.getMessage()); 109 | return -1; 110 | } 111 | 112 | if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION) || 113 | !cmdline.hasOption(DICTIONARY_OPTION)) { 114 | HelpFormatter formatter = new HelpFormatter(); 115 | formatter.printHelp(this.getClass().getName(), options); 116 | ToolRunner.printGenericCommandUsage(System.out); 117 | return -1; 118 | } 119 | 120 | String input = cmdline.getOptionValue(INPUT_OPTION); 121 | String output = cmdline.getOptionValue(OUTPUT_OPTION); 122 | String dictionary = cmdline.getOptionValue(DICTIONARY_OPTION); 123 | 124 | LOG.info("Tool name: " + ProcessPForDocVectors.class.getSimpleName()); 125 | LOG.info(" - input: " + input); 126 | LOG.info(" - output: " + output); 127 | LOG.info(" - dictionary: " + dictionary); 128 | 129 | Job job = new Job(getConf(), ProcessPForDocVectors.class.getSimpleName() + ":" + input); 130 | job.setJarByClass(ProcessPForDocVectors.class); 131 | 132 | job.setNumReduceTasks(0); 133 | 134 | FileInputFormat.setInputPaths(job, input); 135 | FileOutputFormat.setOutputPath(job, new Path(output)); 136 | 137 | job.getConfiguration().set(DICTIONARY_OPTION, dictionary); 138 | 139 | job.setInputFormatClass(SequenceFileInputFormat.class); 140 | job.setOutputFormatClass(TextOutputFormat.class); 141 | 142 | job.setMapOutputKeyClass(Text.class); 143 | job.setMapOutputValueClass(Text.class); 144 | 145 | job.setMapperClass(MyMapper.class); 146 | 147 | FileSystem.get(getConf()).delete(new Path(output), true); 148 | 149 | long startTime = System.currentTimeMillis(); 150 | job.waitForCompletion(true); 151 | LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); 152 | 153 | return 0; 154 | } 155 | 156 | /** 157 | * Dispatches command-line arguments to the tool via the ToolRunner. 
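 *
 * A hypothetical invocation (jar name and paths are placeholders): the tool reads
 * PFor-compressed document vectors from SequenceFiles, maps each termid back to
 * its term through the dictionary, and writes plain-text lines of the form
 * docid [tab] term1|term2|...
 *
 *   hadoop jar clueweb-tools.jar org.clueweb.clueweb12.app.ProcessPForDocVectors \
 *     -input /data/docvectors-pfor -output /data/docvectors-text -dictionary /data/dictionary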
158 | */ 159 | public static void main(String[] args) throws Exception { 160 | LOG.info("Running " + ProcessPForDocVectors.class.getCanonicalName() + " with args " 161 | + Arrays.toString(args)); 162 | ToolRunner.run(new ProcessPForDocVectors(), args); 163 | } 164 | } 165 | -------------------------------------------------------------------------------- /src/main/java/org/clueweb/clueweb12/app/ProcessVByteDocVectors.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ClueWeb Tools: Hadoop tools for manipulating ClueWeb collections 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you 5 | * may not use this file except in compliance with the License. You may 6 | * obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing 14 | * permissions and limitations under the License. 15 | */ 16 | 17 | package org.clueweb.clueweb12.app; 18 | 19 | import java.io.IOException; 20 | import java.util.Arrays; 21 | import java.util.List; 22 | 23 | import org.apache.commons.cli.CommandLine; 24 | import org.apache.commons.cli.CommandLineParser; 25 | import org.apache.commons.cli.GnuParser; 26 | import org.apache.commons.cli.HelpFormatter; 27 | import org.apache.commons.cli.OptionBuilder; 28 | import org.apache.commons.cli.Options; 29 | import org.apache.commons.cli.ParseException; 30 | import org.apache.hadoop.conf.Configured; 31 | import org.apache.hadoop.fs.FileSystem; 32 | import org.apache.hadoop.fs.Path; 33 | import org.apache.hadoop.io.BytesWritable; 34 | import org.apache.hadoop.io.Text; 35 | import org.apache.hadoop.mapreduce.Job; 36 | import org.apache.hadoop.mapreduce.Mapper; 37 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 38 | import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; 39 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 40 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 41 | import org.apache.hadoop.util.Tool; 42 | import org.apache.hadoop.util.ToolRunner; 43 | import org.apache.log4j.Logger; 44 | import org.clueweb.data.VByteDocVector; 45 | import org.clueweb.dictionary.DefaultFrequencySortedDictionary; 46 | 47 | import com.google.common.base.Joiner; 48 | import com.google.common.collect.Lists; 49 | 50 | public class ProcessVByteDocVectors extends Configured implements Tool { 51 | private static final Logger LOG = Logger.getLogger(ProcessVByteDocVectors.class); 52 | 53 | private static final Joiner JOINER = Joiner.on("|"); 54 | 55 | private static class MyMapper extends Mapper { 56 | private static final VByteDocVector DOC = new VByteDocVector(); 57 | 58 | private DefaultFrequencySortedDictionary dictionary; 59 | 60 | @Override 61 | public void setup(Context context) throws IOException { 62 | FileSystem fs = FileSystem.get(context.getConfiguration()); 63 | String path = context.getConfiguration().get(DICTIONARY_OPTION); 64 | dictionary = new DefaultFrequencySortedDictionary(path, fs); 65 | } 66 | 67 | @Override 68 | public void map(Text key, BytesWritable bytes, Context context) 69 | throws IOException, InterruptedException { 70 | VByteDocVector.fromBytesWritable(bytes, DOC); 71 | 72 | List terms = Lists.newArrayList(); 73 | for 
(int termid : DOC.getTermIds()) { 74 | terms.add(dictionary.getTerm(termid)); 75 | } 76 | 77 | context.write(key, new Text(JOINER.join(terms))); 78 | } 79 | } 80 | 81 | public static final String INPUT_OPTION = "input"; 82 | public static final String OUTPUT_OPTION = "output"; 83 | public static final String DICTIONARY_OPTION = "dictionary"; 84 | 85 | /** 86 | * Runs this tool. 87 | */ 88 | @SuppressWarnings("static-access") 89 | public int run(String[] args) throws Exception { 90 | Options options = new Options(); 91 | 92 | options.addOption(OptionBuilder.withArgName("path").hasArg() 93 | .withDescription("input path").create(INPUT_OPTION)); 94 | options.addOption(OptionBuilder.withArgName("path").hasArg() 95 | .withDescription("output path").create(OUTPUT_OPTION)); 96 | options.addOption(OptionBuilder.withArgName("path").hasArg() 97 | .withDescription("dictionary").create(DICTIONARY_OPTION)); 98 | 99 | CommandLine cmdline; 100 | CommandLineParser parser = new GnuParser(); 101 | try { 102 | cmdline = parser.parse(options, args); 103 | } catch (ParseException exp) { 104 | HelpFormatter formatter = new HelpFormatter(); 105 | formatter.printHelp(this.getClass().getName(), options); 106 | ToolRunner.printGenericCommandUsage(System.out); 107 | System.err.println("Error parsing command line: " + exp.getMessage()); 108 | return -1; 109 | } 110 | 111 | if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION) || 112 | !cmdline.hasOption(DICTIONARY_OPTION)) { 113 | HelpFormatter formatter = new HelpFormatter(); 114 | formatter.printHelp(this.getClass().getName(), options); 115 | ToolRunner.printGenericCommandUsage(System.out); 116 | return -1; 117 | } 118 | 119 | String input = cmdline.getOptionValue(INPUT_OPTION); 120 | String output = cmdline.getOptionValue(OUTPUT_OPTION); 121 | String dictionary = cmdline.getOptionValue(DICTIONARY_OPTION); 122 | 123 | LOG.info("Tool name: " + ProcessVByteDocVectors.class.getSimpleName()); 124 | LOG.info(" - input: " + input); 125 | LOG.info(" - output: " + output); 126 | LOG.info(" - dictionary: " + dictionary); 127 | 128 | Job job = new Job(getConf(), ProcessVByteDocVectors.class.getSimpleName() + ":" + input); 129 | job.setJarByClass(ProcessVByteDocVectors.class); 130 | 131 | job.setNumReduceTasks(0); 132 | 133 | FileInputFormat.setInputPaths(job, input); 134 | FileOutputFormat.setOutputPath(job, new Path(output)); 135 | 136 | job.getConfiguration().set(DICTIONARY_OPTION, dictionary); 137 | 138 | job.setInputFormatClass(SequenceFileInputFormat.class); 139 | job.setOutputFormatClass(TextOutputFormat.class); 140 | 141 | job.setMapOutputKeyClass(Text.class); 142 | job.setMapOutputValueClass(Text.class); 143 | 144 | job.setMapperClass(MyMapper.class); 145 | 146 | FileSystem.get(getConf()).delete(new Path(output), true); 147 | 148 | long startTime = System.currentTimeMillis(); 149 | job.waitForCompletion(true); 150 | LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); 151 | 152 | return 0; 153 | } 154 | 155 | /** 156 | * Dispatches command-line arguments to the tool via the ToolRunner. 
157 | */ 158 | public static void main(String[] args) throws Exception { 159 | LOG.info("Running " + ProcessVByteDocVectors.class.getCanonicalName() + " with args " 160 | + Arrays.toString(args)); 161 | ToolRunner.run(new ProcessVByteDocVectors(), args); 162 | } 163 | } 164 | -------------------------------------------------------------------------------- /src/main/java/org/clueweb/clueweb12/mapred/ClueWeb12InputFormat.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ClueWeb Tools: Hadoop tools for manipulating ClueWeb collections 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you 5 | * may not use this file except in compliance with the License. You may 6 | * obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing 14 | * permissions and limitations under the License. 15 | */ 16 | 17 | /* 18 | * Hadoop FileInputFormat for reading WARC files 19 | * 20 | * (C) 2009 - Carnegie Mellon University 21 | * 22 | * 1. Redistributions of this source code must retain the above copyright 23 | * notice, this list of conditions and the following disclaimer. 24 | * 2. The names "Lemur", "Indri", "University of Massachusetts", 25 | * "Carnegie Mellon", and "lemurproject" must not be used to 26 | * endorse or promote products derived from this software without 27 | * prior written permission. To obtain permission, contact 28 | * license@lemurproject.org. 29 | * 30 | * 4. Products derived from this software may not be called "Lemur" or "Indri" 31 | * nor may "Lemur" or "Indri" appear in their names without prior written 32 | * permission of The Lemur Project. To obtain permission, 33 | * contact license@lemurproject.org. 34 | * 35 | * THIS SOFTWARE IS PROVIDED BY THE LEMUR PROJECT AS PART OF THE CLUEWEB09 36 | * PROJECT AND OTHER CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED 37 | * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 38 | * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN 39 | * NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY 40 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 41 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 42 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 43 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 44 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING 45 | * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 46 | * POSSIBILITY OF SUCH DAMAGE. 47 | * 48 | * @author mhoy@cs.cmu.edu (Mark J. 
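 *
 * Usage mirrors ProcessPForDocVectors (same -input, -output, and -dictionary
 * options and the same plain-text output); the difference is that the input
 * SequenceFiles hold VByte-encoded BytesWritable vectors rather than
 * PFor-compressed IntArrayWritable vectors.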
Hoy) 49 | */ 50 | 51 | package org.clueweb.clueweb12.mapred; 52 | 53 | import java.io.DataInputStream; 54 | import java.io.IOException; 55 | 56 | import org.apache.hadoop.conf.Configuration; 57 | import org.apache.hadoop.fs.FileSystem; 58 | import org.apache.hadoop.fs.Path; 59 | import org.apache.hadoop.io.LongWritable; 60 | import org.apache.hadoop.io.compress.CompressionCodec; 61 | import org.apache.hadoop.io.compress.CompressionCodecFactory; 62 | import org.apache.hadoop.mapred.FileInputFormat; 63 | import org.apache.hadoop.mapred.FileSplit; 64 | import org.apache.hadoop.mapred.InputSplit; 65 | import org.apache.hadoop.mapred.JobConf; 66 | import org.apache.hadoop.mapred.RecordReader; 67 | import org.apache.hadoop.mapred.Reporter; 68 | import org.clueweb.clueweb12.ClueWeb12WarcRecord; 69 | 70 | public class ClueWeb12InputFormat extends FileInputFormat { 71 | 72 | /** 73 | * Don't allow the files to be split! 74 | */ 75 | @Override 76 | protected boolean isSplitable(FileSystem fs, Path filename) { 77 | // ensure the input files are not splittable! 78 | return false; 79 | } 80 | 81 | /** 82 | * Just return the record reader 83 | */ 84 | public RecordReader getRecordReader(InputSplit split, JobConf conf, 85 | Reporter reporter) throws IOException { 86 | return new ClueWarcRecordReader(conf, (FileSplit) split); 87 | } 88 | 89 | public static class ClueWarcRecordReader implements RecordReader { 90 | private long recordCount = 1; 91 | private Path path = null; 92 | private DataInputStream input = null; 93 | 94 | private long totalNumBytesRead = 0; 95 | 96 | public ClueWarcRecordReader(Configuration conf, FileSplit split) throws IOException { 97 | FileSystem fs = FileSystem.get(conf); 98 | path = split.getPath(); 99 | 100 | CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(conf); 101 | CompressionCodec compressionCodec = compressionCodecs.getCodec(path); 102 | input = new DataInputStream(compressionCodec.createInputStream(fs.open(path))); 103 | } 104 | 105 | @Override 106 | public boolean next(LongWritable key, ClueWeb12WarcRecord value) throws IOException { 107 | DataInputStream whichStream = input; 108 | 109 | ClueWeb12WarcRecord newRecord = ClueWeb12WarcRecord.readNextWarcRecord(whichStream); 110 | if (newRecord == null) { 111 | return false; 112 | } 113 | 114 | totalNumBytesRead += (long) newRecord.getTotalRecordLength(); 115 | newRecord.setWarcFilePath(path.toString()); 116 | 117 | value.set(newRecord); 118 | key.set(recordCount); 119 | 120 | recordCount++; 121 | return true; 122 | } 123 | 124 | @Override 125 | public LongWritable createKey() { 126 | return new LongWritable(); 127 | } 128 | 129 | @Override 130 | public ClueWeb12WarcRecord createValue() { 131 | return new ClueWeb12WarcRecord(); 132 | } 133 | 134 | @Override 135 | public long getPos() throws IOException { 136 | return totalNumBytesRead; 137 | } 138 | 139 | @Override 140 | public void close() throws IOException { 141 | input.close(); 142 | } 143 | 144 | @Override 145 | public float getProgress() throws IOException { 146 | return (float) recordCount / 40000f; 147 | } 148 | } 149 | } 150 | 151 | -------------------------------------------------------------------------------- /src/main/java/org/clueweb/clueweb12/mapreduce/ClueWeb12InputFormat.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ClueWeb Tools: Hadoop tools for manipulating ClueWeb collections 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you 5 | * may not use this 
file except in compliance with the License. You may 6 | * obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing 14 | * permissions and limitations under the License. 15 | */ 16 | 17 | package org.clueweb.clueweb12.mapreduce; 18 | 19 | import java.io.DataInputStream; 20 | import java.io.IOException; 21 | 22 | import org.apache.hadoop.conf.Configuration; 23 | import org.apache.hadoop.fs.FSDataInputStream; 24 | import org.apache.hadoop.fs.FileSystem; 25 | import org.apache.hadoop.fs.Path; 26 | import org.apache.hadoop.fs.Seekable; 27 | import org.apache.hadoop.io.LongWritable; 28 | import org.apache.hadoop.io.compress.CodecPool; 29 | import org.apache.hadoop.io.compress.CompressionCodec; 30 | import org.apache.hadoop.io.compress.CompressionCodecFactory; 31 | import org.apache.hadoop.io.compress.Decompressor; 32 | import org.apache.hadoop.mapreduce.InputSplit; 33 | import org.apache.hadoop.mapreduce.JobContext; 34 | import org.apache.hadoop.mapreduce.RecordReader; 35 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 36 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 37 | import org.apache.hadoop.mapreduce.lib.input.FileSplit; 38 | import org.clueweb.clueweb12.ClueWeb12WarcRecord; 39 | 40 | public class ClueWeb12InputFormat extends FileInputFormat { 41 | @Override 42 | public RecordReader createRecordReader(InputSplit split, 43 | TaskAttemptContext context) throws IOException, InterruptedException { 44 | return new ClueWarcRecordReader(); 45 | } 46 | 47 | @Override 48 | protected boolean isSplitable(JobContext context, Path filename) { 49 | return false; 50 | } 51 | 52 | public class ClueWarcRecordReader extends RecordReader { 53 | private CompressionCodecFactory compressionCodecs = null; 54 | private long start; 55 | private long pos; 56 | private long end; 57 | private LongWritable key = null; 58 | private ClueWeb12WarcRecord value = null; 59 | private Seekable filePosition; 60 | private CompressionCodec codec; 61 | private Decompressor decompressor; 62 | private DataInputStream in; 63 | 64 | public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException { 65 | FileSplit split = (FileSplit) genericSplit; 66 | Configuration job = context.getConfiguration(); 67 | start = split.getStart(); 68 | end = start + split.getLength(); 69 | final Path file = split.getPath(); 70 | compressionCodecs = new CompressionCodecFactory(job); 71 | codec = compressionCodecs.getCodec(file); 72 | 73 | // open the file and seek to the start of the split 74 | FileSystem fs = file.getFileSystem(job); 75 | FSDataInputStream fileIn = fs.open(split.getPath()); 76 | 77 | if (isCompressedInput()) { 78 | in = new DataInputStream(codec.createInputStream(fileIn, decompressor)); 79 | filePosition = fileIn; 80 | } else { 81 | fileIn.seek(start); 82 | in = fileIn; 83 | filePosition = fileIn; 84 | } 85 | 86 | this.pos = start; 87 | } 88 | 89 | private boolean isCompressedInput() { 90 | return (codec != null); 91 | } 92 | 93 | private long getFilePosition() throws IOException { 94 | long retVal; 95 | if (isCompressedInput() && null != filePosition) { 96 | retVal = filePosition.getPos(); 97 | } else { 98 | retVal = pos; 99 | } 100 | return retVal; 101 | } 102 
| 103 | public boolean nextKeyValue() throws IOException { 104 | if (key == null) { 105 | key = new LongWritable(); 106 | } 107 | key.set(pos); 108 | 109 | value = ClueWeb12WarcRecord.readNextWarcRecord(in); 110 | if (value == null) { 111 | return false; 112 | } 113 | return true; 114 | } 115 | 116 | @Override 117 | public LongWritable getCurrentKey() { 118 | return key; 119 | } 120 | 121 | @Override 122 | public ClueWeb12WarcRecord getCurrentValue() { 123 | return value; 124 | } 125 | 126 | /** 127 | * Get the progress within the split 128 | */ 129 | public float getProgress() throws IOException { 130 | if (start == end) { 131 | return 0.0f; 132 | } else { 133 | return Math.min(1.0f, (getFilePosition() - start) / (float) (end - start)); 134 | } 135 | } 136 | 137 | public synchronized void close() throws IOException { 138 | try { 139 | if (in != null) { 140 | in.close(); 141 | } 142 | } finally { 143 | if (decompressor != null) { 144 | CodecPool.returnDecompressor(decompressor); 145 | } 146 | } 147 | } 148 | } 149 | } 150 | -------------------------------------------------------------------------------- /src/main/java/org/clueweb/data/DocVector.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ClueWeb Tools: Hadoop tools for manipulating ClueWeb collections 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you 5 | * may not use this file except in compliance with the License. You may 6 | * obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing 14 | * permissions and limitations under the License. 15 | */ 16 | 17 | package org.clueweb.data; 18 | 19 | public interface DocVector { 20 | int[] getTermIds(); 21 | int getLength(); 22 | } 23 | -------------------------------------------------------------------------------- /src/main/java/org/clueweb/data/Indexable.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ClueWeb Tools: Hadoop tools for manipulating ClueWeb collections 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you 5 | * may not use this file except in compliance with the License. You may 6 | * obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing 14 | * permissions and limitations under the License. 15 | */ 16 | 17 | package org.clueweb.data; 18 | 19 | import org.apache.hadoop.io.Writable; 20 | 21 | /** 22 | * A document that can be indexed. 23 | */ 24 | public abstract class Indexable implements Writable { 25 | 26 | /** 27 | * Returns the globally-unique String identifier of the document within the collection. 28 | * 29 | * @return docid of the document 30 | */ 31 | public abstract String getDocid(); 32 | 33 | /** 34 | * Returns the content of the document. 
35 | * 36 | * @return content of the document 37 | */ 38 | public abstract String getContent(); 39 | 40 | /** 41 | * Returns the content of the document for display to a human. 42 | * 43 | * @return displayable content 44 | */ 45 | public String getDisplayContent() { 46 | return getContent(); 47 | } 48 | 49 | /** 50 | * Returns the type of the display content, per IANA MIME Media Type (e.g., "text/html"). 51 | * See {@code http://www.iana.org/assignments/media-types/index.html} 52 | * 53 | * @return IANA MIME Media Type 54 | */ 55 | public String getDisplayContentType() { 56 | return "text/plain"; 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/main/java/org/clueweb/data/PForDocVector.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ClueWeb Tools: Hadoop tools for manipulating ClueWeb collections 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you 5 | * may not use this file except in compliance with the License. You may 6 | * obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing 14 | * permissions and limitations under the License. 15 | */ 16 | 17 | package org.clueweb.data; 18 | 19 | import me.lemire.integercompression.FastPFOR; 20 | import me.lemire.integercompression.IntWrapper; 21 | import me.lemire.integercompression.VariableByte; 22 | import tl.lin.data.array.IntArrayWritable; 23 | 24 | public class PForDocVector { 25 | private static final FastPFOR P4 = new FastPFOR(); 26 | private static final VariableByte VB = new VariableByte(); 27 | 28 | private int[] termids; 29 | 30 | public PForDocVector() {} 31 | 32 | public int[] getTermIds() { 33 | return termids; 34 | } 35 | 36 | public int getLength() { 37 | return termids.length; 38 | } 39 | 40 | public static void fromIntArrayWritable(IntArrayWritable in, PForDocVector doc) { 41 | try { 42 | int[] compressed = in.getArray(); 43 | IntWrapper inPos = new IntWrapper(1); 44 | IntWrapper outPos = new IntWrapper(0); 45 | doc.termids = new int[compressed[0]]; 46 | 47 | if (doc.termids.length == 0) { 48 | return; 49 | } 50 | 51 | if (doc.termids.length < 128) { 52 | VB.uncompress(compressed, inPos, in.size()-1, doc.termids, outPos); 53 | return; 54 | } 55 | 56 | // For this, the zero doesn't matter. 57 | P4.uncompress(compressed, inPos, 0, doc.termids, outPos); 58 | 59 | if (doc.termids.length % 128 == 0) { 60 | return; 61 | } 62 | 63 | // Decode whatever is left over. 64 | VB.uncompress(compressed, inPos, in.size() - inPos.get(), doc.termids, outPos); 65 | } catch (Exception e) { 66 | e.printStackTrace(); 67 | doc.termids = new int[0]; 68 | } 69 | } 70 | 71 | public static void toIntArrayWritable(IntArrayWritable ints, int[] termids, int length) { 72 | // Remember, the number of terms to serialize is length; the array might be longer. 
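    // Layout produced below (and consumed by fromIntArrayWritable above):
    //   out[0]   holds the number of termids encoded;
    //   out[1..] holds VByte-compressed ids when length < 128, otherwise a
    //            FastPFOR block covering the largest multiple of 128 ids
    //            followed by a VByte-compressed remainder.
    //
    // Minimal round-trip sketch (illustrative; assumes only the methods of this
    // class plus IntArrayWritable's no-arg constructor):
    //
    //   IntArrayWritable ints = new IntArrayWritable();
    //   int[] ids = {3, 1, 4, 1, 5, 9, 2, 6};
    //   PForDocVector.toIntArrayWritable(ints, ids, ids.length);
    //   PForDocVector doc = new PForDocVector();
    //   PForDocVector.fromIntArrayWritable(ints, doc);
    //   // doc.getTermIds() now holds the original eight ids.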
73 | try { 74 | if (termids == null) { 75 | termids = new int[] {}; 76 | length = 0; 77 | } 78 | 79 | IntWrapper inPos = new IntWrapper(0); 80 | IntWrapper outPos = new IntWrapper(1); 81 | 82 | int[] out = new int[length + 1]; 83 | out[0] = length; 84 | 85 | if (length < 128) { 86 | VB.compress(termids, inPos, length, out, outPos); 87 | ints.setArray(out, outPos.get()); 88 | 89 | return; 90 | } 91 | 92 | P4.compress(termids, inPos, (length/128)*128, out, outPos); 93 | 94 | if (length % 128 == 0) { 95 | ints.setArray(out, outPos.get()); 96 | return; 97 | } 98 | 99 | VB.compress(termids, inPos, length % 128, out, outPos); 100 | ints.setArray(out, outPos.get()); 101 | } catch (Exception e) { 102 | e.printStackTrace(); 103 | ints.setArray(new int[] {}, 0); 104 | } 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /src/main/java/org/clueweb/data/TermStatistics.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ClueWeb Tools: Hadoop tools for manipulating ClueWeb collections 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you 5 | * may not use this file except in compliance with the License. You may 6 | * obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing 14 | * permissions and limitations under the License. 15 | */ 16 | 17 | package org.clueweb.data; 18 | 19 | import java.io.IOException; 20 | 21 | import org.apache.hadoop.conf.Configuration; 22 | import org.apache.hadoop.fs.FSDataInputStream; 23 | import org.apache.hadoop.fs.FileSystem; 24 | import org.apache.hadoop.fs.Path; 25 | import org.apache.hadoop.io.WritableUtils; 26 | import org.clueweb.clueweb12.app.BuildDictionary; 27 | 28 | import com.google.common.base.Preconditions; 29 | 30 | public class TermStatistics { 31 | private final int numTerms; 32 | private final long[] cfs; 33 | private final int[] dfs; 34 | 35 | private long collectionSize; 36 | 37 | private long maxCf = 0; 38 | private int maxCfTerm; 39 | 40 | private int maxDf = 0; 41 | private int maxDfTerm; 42 | 43 | /** 44 | * Creates a {@code CfTable} object. 45 | * 46 | * @param file collection frequency data file 47 | * @throws IOException 48 | */ 49 | public TermStatistics(Path file) throws IOException { 50 | this(file, FileSystem.get(new Configuration())); 51 | } 52 | 53 | /** 54 | * Creates a {@code CfTable} object. 
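 * Loads both the collection-frequency data ({@code BuildDictionary.CF_BY_ID_DATA})
 * and the document-frequency data ({@code BuildDictionary.DF_BY_ID_DATA}) found
 * under the given directory; the two files must contain the same number of
 * entries or an {@code IOException} is thrown.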
55 | * 56 | * @param file collection frequency data file 57 | * @param fs FileSystem to read from 58 | * @throws IOException 59 | */ 60 | public TermStatistics(Path file, FileSystem fs) throws IOException { 61 | Preconditions.checkNotNull(file); 62 | Preconditions.checkNotNull(fs); 63 | 64 | FSDataInputStream in = fs.open(new Path(file, BuildDictionary.CF_BY_ID_DATA)); 65 | this.numTerms = in.readInt(); 66 | 67 | cfs = new long[numTerms]; 68 | 69 | for (int i = 0; i < numTerms; i++) { 70 | long cf = WritableUtils.readVLong(in); 71 | 72 | cfs[i] = cf; 73 | collectionSize += cf; 74 | 75 | if (cf > maxCf) { 76 | maxCf = cf; 77 | maxCfTerm = i + 1; 78 | } 79 | } 80 | 81 | in.close(); 82 | 83 | in = fs.open(new Path(file, BuildDictionary.DF_BY_ID_DATA)); 84 | if (numTerms != in.readInt() ) { 85 | throw new IOException("df data and cf data should have the same number of entries!"); 86 | } 87 | 88 | dfs = new int[numTerms]; 89 | 90 | for (int i = 0; i < numTerms; i++) { 91 | int df = WritableUtils.readVInt(in); 92 | 93 | dfs[i] = df; 94 | 95 | if (df > maxDf) { 96 | maxDf = df; 97 | maxDfTerm = i + 1; 98 | } 99 | } 100 | 101 | in.close(); 102 | } 103 | 104 | public int getDf(int term) { 105 | if (term <= 0 || term > numTerms) { 106 | return 0; 107 | } 108 | return dfs[term - 1]; 109 | } 110 | 111 | public long getCf(int term) { 112 | if (term <= 0 || term > numTerms) { 113 | return 0; 114 | } 115 | 116 | return cfs[term - 1]; 117 | } 118 | 119 | public long getCollectionSize() { 120 | return collectionSize; 121 | } 122 | 123 | public int getVocabularySize() { 124 | return numTerms; 125 | } 126 | 127 | public int getMaxDf() { 128 | return maxDf; 129 | } 130 | 131 | public long getMaxCf() { 132 | return maxCf; 133 | } 134 | 135 | public int getMaxDfTerm() { 136 | return maxDfTerm; 137 | } 138 | 139 | public int getMaxCfTerm() { 140 | return maxCfTerm; 141 | } 142 | 143 | } 144 | -------------------------------------------------------------------------------- /src/main/java/org/clueweb/data/VByteDocVector.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ClueWeb Tools: Hadoop tools for manipulating ClueWeb collections 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you 5 | * may not use this file except in compliance with the License. You may 6 | * obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing 14 | * permissions and limitations under the License. 
15 | */ 16 | 17 | package org.clueweb.data; 18 | 19 | import java.io.ByteArrayInputStream; 20 | import java.io.ByteArrayOutputStream; 21 | import java.io.DataInputStream; 22 | import java.io.DataOutputStream; 23 | import java.io.IOException; 24 | 25 | import org.apache.hadoop.io.BytesWritable; 26 | import org.apache.hadoop.io.WritableUtils; 27 | 28 | public class VByteDocVector implements DocVector { 29 | private int[] termids; 30 | 31 | public VByteDocVector() {} 32 | 33 | public int[] getTermIds() { 34 | return termids; 35 | } 36 | 37 | public int getLength() { 38 | return termids.length; 39 | } 40 | 41 | public static void fromBytesWritable(BytesWritable bytes, VByteDocVector doc) { 42 | try { 43 | ByteArrayInputStream bytesIn = new ByteArrayInputStream(bytes.getBytes()); 44 | DataInputStream data = new DataInputStream(bytesIn); 45 | 46 | int length = WritableUtils.readVInt(data); 47 | doc.termids = new int[length]; 48 | for (int i = 0; i < length; i++) { 49 | doc.termids[i] = WritableUtils.readVInt(data); 50 | } 51 | } catch (IOException e) { 52 | doc.termids = new int[0]; 53 | } 54 | } 55 | 56 | public static void toBytesWritable(BytesWritable bytes, int[] termids, int length) { 57 | try { 58 | if (termids == null) { 59 | termids = new int[] {}; 60 | length = 0; 61 | } 62 | 63 | ByteArrayOutputStream bytesOut = new ByteArrayOutputStream(); 64 | DataOutputStream dataOut = new DataOutputStream(bytesOut); 65 | 66 | WritableUtils.writeVInt(dataOut, length); 67 | for (int i = 0; i < length; i++) { 68 | WritableUtils.writeVInt(dataOut, termids[i]); 69 | } 70 | 71 | byte[] raw = bytesOut.toByteArray(); 72 | bytes.set(raw, 0, raw.length); 73 | } catch (IOException e) { 74 | bytes.set(new byte[] {}, 0, 0); 75 | } 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /src/main/java/org/clueweb/data/WarcTrecIdMapping.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ClueWeb Tools: Hadoop tools for manipulating ClueWeb collections 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you 5 | * may not use this file except in compliance with the License. You may 6 | * obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing 14 | * permissions and limitations under the License. 
15 | */ 16 | 17 | package org.clueweb.data; 18 | 19 | import java.io.IOException; 20 | 21 | import org.apache.hadoop.conf.Configuration; 22 | import org.apache.hadoop.fs.FileSystem; 23 | import org.apache.hadoop.fs.Path; 24 | import org.apache.log4j.Logger; 25 | import org.apache.lucene.document.Document; 26 | import org.apache.lucene.index.DirectoryReader; 27 | import org.apache.lucene.index.IndexReader; 28 | import org.apache.lucene.index.Term; 29 | import org.apache.lucene.search.IndexSearcher; 30 | import org.apache.lucene.search.Query; 31 | import org.apache.lucene.search.TermQuery; 32 | import org.apache.lucene.search.TopDocs; 33 | import org.apache.lucene.store.Directory; 34 | 35 | import tl.lin.lucene.FileSystemDirectory; 36 | 37 | public class WarcTrecIdMapping { 38 | private static final Logger LOG = Logger.getLogger(WarcTrecIdMapping.class); 39 | 40 | public static enum IndexField { 41 | WARC_TREC_ID("WARC-TREC-ID"); 42 | 43 | public final String name; 44 | 45 | IndexField(String s) { 46 | name = s; 47 | } 48 | }; 49 | 50 | private IndexReader reader; 51 | private IndexSearcher searcher; 52 | 53 | public WarcTrecIdMapping(Path indexLocation, Configuration conf) throws IOException { 54 | FileSystem fs = FileSystem.getLocal(conf); 55 | Directory directory = new FileSystemDirectory(fs, indexLocation, false, conf); 56 | 57 | LOG.info("Opening index " + indexLocation); 58 | reader = DirectoryReader.open(directory); 59 | searcher = new IndexSearcher(reader); 60 | } 61 | 62 | public int getDocno(String id) { 63 | Query query = new TermQuery(new Term(IndexField.WARC_TREC_ID.name, id)); 64 | 65 | TopDocs rs; 66 | try { 67 | rs = searcher.search(query, 1); 68 | if (rs.totalHits != 1) { 69 | return -1; 70 | } 71 | 72 | return rs.scoreDocs[0].doc; 73 | } catch (IOException e) { 74 | e.printStackTrace(); 75 | } 76 | 77 | return -1; 78 | } 79 | 80 | public String getDocid(int docno) { 81 | if (docno >= reader.maxDoc()) { 82 | return null; 83 | } 84 | try { 85 | Document d = reader.document(docno); 86 | if (d == null) { 87 | return null; 88 | } 89 | return d.getField(IndexField.WARC_TREC_ID.name).stringValue(); 90 | } catch (IOException e) { 91 | e.printStackTrace(); 92 | } 93 | return null; 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /src/main/java/org/clueweb/dictionary/DefaultFrequencySortedDictionary.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ClueWeb Tools: Hadoop tools for manipulating ClueWeb collections 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you 5 | * may not use this file except in compliance with the License. You may 6 | * obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing 14 | * permissions and limitations under the License. 
15 | */ 16 | 17 | package org.clueweb.dictionary; 18 | 19 | import java.io.BufferedReader; 20 | import java.io.IOException; 21 | import java.io.InputStreamReader; 22 | import java.io.PrintStream; 23 | import java.util.Iterator; 24 | 25 | import org.apache.hadoop.conf.Configuration; 26 | import org.apache.hadoop.fs.FSDataInputStream; 27 | import org.apache.hadoop.fs.FileSystem; 28 | import org.apache.hadoop.fs.Path; 29 | import org.clueweb.clueweb12.app.BuildDictionary; 30 | import org.clueweb.data.TermStatistics; 31 | 32 | /** 33 | * An implementation of {@link FrequencySortedDictionary}. Term ids start at 1, which corresponds to 34 | * the most frequent term. Term id 2 is the second most frequent term, etc. 35 | * 36 | * @author Jimmy Lin 37 | */ 38 | public class DefaultFrequencySortedDictionary implements FrequencySortedDictionary { 39 | private FrontCodedDictionary dictionary = new FrontCodedDictionary(); 40 | private int[] ids; 41 | private int[] idsToTerm; 42 | 43 | /** 44 | * Constructs an instance of this dictionary from serialized data files. 45 | */ 46 | public DefaultFrequencySortedDictionary(String basePath, FileSystem fs) throws IOException { 47 | FSDataInputStream in; 48 | 49 | in = fs.open(new Path(basePath, BuildDictionary.TERMS_DATA)); 50 | dictionary.readFields(in); 51 | in.close(); 52 | 53 | int l = 0; 54 | 55 | in = fs.open(new Path(basePath, BuildDictionary.TERMS_ID_DATA)); 56 | l = in.readInt(); 57 | ids = new int[l]; 58 | for (int i = 0; i < l; i++) { 59 | ids[i] = in.readInt(); 60 | } 61 | in.close(); 62 | 63 | in = fs.open(new Path(basePath, BuildDictionary.TERMS_ID_MAPPING_DATA)); 64 | l = in.readInt(); 65 | idsToTerm = new int[l]; 66 | for (int i = 0; i < l; i++) { 67 | idsToTerm[i] = in.readInt(); 68 | } 69 | in.close(); 70 | } 71 | 72 | @Override 73 | public int size() { 74 | return ids.length; 75 | } 76 | 77 | @Override 78 | public int getId(String term) { 79 | int index = dictionary.getId(term); 80 | 81 | if (index < 0) { 82 | return -1; 83 | } 84 | 85 | return ids[index]; 86 | } 87 | 88 | @Override 89 | public String getTerm(int id) { 90 | if (id > ids.length || id == 0 || idsToTerm == null) { 91 | return null; 92 | } 93 | String term = dictionary.getTerm(idsToTerm[id - 1]); 94 | 95 | return term; 96 | } 97 | 98 | /** 99 | * Returns an iterator over the dictionary in order of term id. 100 | */ 101 | @Override 102 | public Iterator iterator() { 103 | return new Iterator() { 104 | private int cur = 1; 105 | final private int end = dictionary.size(); 106 | 107 | @Override 108 | public boolean hasNext() { 109 | return cur < end + 1; 110 | } 111 | 112 | @Override 113 | public String next() { 114 | return getTerm(cur++); 115 | } 116 | 117 | @Override 118 | public void remove() { 119 | throw new UnsupportedOperationException(); 120 | } 121 | }; 122 | } 123 | 124 | /** 125 | * Simple demo program for looking up terms and term ids. 
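 *
 * A hypothetical session (the jar name and dictionary path are placeholders;
 * the "term" and "termid" commands follow the lookup loop below):
 *
 *   hadoop jar clueweb-tools.jar org.clueweb.dictionary.DefaultFrequencySortedDictionary /data/dictionary
 *   lookup > term retrieval
 *   lookup > termid 42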
126 | */ 127 | public static void main(String[] args) throws Exception { 128 | if (args.length != 1) { 129 | System.err.println("usage: [index-path]"); 130 | System.exit(-1); 131 | } 132 | 133 | String path = args[0]; 134 | 135 | PrintStream out = new PrintStream(System.out, true, "UTF-8"); 136 | 137 | Configuration conf = new Configuration(); 138 | FileSystem fs = FileSystem.get(conf); 139 | 140 | DefaultFrequencySortedDictionary dictionary = 141 | new DefaultFrequencySortedDictionary(path, fs); 142 | 143 | int nTerms = dictionary.size(); 144 | out.println("number of terms: " + nTerms); 145 | 146 | TermStatistics stats = new TermStatistics(new Path(path), fs); 147 | out.println("max df = " + stats.getMaxDf() + ", termid " + stats.getMaxDfTerm()); 148 | out.println("max cf = " + stats.getMaxCf() + ", termid " + stats.getMaxCfTerm()); 149 | out.println("collection size = " + stats.getCollectionSize()); 150 | out.println(""); 151 | 152 | out.println(" \"term word\" to lookup termid; \"termid 234\" to lookup term"); 153 | String cmd = null; 154 | BufferedReader stdin = new BufferedReader(new InputStreamReader(System.in)); 155 | out.print("lookup > "); 156 | while ((cmd = stdin.readLine()) != null) { 157 | 158 | String[] tokens = cmd.split("\\s+"); 159 | 160 | if (tokens.length != 2) { 161 | out.println("Error: unrecognized command!"); 162 | out.print("lookup > "); 163 | 164 | continue; 165 | } 166 | 167 | if (tokens[0].equals("termid")) { 168 | int termid; 169 | try { 170 | termid = Integer.parseInt(tokens[1]); 171 | } catch (Exception e) { 172 | out.println("Error: invalid termid!"); 173 | out.print("lookup > "); 174 | 175 | continue; 176 | } 177 | 178 | out.println("termid=" + termid + ", term=" + dictionary.getTerm(termid)); 179 | out.println(" df = " + stats.getDf(termid) + ", cf = " + stats.getCf(termid)); 180 | } else if (tokens[0].equals("term")) { 181 | String term = tokens[1]; 182 | 183 | out.println("term=" + term + ", termid=" + dictionary.getId(term)); 184 | out.println(" df = " + stats.getDf(dictionary.getId(term)) + 185 | ", cf = " + stats.getCf(dictionary.getId(term))); 186 | } else { 187 | out.println("Error: unrecognized command!"); 188 | out.print("lookup > "); 189 | continue; 190 | } 191 | 192 | out.print("lookup > "); 193 | } 194 | out.close(); 195 | } 196 | } 197 | -------------------------------------------------------------------------------- /src/main/java/org/clueweb/dictionary/Dictionary.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ClueWeb Tools: Hadoop tools for manipulating ClueWeb collections 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you 5 | * may not use this file except in compliance with the License. You may 6 | * obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing 14 | * permissions and limitations under the License. 15 | */ 16 | 17 | package org.clueweb.dictionary; 18 | 19 | /** 20 | * A dictionary provides a bidirectional mapping terms (Strings) and term ids (integers). The 21 | * semantics of the mapping is left unspecified, but the iteration order is always in 22 | * increasing term id. 
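 * For a frequency-sorted implementation such as DefaultFrequencySortedDictionary,
 * getTerm(1) returns the most frequent term in the collection, and looking that
 * term back up with getId should return 1.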
23 | * 24 | * @author Jimmy Lin 25 | */ 26 | public interface Dictionary extends Iterable { 27 | /** 28 | * Returns the term associated with this term id. 29 | * 30 | * @param id term id 31 | * @return term associated with this term id 32 | */ 33 | String getTerm(int id); 34 | 35 | /** 36 | * Returns the id associated with this term. 37 | * 38 | * @param term term 39 | * @return id associated with this term 40 | */ 41 | int getId(String term); 42 | 43 | /** 44 | * Returns the size of this dictionary. 45 | * 46 | * @return number of terms in this dictionary 47 | */ 48 | int size(); 49 | } 50 | -------------------------------------------------------------------------------- /src/main/java/org/clueweb/dictionary/DictionaryTransformationStrategy.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ClueWeb Tools: Hadoop tools for manipulating ClueWeb collections 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you 5 | * may not use this file except in compliance with the License. You may 6 | * obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing 14 | * permissions and limitations under the License. 15 | */ 16 | 17 | package org.clueweb.dictionary; 18 | 19 | import it.unimi.dsi.bits.TransformationStrategies; 20 | import it.unimi.dsi.bits.TransformationStrategy; 21 | 22 | import java.nio.charset.CharacterCodingException; 23 | 24 | import org.apache.hadoop.io.Text; 25 | import org.apache.hadoop.io.WritableUtils; 26 | 27 | public class DictionaryTransformationStrategy { 28 | public static TransformationStrategy getStrategy() { 29 | return TransformationStrategies.prefixFreeUtf16(); 30 | } 31 | 32 | public static class WritableComparator extends org.apache.hadoop.io.WritableComparator { 33 | private final TransformationStrategy strategy = 34 | DictionaryTransformationStrategy.getStrategy(); 35 | 36 | public WritableComparator() { 37 | super(Text.class); 38 | } 39 | 40 | public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) { 41 | int n1 = WritableUtils.decodeVIntSize(b1[s1]); 42 | int n2 = WritableUtils.decodeVIntSize(b2[s2]); 43 | 44 | String t1=null, t2=null; 45 | try { 46 | t1 = Text.decode(b1, s1+n1, l1-n1); 47 | t2 = Text.decode(b2, s2+n2, l2-n2); 48 | } catch (CharacterCodingException e) { 49 | throw new RuntimeException(e); 50 | } 51 | 52 | return strategy.toBitVector(t1).compareTo(strategy.toBitVector(t2)); 53 | } 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /src/main/java/org/clueweb/dictionary/FrequencySortedDictionary.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ClueWeb Tools: Hadoop tools for manipulating ClueWeb collections 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you 5 | * may not use this file except in compliance with the License. 
You may 6 | * obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing 14 | * permissions and limitations under the License. 15 | */ 16 | 17 | package org.clueweb.dictionary; 18 | 19 | /** 20 | * A frequency-sorted dictionary. That is, smaller term ids are assigned to more 21 | * frequently occurring terms. 22 | * 23 | * @author Jimmy Lin 24 | */ 25 | public interface FrequencySortedDictionary extends Dictionary {} 26 | -------------------------------------------------------------------------------- /src/main/java/org/clueweb/dictionary/FrontCodedDictionary.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ClueWeb Tools: Hadoop tools for manipulating ClueWeb collections 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you 5 | * may not use this file except in compliance with the License. You may 6 | * obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing 14 | * permissions and limitations under the License. 15 | */ 16 | 17 | package org.clueweb.dictionary; 18 | 19 | import it.unimi.dsi.util.FrontCodedStringList; 20 | import it.unimi.dsi.util.ShiftAddXorSignedStringMap; 21 | 22 | import java.io.BufferedReader; 23 | import java.io.ByteArrayInputStream; 24 | import java.io.DataInput; 25 | import java.io.DataOutput; 26 | import java.io.IOException; 27 | import java.io.InputStreamReader; 28 | import java.io.ObjectInputStream; 29 | import java.util.Iterator; 30 | 31 | import org.apache.hadoop.conf.Configuration; 32 | import org.apache.hadoop.fs.FileSystem; 33 | import org.apache.hadoop.fs.Path; 34 | import org.apache.hadoop.io.Writable; 35 | import org.apache.log4j.Logger; 36 | 37 | public class FrontCodedDictionary implements Writable, LexicographicallySortedDictionary { 38 | private static final Logger LOG = Logger.getLogger(FrontCodedDictionary.class); 39 | 40 | private FrontCodedStringList stringList; 41 | private ShiftAddXorSignedStringMap dictionary; 42 | 43 | public FrontCodedDictionary() {} 44 | 45 | @Override 46 | public int getId(String term) { 47 | return (int) dictionary.getLong(term); 48 | } 49 | 50 | @Override 51 | public String getTerm(int id) { 52 | return stringList.get(id).toString(); 53 | } 54 | 55 | @Override 56 | public int size() { 57 | return stringList.size(); 58 | } 59 | 60 | @Override 61 | public Iterator iterator() { 62 | return null; 63 | } 64 | 65 | @Override 66 | public void readFields(final DataInput in) throws IOException { 67 | byte[] bytes; 68 | ObjectInputStream obj; 69 | 70 | bytes = new byte[in.readInt()]; 71 | LOG.info("Loading front-coded list of terms: " + bytes.length + " bytes."); 72 | in.readFully(bytes); 73 | obj = new ObjectInputStream(new ByteArrayInputStream(bytes)); 74 | try { 75 | stringList = (FrontCodedStringList) obj.readObject(); 76 | } catch (ClassNotFoundException e) { 77 | throw new RuntimeException(e); 78 | } 79 | obj.close(); 
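    // The on-disk format read here is two length-prefixed, Java-serialized blobs:
    // the FrontCodedStringList just loaded (termid -> term), followed by the
    // ShiftAddXorSignedStringMap loaded next (term -> termid). The write() method
    // below is a no-op; the file is presumably produced by BuildDictionary.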
80 | 81 | bytes = new byte[in.readInt()]; 82 | LOG.info("Loading dictionary hash: " + bytes.length + " bytes."); 83 | in.readFully(bytes); 84 | obj = new ObjectInputStream(new ByteArrayInputStream(bytes)); 85 | try { 86 | dictionary = (ShiftAddXorSignedStringMap) obj.readObject(); 87 | } catch (ClassNotFoundException e) { 88 | throw new RuntimeException(e); 89 | } 90 | obj.close(); 91 | LOG.info("Finished loading."); 92 | } 93 | 94 | @Override 95 | public void write(DataOutput out) throws IOException { 96 | } 97 | 98 | /** 99 | * Simple demo program for looking up terms and term ids. 100 | */ 101 | public static void main(String[] args) throws Exception { 102 | if (args.length != 1) { 103 | System.out.println("usage: [index-path]"); 104 | System.exit(-1); 105 | } 106 | 107 | String indexPath = args[0]; 108 | 109 | Configuration conf = new Configuration(); 110 | FileSystem fs = FileSystem.get(conf); 111 | 112 | FrontCodedDictionary dictionary = new FrontCodedDictionary(); 113 | dictionary.readFields(fs.open(new Path(indexPath))); 114 | 115 | int nTerms = dictionary.size(); 116 | System.out.println("nTerms: " + nTerms); 117 | 118 | System.out.println(" \"term word\" to lookup termid; \"termid 234\" to lookup term"); 119 | String cmd = null; 120 | BufferedReader stdin = new BufferedReader(new InputStreamReader(System.in)); 121 | System.out.print("lookup > "); 122 | while ((cmd = stdin.readLine()) != null) { 123 | 124 | String[] tokens = cmd.split("\\s+"); 125 | 126 | if (tokens.length != 2) { 127 | System.out.println("Error: unrecognized command!"); 128 | System.out.print("lookup > "); 129 | 130 | continue; 131 | } 132 | 133 | if (tokens[0].equals("termid")) { 134 | int termid; 135 | try { 136 | termid = Integer.parseInt(tokens[1]); 137 | } catch (Exception e) { 138 | System.out.println("Error: invalid termid!"); 139 | System.out.print("lookup > "); 140 | 141 | continue; 142 | } 143 | 144 | System.out.println("termid=" + termid + ", term=" + dictionary.getTerm(termid)); 145 | } else if (tokens[0].equals("term")) { 146 | String term = tokens[1]; 147 | 148 | System.out.println("term=" + term + ", termid=" + dictionary.getId(term)); 149 | } else { 150 | System.out.println("Error: unrecognized command!"); 151 | System.out.print("lookup > "); 152 | continue; 153 | } 154 | 155 | System.out.print("lookup > "); 156 | } 157 | } 158 | } 159 | -------------------------------------------------------------------------------- /src/main/java/org/clueweb/dictionary/LexicographicallySortedDictionary.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ClueWeb Tools: Hadoop tools for manipulating ClueWeb collections 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you 5 | * may not use this file except in compliance with the License. You may 6 | * obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing 14 | * permissions and limitations under the License. 15 | */ 16 | 17 | package org.clueweb.dictionary; 18 | 19 | /** 20 | * A lexicographically-sorted dictionary. That is, smaller term ids correspond to terms 21 | * that are sorted lexicographically earlier. 
22 | * 23 | * @author Jimmy Lin 24 | */ 25 | public interface LexicographicallySortedDictionary extends Dictionary {} 26 | -------------------------------------------------------------------------------- /src/main/java/org/clueweb/dictionary/PorterAnalyzer.java: -------------------------------------------------------------------------------- 1 | package org.clueweb.dictionary; 2 | 3 | import java.io.IOException; 4 | import java.io.Reader; 5 | 6 | import org.apache.lucene.analysis.TokenStream; 7 | import org.apache.lucene.analysis.core.LowerCaseFilter; 8 | import org.apache.lucene.analysis.core.StopFilter; 9 | import org.apache.lucene.analysis.en.PorterStemFilter; 10 | import org.apache.lucene.analysis.standard.StandardFilter; 11 | import org.apache.lucene.analysis.standard.StandardTokenizer; 12 | import org.apache.lucene.analysis.util.CharArraySet; 13 | import org.apache.lucene.analysis.util.StopwordAnalyzerBase; 14 | import org.apache.lucene.util.Version; 15 | 16 | import com.google.common.collect.Lists; 17 | 18 | /** 19 | * Filters {@link StandardTokenizer} with {@link StandardFilter}, {@link LowerCaseFilter}, 20 | * {@link StopFilter}, and {@link PorterStemFilter}. 21 | */ 22 | public final class PorterAnalyzer extends StopwordAnalyzerBase { 23 | 24 | // Stopwords from Terrier v3.5. 25 | static final String[] STOPWORDS = { 26 | "a", 27 | "abaft", 28 | "abafter", 29 | "abaftest", 30 | "about", 31 | "abouter", 32 | "aboutest", 33 | "above", 34 | "abover", 35 | "abovest", 36 | "accordingly", 37 | "aer", 38 | "aest", 39 | "afore", 40 | "after", 41 | "afterer", 42 | "afterest", 43 | "afterward", 44 | "afterwards", 45 | "again", 46 | "against", 47 | "aid", 48 | "ain", 49 | "albeit", 50 | "all", 51 | "aller", 52 | "allest", 53 | "alls", 54 | "allyou", 55 | "almost", 56 | "along", 57 | "alongside", 58 | "already", 59 | "also", 60 | "although", 61 | "always", 62 | "amid", 63 | "amidst", 64 | "among", 65 | "amongst", 66 | "an", 67 | "and", 68 | "andor", 69 | "anear", 70 | "anent", 71 | "another", 72 | "any", 73 | "anybody", 74 | "anyhow", 75 | "anyone", 76 | "anything", 77 | "anywhere", 78 | "apart", 79 | "aparter", 80 | "apartest", 81 | "appear", 82 | "appeared", 83 | "appearing", 84 | "appears", 85 | "appropriate", 86 | "appropriated", 87 | "appropriater", 88 | "appropriates", 89 | "appropriatest", 90 | "appropriating", 91 | "are", 92 | "ares", 93 | "around", 94 | "as", 95 | "ases", 96 | "aside", 97 | "asides", 98 | "aslant", 99 | "astraddle", 100 | "astraddler", 101 | "astraddlest", 102 | "astride", 103 | "astrider", 104 | "astridest", 105 | "at", 106 | "athwart", 107 | "atop", 108 | "atween", 109 | "aught", 110 | "aughts", 111 | "available", 112 | "availabler", 113 | "availablest", 114 | "awfully", 115 | "b", 116 | "be", 117 | "became", 118 | "because", 119 | "become", 120 | "becomes", 121 | "becoming", 122 | "becominger", 123 | "becomingest", 124 | "becomings", 125 | "been", 126 | "before", 127 | "beforehand", 128 | "beforehander", 129 | "beforehandest", 130 | "behind", 131 | "behinds", 132 | "below", 133 | "beneath", 134 | "beside", 135 | "besides", 136 | "better", 137 | "bettered", 138 | "bettering", 139 | "betters", 140 | "between", 141 | "betwixt", 142 | "beyond", 143 | "bist", 144 | "both", 145 | "but", 146 | "buts", 147 | "by", 148 | "by-and-by", 149 | "byandby", 150 | "c", 151 | "cannot", 152 | "canst", 153 | "cant", 154 | "canted", 155 | "cantest", 156 | "canting", 157 | "cants", 158 | "cer", 159 | "certain", 160 | "certainer", 161 | "certainest", 162 | "cest", 163 | "chez", 164 | 
"circa", 165 | "co", 166 | "come-on", 167 | "come-ons", 168 | "comeon", 169 | "comeons", 170 | "concerning", 171 | "concerninger", 172 | "concerningest", 173 | "consequently", 174 | "considering", 175 | "could", 176 | "couldst", 177 | "cum", 178 | "d", 179 | "dday", 180 | "ddays", 181 | "describe", 182 | "described", 183 | "describes", 184 | "describing", 185 | "despite", 186 | "despited", 187 | "despites", 188 | "despiting", 189 | "did", 190 | "different", 191 | "differenter", 192 | "differentest", 193 | "do", 194 | "doe", 195 | "does", 196 | "doing", 197 | "doings", 198 | "done", 199 | "doner", 200 | "dones", 201 | "donest", 202 | "dos", 203 | "dost", 204 | "doth", 205 | "downs", 206 | "downward", 207 | "downwarder", 208 | "downwardest", 209 | "downwards", 210 | "during", 211 | "e", 212 | "each", 213 | "eg", 214 | "eight", 215 | "either", 216 | "else", 217 | "elsewhere", 218 | "enough", 219 | "ere", 220 | "et", 221 | "etc", 222 | "even", 223 | "evened", 224 | "evenest", 225 | "evens", 226 | "evenser", 227 | "evensest", 228 | "ever", 229 | "every", 230 | "everybody", 231 | "everyone", 232 | "everything", 233 | "everywhere", 234 | "ex", 235 | "except", 236 | "excepted", 237 | "excepting", 238 | "excepts", 239 | "exes", 240 | "f", 241 | "fact", 242 | "facts", 243 | "failing", 244 | "failings", 245 | "few", 246 | "fewer", 247 | "fewest", 248 | "figupon", 249 | "figuponed", 250 | "figuponing", 251 | "figupons", 252 | "five", 253 | "followthrough", 254 | "for", 255 | "forby", 256 | "forbye", 257 | "fore", 258 | "forer", 259 | "fores", 260 | "forever", 261 | "former", 262 | "formerer", 263 | "formerest", 264 | "formerly", 265 | "formers", 266 | "fornenst", 267 | "forwhy", 268 | "four", 269 | "fourscore", 270 | "frae", 271 | "from", 272 | "fs", 273 | "further", 274 | "furthered", 275 | "furtherer", 276 | "furtherest", 277 | "furthering", 278 | "furthermore", 279 | "furthers", 280 | "g", 281 | "get", 282 | "gets", 283 | "getting", 284 | "go", 285 | "gone", 286 | "good", 287 | "got", 288 | "gotta", 289 | "gotten", 290 | "h", 291 | "had", 292 | "hadst", 293 | "hae", 294 | "hardly", 295 | "has", 296 | "hast", 297 | "hath", 298 | "have", 299 | "haves", 300 | "having", 301 | "he", 302 | "hence", 303 | "her", 304 | "hereafter", 305 | "hereafters", 306 | "hereby", 307 | "herein", 308 | "hereupon", 309 | "hers", 310 | "herself", 311 | "him", 312 | "himself", 313 | "his", 314 | "hither", 315 | "hitherer", 316 | "hitherest", 317 | "hoo", 318 | "hoos", 319 | "how", 320 | "how-do-you-do", 321 | "howbeit", 322 | "howdoyoudo", 323 | "however", 324 | "huh", 325 | "humph", 326 | "i", 327 | "idem", 328 | "idemer", 329 | "idemest", 330 | "ie", 331 | "if", 332 | "ifs", 333 | "immediate", 334 | "immediately", 335 | "immediater", 336 | "immediatest", 337 | "in", 338 | "inasmuch", 339 | "inc", 340 | "indeed", 341 | "indicate", 342 | "indicated", 343 | "indicates", 344 | "indicating", 345 | "info", 346 | "information", 347 | "insofar", 348 | "instead", 349 | "into", 350 | "inward", 351 | "inwarder", 352 | "inwardest", 353 | "inwards", 354 | "is", 355 | "it", 356 | "its", 357 | "itself", 358 | "j", 359 | "k", 360 | "l", 361 | "latter", 362 | "latterer", 363 | "latterest", 364 | "latterly", 365 | "latters", 366 | "layabout", 367 | "layabouts", 368 | "less", 369 | "lest", 370 | "lot", 371 | "lots", 372 | "lotted", 373 | "lotting", 374 | "m", 375 | "main", 376 | "make", 377 | "many", 378 | "mauger", 379 | "maugre", 380 | "mayest", 381 | "me", 382 | "meanwhile", 383 | "meanwhiles", 384 | "midst", 385 | "midsts", 386 | 
"might", 387 | "mights", 388 | "more", 389 | "moreover", 390 | "most", 391 | "mostly", 392 | "much", 393 | "mucher", 394 | "muchest", 395 | "must", 396 | "musth", 397 | "musths", 398 | "musts", 399 | "my", 400 | "myself", 401 | "n", 402 | "natheless", 403 | "nathless", 404 | "neath", 405 | "neaths", 406 | "necessarier", 407 | "necessariest", 408 | "necessary", 409 | "neither", 410 | "nethe", 411 | "nethermost", 412 | "never", 413 | "nevertheless", 414 | "nigh", 415 | "nigher", 416 | "nighest", 417 | "nine", 418 | "no", 419 | "no-one", 420 | "nobodies", 421 | "nobody", 422 | "noes", 423 | "none", 424 | "noone", 425 | "nor", 426 | "nos", 427 | "not", 428 | "nothing", 429 | "nothings", 430 | "notwithstanding", 431 | "nowhere", 432 | "nowheres", 433 | "o", 434 | "of", 435 | "off", 436 | "offest", 437 | "offs", 438 | "often", 439 | "oftener", 440 | "oftenest", 441 | "oh", 442 | "on", 443 | "one", 444 | "oneself", 445 | "onest", 446 | "ons", 447 | "onto", 448 | "or", 449 | "orer", 450 | "orest", 451 | "other", 452 | "others", 453 | "otherwise", 454 | "otherwiser", 455 | "otherwisest", 456 | "ought", 457 | "oughts", 458 | "our", 459 | "ours", 460 | "ourself", 461 | "ourselves", 462 | "out", 463 | "outed", 464 | "outest", 465 | "outs", 466 | "outside", 467 | "outwith", 468 | "over", 469 | "overall", 470 | "overaller", 471 | "overallest", 472 | "overalls", 473 | "overs", 474 | "own", 475 | "owned", 476 | "owning", 477 | "owns", 478 | "owt", 479 | "p", 480 | "particular", 481 | "particularer", 482 | "particularest", 483 | "particularly", 484 | "particulars", 485 | "per", 486 | "perhaps", 487 | "plaintiff", 488 | "please", 489 | "pleased", 490 | "pleases", 491 | "plenties", 492 | "plenty", 493 | "pro", 494 | "probably", 495 | "provide", 496 | "provided", 497 | "provides", 498 | "providing", 499 | "q", 500 | "qua", 501 | "que", 502 | "quite", 503 | "r", 504 | "rath", 505 | "rathe", 506 | "rather", 507 | "rathest", 508 | "re", 509 | "really", 510 | "regarding", 511 | "relate", 512 | "related", 513 | "relatively", 514 | "res", 515 | "respecting", 516 | "respectively", 517 | "s", 518 | "said", 519 | "saider", 520 | "saidest", 521 | "same", 522 | "samer", 523 | "sames", 524 | "samest", 525 | "sans", 526 | "sanserif", 527 | "sanserifs", 528 | "sanses", 529 | "saved", 530 | "sayid", 531 | "sayyid", 532 | "seem", 533 | "seemed", 534 | "seeminger", 535 | "seemingest", 536 | "seemings", 537 | "seems", 538 | "send", 539 | "sent", 540 | "senza", 541 | "serious", 542 | "seriouser", 543 | "seriousest", 544 | "seven", 545 | "several", 546 | "severaler", 547 | "severalest", 548 | "shall", 549 | "shalled", 550 | "shalling", 551 | "shalls", 552 | "she", 553 | "should", 554 | "shoulded", 555 | "shoulding", 556 | "shoulds", 557 | "since", 558 | "sine", 559 | "sines", 560 | "sith", 561 | "six", 562 | "so", 563 | "sobeit", 564 | "soer", 565 | "soest", 566 | "some", 567 | "somebody", 568 | "somehow", 569 | "someone", 570 | "something", 571 | "sometime", 572 | "sometimer", 573 | "sometimes", 574 | "sometimest", 575 | "somewhat", 576 | "somewhere", 577 | "stop", 578 | "stopped", 579 | "such", 580 | "summat", 581 | "sup", 582 | "supped", 583 | "supping", 584 | "sups", 585 | "syn", 586 | "syne", 587 | "t", 588 | "ten", 589 | "than", 590 | "that", 591 | "the", 592 | "thee", 593 | "their", 594 | "theirs", 595 | "them", 596 | "themselves", 597 | "then", 598 | "thence", 599 | "thener", 600 | "thenest", 601 | "there", 602 | "thereafter", 603 | "thereby", 604 | "therefore", 605 | "therein", 606 | "therer", 607 | "therest", 608 | 
"thereupon", 609 | "these", 610 | "they", 611 | "thine", 612 | "thing", 613 | "things", 614 | "this", 615 | "thises", 616 | "thorough", 617 | "thorougher", 618 | "thoroughest", 619 | "thoroughly", 620 | "those", 621 | "thou", 622 | "though", 623 | "thous", 624 | "thouses", 625 | "three", 626 | "thro", 627 | "through", 628 | "througher", 629 | "throughest", 630 | "throughout", 631 | "thru", 632 | "thruer", 633 | "thruest", 634 | "thus", 635 | "thy", 636 | "thyself", 637 | "till", 638 | "tilled", 639 | "tilling", 640 | "tills", 641 | "to", 642 | "together", 643 | "too", 644 | "toward", 645 | "towarder", 646 | "towardest", 647 | "towards", 648 | "two", 649 | "u", 650 | "umpteen", 651 | "under", 652 | "underneath", 653 | "unless", 654 | "unlike", 655 | "unliker", 656 | "unlikest", 657 | "until", 658 | "unto", 659 | "up", 660 | "upon", 661 | "uponed", 662 | "uponing", 663 | "upons", 664 | "upped", 665 | "upping", 666 | "ups", 667 | "us", 668 | "use", 669 | "used", 670 | "usedest", 671 | "username", 672 | "usually", 673 | "v", 674 | "various", 675 | "variouser", 676 | "variousest", 677 | "verier", 678 | "veriest", 679 | "versus", 680 | "very", 681 | "via", 682 | "vis-a-vis", 683 | "vis-a-viser", 684 | "vis-a-visest", 685 | "viz", 686 | "vs", 687 | "w", 688 | "was", 689 | "wast", 690 | "we", 691 | "were", 692 | "wert", 693 | "what", 694 | "whatever", 695 | "whateverer", 696 | "whateverest", 697 | "whatsoever", 698 | "whatsoeverer", 699 | "whatsoeverest", 700 | "wheen", 701 | "when", 702 | "whenas", 703 | "whence", 704 | "whencesoever", 705 | "whenever", 706 | "whensoever", 707 | "where", 708 | "whereafter", 709 | "whereas", 710 | "whereby", 711 | "wherefrom", 712 | "wherein", 713 | "whereinto", 714 | "whereof", 715 | "whereon", 716 | "wheresoever", 717 | "whereto", 718 | "whereupon", 719 | "wherever", 720 | "wherewith", 721 | "wherewithal", 722 | "whether", 723 | "which", 724 | "whichever", 725 | "whichsoever", 726 | "while", 727 | "whiles", 728 | "whilst", 729 | "whither", 730 | "whithersoever", 731 | "whoever", 732 | "whomever", 733 | "whose", 734 | "whoso", 735 | "whosoever", 736 | "why", 737 | "with", 738 | "withal", 739 | "within", 740 | "without", 741 | "would", 742 | "woulded", 743 | "woulding", 744 | "woulds", 745 | "x", 746 | "y", 747 | "ye", 748 | "yet", 749 | "yon", 750 | "yond", 751 | "yonder", 752 | "you", 753 | "your", 754 | "yours", 755 | "yourself", 756 | "yourselves", 757 | "z", 758 | "zillion", 759 | }; 760 | 761 | /** Default maximum allowed token length */ 762 | public static final int DEFAULT_MAX_TOKEN_LENGTH = 255; 763 | 764 | private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH; 765 | 766 | public static final CharArraySet STOP_WORDS_SET = new CharArraySet(Version.LUCENE_43, 767 | Lists.newArrayList(STOPWORDS), true); 768 | 769 | public PorterAnalyzer() { 770 | super(Version.LUCENE_43, STOP_WORDS_SET); 771 | } 772 | 773 | /** 774 | * Set maximum allowed token length. If a token is seen that exceeds this length then it is 775 | * discarded. This setting only takes effect the next time tokenStream or tokenStream is called. 
776 | */ 777 | public void setMaxTokenLength(int length) { 778 | maxTokenLength = length; 779 | } 780 | 781 | public int getMaxTokenLength() { 782 | return maxTokenLength; 783 | } 784 | 785 | @Override 786 | protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) { 787 | final StandardTokenizer src = new StandardTokenizer(matchVersion, reader); 788 | src.setMaxTokenLength(maxTokenLength); 789 | TokenStream tok = new StandardFilter(matchVersion, src); 790 | tok = new LowerCaseFilter(matchVersion, tok); 791 | tok = new StopFilter(matchVersion, tok, stopwords); 792 | tok = new PorterStemFilter(tok); 793 | return new TokenStreamComponents(src, tok) { 794 | @Override 795 | protected void setReader(final Reader reader) throws IOException { 796 | src.setMaxTokenLength(PorterAnalyzer.this.maxTokenLength); 797 | super.setReader(reader); 798 | } 799 | }; 800 | } 801 | } 802 | -------------------------------------------------------------------------------- /src/main/java/org/clueweb/util/AnalyzerFactory.java: -------------------------------------------------------------------------------- 1 | package org.clueweb.util; 2 | 3 | import org.apache.lucene.analysis.Analyzer; 4 | import org.apache.lucene.util.Version; 5 | import org.clueweb.dictionary.PorterAnalyzer; 6 | 7 | public class AnalyzerFactory { 8 | 9 | public static Analyzer getAnalyzer(String analyzerType) { 10 | if (analyzerType.equals("standard")) { 11 | return new org.apache.lucene.analysis.standard.StandardAnalyzer(Version.LUCENE_43); 12 | } 13 | 14 | if (analyzerType.equals("porter")) { 15 | return new PorterAnalyzer(); 16 | } 17 | 18 | return null; 19 | } 20 | 21 | public static String getOptions() { 22 | return "standard|porter"; 23 | } 24 | } -------------------------------------------------------------------------------- /src/main/java/org/clueweb/util/QuickSort.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ClueWeb Tools: Hadoop tools for manipulating ClueWeb collections 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you 5 | * may not use this file except in compliance with the License. You may 6 | * obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing 14 | * permissions and limitations under the License. 
15 | */ 16 | 17 | package org.clueweb.util; 18 | 19 | public class QuickSort { 20 | 21 | // quicksort a[left] to a[right] 22 | public static void quicksort(short[] keys, int[] counts, int left, int right) { 23 | if (right <= left) return; 24 | int i = partition(keys, counts, left, right); 25 | quicksort(keys, counts, left, i-1); 26 | quicksort(keys, counts, i+1, right); 27 | } 28 | 29 | // quicksort a[left] to a[right] 30 | public static void quicksort(int[] keys, int[] counts, short[] counts2, int left, int right) { 31 | if (right <= left) return; 32 | int i = partition(keys, counts, counts2, left, right); 33 | quicksort(keys, counts, counts2, left, i-1); 34 | quicksort(keys, counts, counts2, i+1, right); 35 | } 36 | 37 | 38 | public static void quicksort(short[] keys, int[] counts, Object[] counts2, int left, int right) { 39 | if (right <= left) return; 40 | int i = partition(keys, counts, counts2, left, right); 41 | quicksort(keys, counts, counts2, left, i-1); 42 | quicksort(keys, counts, counts2, i+1, right); 43 | } 44 | 45 | public static void quicksortWithSecondary(int[] keys, int[] counts, short[] counts2, int left, int right) { 46 | if (right <= left) return; 47 | int i = partitionWithSecondary(keys, counts, counts2, left, right); 48 | quicksortWithSecondary(keys, counts, counts2, left, i-1); 49 | quicksortWithSecondary(keys, counts, counts2, i+1, right); 50 | } 51 | 52 | public static void quicksortWithSecondary(int[] keys, int[] counts, long[] counts2, int left, int right) { 53 | if (right <= left) return; 54 | int i = partitionWithSecondary(keys, counts, counts2, left, right); 55 | quicksortWithSecondary(keys, counts, counts2, left, i-1); 56 | quicksortWithSecondary(keys, counts, counts2, i+1, right); 57 | } 58 | 59 | public static void quicksort(int[] keys, int[] counts, int left, int right) { 60 | if (right <= left) return; 61 | int i = partition(keys, counts, left, right); 62 | quicksort(keys, counts, left, i-1); 63 | quicksort(keys, counts, i+1, right); 64 | } 65 | 66 | public static void quicksort(Object[] keys, int[] counts, int left, int right) { 67 | if (right <= left) return; 68 | int i = partition(keys, counts, left, right); 69 | quicksort(keys, counts, left, i-1); 70 | quicksort(keys, counts, i+1, right); 71 | } 72 | 73 | // partition a[left] to a[right], assumes left < right 74 | private static int partition(short[] keys, int[] counts, int left, int right) { 75 | int i = left - 1; 76 | int j = right; 77 | while (true) { 78 | while (counts[++i] < counts[right]) // find item on left to swap 79 | ; // a[right] acts as sentinel 80 | while (counts[right] < counts[--j]) // find item on right to swap 81 | if (j == left) break; // don't go out-of-bounds 82 | if (i >= j) break; // check if pointers cross 83 | int swap = counts[i]; 84 | counts[i] = counts[j]; 85 | counts[j] = swap; // swap two elements into place 86 | 87 | short tmp = keys[i]; 88 | keys[i] = keys[j]; 89 | keys[j] = tmp; 90 | } 91 | int swap = counts[i]; 92 | counts[i] = counts[right]; 93 | counts[right] = swap; 94 | short tmp = keys[i]; 95 | keys[i] = keys[right]; 96 | keys[right] = tmp; 97 | return i; 98 | } 99 | 100 | private static int partition(Object[] keys, int[] counts, int left, int right) { 101 | int i = left - 1; 102 | int j = right; 103 | while (true) { 104 | while (counts[++i] < counts[right]) // find item on left to swap 105 | ; // a[right] acts as sentinel 106 | while (counts[right] < counts[--j]) // find item on right to swap 107 | if (j == left) break; // don't go out-of-bounds 108 | if (i >= 
j) break; // check if pointers cross 109 | int swap = counts[i]; 110 | counts[i] = counts[j]; 111 | counts[j] = swap; // swap two elements into place 112 | 113 | Object tmp = keys[i]; 114 | keys[i] = keys[j]; 115 | keys[j] = tmp; 116 | } 117 | int swap = counts[i]; 118 | counts[i] = counts[right]; 119 | counts[right] = swap; 120 | Object tmp = keys[i]; 121 | keys[i] = keys[right]; 122 | keys[right] = tmp; 123 | return i; 124 | } 125 | 126 | private static int partition(int[] keys, int[] counts, short[] counts2, int left, int right) { 127 | int i = left - 1; 128 | int j = right; 129 | while (true) { 130 | while (counts[++i] < counts[right]) // find item on left to swap 131 | ; // a[right] acts as sentinel 132 | while (counts[right] < counts[--j]) // find item on right to swap 133 | if (j == left) break; // don't go out-of-bounds 134 | if (i >= j) break; // check if pointers cross 135 | int swap = counts[i]; 136 | counts[i] = counts[j]; 137 | counts[j] = swap; // swap two elements into place 138 | 139 | int tmp = keys[i]; 140 | keys[i] = keys[j]; 141 | keys[j] = tmp; 142 | 143 | short tmp2 = counts2[i]; 144 | counts2[i] = counts2[j]; 145 | counts2[j] = tmp2; 146 | 147 | } 148 | int swap = counts[i]; 149 | counts[i] = counts[right]; 150 | counts[right] = swap; 151 | 152 | int tmp = keys[i]; 153 | keys[i] = keys[right]; 154 | keys[right] = tmp; 155 | 156 | short tmp2 = counts2[i]; 157 | counts2[i] = counts2[right]; 158 | counts2[right] = tmp2; 159 | return i; 160 | } 161 | 162 | private static int partition(short[] keys, int[] counts, Object[] counts2, int left, int right) { 163 | int i = left - 1; 164 | int j = right; 165 | while (true) { 166 | while (counts[++i] < counts[right]) // find item on left to swap 167 | ; // a[right] acts as sentinel 168 | while (counts[right] < counts[--j]) // find item on right to swap 169 | if (j == left) break; // don't go out-of-bounds 170 | if (i >= j) break; // check if pointers cross 171 | int swap = counts[i]; 172 | counts[i] = counts[j]; 173 | counts[j] = swap; // swap two elements into place 174 | 175 | short tmp = keys[i]; 176 | keys[i] = keys[j]; 177 | keys[j] = tmp; 178 | 179 | Object tmp2 = counts2[i]; 180 | counts2[i] = counts2[j]; 181 | counts2[j] = tmp2; 182 | 183 | } 184 | int swap = counts[i]; 185 | counts[i] = counts[right]; 186 | counts[right] = swap; 187 | 188 | short tmp = keys[i]; 189 | keys[i] = keys[right]; 190 | keys[right] = tmp; 191 | 192 | Object tmp2 = counts2[i]; 193 | counts2[i] = counts2[right]; 194 | counts2[right] = tmp2; 195 | return i; 196 | } 197 | 198 | private static int partitionWithSecondary(int[] keys, int[] counts, short[] counts2, int left, int right) { 199 | int i = left - 1; 200 | int j = right; 201 | while (true) { 202 | do{ 203 | i++; 204 | }while (counts[i] < counts[right] || (counts[i] == counts[right] && keys[i] < keys[right])) // find item on left to swap 205 | ; // a[right] acts as sentinel 206 | 207 | //while (counts[++i] < counts[right]) // find item on left to swap 208 | // ; // a[right] acts as sentinel 209 | do{ 210 | j--; 211 | }while (j!=left && (counts[right] < counts[j] || (counts[right] == counts[j] && keys[right] < keys[j]))) // find item on right to swap 212 | ; 213 | //while (counts[right] < counts[--j]) // find item on right to swap 214 | //  if (j == left) break; // don't go out-of-bounds 215 | if (i >= j) break; // check if pointers cross 216 | int swap = counts[i]; 217 | counts[i] = counts[j]; 218 | counts[j] = swap; // swap two elements into place 219 | 220 | int tmp = keys[i]; 221 | keys[i] = keys[j]; 222 | keys[j] = tmp; 223 | 224 | short tmp2 = counts2[i]; 225 | counts2[i] = counts2[j]; 226 | counts2[j] = tmp2; 227 | 228 | } 229 | int swap = counts[i]; 230 | counts[i] =
counts[right]; 231 | counts[right] = swap; 232 | 233 | int tmp = keys[i]; 234 | keys[i] = keys[right]; 235 | keys[right] = tmp; 236 | 237 | short tmp2 = counts2[i]; 238 | counts2[i] = counts2[right]; 239 | counts2[right] = tmp2; 240 | return i; 241 | } 242 | 243 | private static int partitionWithSecondary(int[] keys, int[] counts, long[] counts2, int left, int right) { 244 | int i = left - 1; 245 | int j = right; 246 | while (true) { 247 | do{ 248 | i++; 249 | }while (counts[i] < counts[right] || (counts[i] == counts[right] && keys[i] < keys[right])) // find item on left to swap 250 | ; // a[right] acts as sentinel 251 | 252 | //while (counts[++i] < counts[right]) // find item on left to swap 253 | // ; // a[right] acts as sentinel 254 | do{ 255 | j--; 256 | }while (j!=left && (counts[right] < counts[j] || (counts[right] == counts[j] && keys[right] < keys[j]))) // find item on right to swap 257 | ; 258 | //while (counts[right] < counts[--j]) // find item on right to swap 259 | //  if (j == left) break; // don't go out-of-bounds 260 | if (i >= j) break; // check if pointers cross 261 | int swap = counts[i]; 262 | counts[i] = counts[j]; 263 | counts[j] = swap; // swap two elements into place 264 | 265 | int tmp = keys[i]; 266 | keys[i] = keys[j]; 267 | keys[j] = tmp; 268 | 269 | long tmp2 = counts2[i]; 270 | counts2[i] = counts2[j]; 271 | counts2[j] = tmp2; 272 | 273 | } 274 | int swap = counts[i]; 275 | counts[i] = counts[right]; 276 | counts[right] = swap; 277 | 278 | int tmp = keys[i]; 279 | keys[i] = keys[right]; 280 | keys[right] = tmp; 281 | 282 | long tmp2 = counts2[i]; 283 | counts2[i] = counts2[right]; 284 | counts2[right] = tmp2; 285 | return i; 286 | } 287 | 288 | private static int partition(int[] keys, int[] counts, int left, int right) { 289 | int i = left - 1; 290 | int j = right; 291 | while (true) { 292 | while (counts[++i] < counts[right]) // find item on left to swap 293 | ; // a[right] acts as sentinel 294 | while (counts[right] < counts[--j]) // find item on right to swap 295 | if (j == left) break; // don't go out-of-bounds 296 | if (i >= j) break; // check if pointers cross 297 | int swap = counts[i]; 298 | counts[i] = counts[j]; 299 | counts[j] = swap; // swap two elements into place 300 | 301 | int tmp = keys[i]; 302 | keys[i] = keys[j]; 303 | keys[j] = tmp; 304 | } 305 | int swap = counts[i]; 306 | counts[i] = counts[right]; 307 | counts[right] = swap; 308 | 309 | int tmp = keys[i]; 310 | keys[i] = keys[right]; 311 | keys[right] = tmp; 312 | 313 | return i; 314 | } 315 | } 316 | -------------------------------------------------------------------------------- /src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=INFO, A1 2 | log4j.appender.A1=org.apache.log4j.ConsoleAppender 3 | log4j.appender.A1.layout=org.apache.log4j.PatternLayout 4 | 5 | # Print the date in ISO 8601 format 6 | log4j.appender.A1.layout.ConversionPattern=%d [%t] %-5p %c{1} - %m%n 7 | log4j.logger.com.ning.http.client=WARN 8 | -------------------------------------------------------------------------------- /src/test/java/org/clueweb/data/PForDocVectorTest.java: -------------------------------------------------------------------------------- 1 | package org.clueweb.data; 2 | 3 | import static org.junit.Assert.assertEquals; 4 | import static org.junit.Assert.assertTrue; 5 | 6 | import java.util.Random; 7 | 8 | import junit.framework.JUnit4TestAdapter; 9 | import me.lemire.integercompression.FastPFOR; 10 | import me.lemire.integercompression.IntWrapper; 11 | 12 | import org.junit.Test; 13 | 14 | import tl.lin.data.array.IntArrayWritable; 15 | 16 | public class PForDocVectorTest { 17 | private static final
Random RANDOM = new Random(); 18 | 19 | @Test 20 | public void testPFor1() throws Exception { 21 | int len = 256; 22 | FastPFOR p4 = new FastPFOR(); 23 | int[] doc = new int[len]; 24 | for (int i = 0; i tokens = AnalyzerUtils.parse(analyzer, 23 | "The U.S. Dept. of Justice has announced that Panasonic and its subsidiary Sanyo have been fined $56.5 million for their roles in price fixing conspiracies involving battery cells and car parts."); 24 | 25 | System.out.println(Joiner.on(",").join(tokens)); 26 | assertEquals(19, tokens.size()); 27 | } 28 | 29 | @Test 30 | public void test2() throws Exception { 31 | Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43); 32 | List tokens = AnalyzerUtils.parse(analyzer, 33 | "The U.S. Dept. of Justice has announced that Panasonic and its subsidiary Sanyo have been fined $56.5 million for their roles in price fixing conspiracies involving battery cells and car parts."); 34 | 35 | System.out.println(Joiner.on(",").join(tokens)); 36 | assertEquals(23, tokens.size()); 37 | } 38 | 39 | public static junit.framework.Test suite() { 40 | return new JUnit4TestAdapter(PorterAnalyzerTest.class); 41 | } 42 | } --------------------------------------------------------------------------------
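Note: the tests above call AnalyzerUtils.parse(analyzer, text), a helper that is not included in this listing. The snippet below is a minimal, self-contained sketch of the equivalent Lucene 4.3 token loop around the PorterAnalyzer defined earlier; the class name, field name, sample sentence, and the expected output shown in the comment are illustrative assumptions, not part of the repository.

import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.clueweb.dictionary.PorterAnalyzer;

public class PorterAnalyzerDemo {
  public static void main(String[] args) throws Exception {
    Analyzer analyzer = new PorterAnalyzer();
    // The analyzer chain applies StandardTokenizer, lowercasing, stopword
    // removal (Terrier stopword list), and Porter stemming.
    TokenStream stream = analyzer.tokenStream("body",
        new StringReader("Hadoop tools for manipulating ClueWeb collections"));
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    List<String> tokens = new ArrayList<String>();
    stream.reset();
    while (stream.incrementToken()) {
      tokens.add(term.toString());
    }
    stream.end();
    stream.close();
    // Expected (approximately): [hadoop, tool, manipul, clueweb, collect]
    System.out.println(tokens);
  }
}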