├── .gitignore ├── .settings ├── org.eclipse.jdt.core.prefs └── org.eclipse.jdt.ui.prefs ├── HISTORY.md ├── README.md ├── pom.xml └── src ├── main ├── java │ └── org │ │ └── clueweb │ │ ├── clueweb09 │ │ ├── ClueWeb09WarcRecord.java │ │ ├── app │ │ │ ├── CountWarcRecordsNew.java │ │ │ └── CountWarcRecordsOld.java │ │ ├── mapred │ │ │ └── ClueWeb09InputFormat.java │ │ └── mapreduce │ │ │ └── ClueWeb09InputFormat.java │ │ ├── clueweb12 │ │ ├── ClueWeb12WarcRecord.java │ │ ├── app │ │ │ ├── BuildDictionary.java │ │ │ ├── BuildPForDocVectors.java │ │ │ ├── BuildVByteDocVectors.java │ │ │ ├── BuildWarcTrecIdMapping.java │ │ │ ├── ComputeTermStatistics.java │ │ │ ├── CountWarcRecordsNew.java │ │ │ ├── CountWarcRecordsOld.java │ │ │ ├── DumpWarcRecordsToPlainText.java │ │ │ ├── DumpWarcRecordsToTermIds.java │ │ │ ├── LMRetrieval.java │ │ │ ├── LookupWarcTrecIdMapping.java │ │ │ ├── MergeTermStatistics.java │ │ │ ├── ProcessPForDocVectors.java │ │ │ └── ProcessVByteDocVectors.java │ │ ├── mapred │ │ │ └── ClueWeb12InputFormat.java │ │ └── mapreduce │ │ │ └── ClueWeb12InputFormat.java │ │ ├── data │ │ ├── DocVector.java │ │ ├── Indexable.java │ │ ├── PForDocVector.java │ │ ├── TermStatistics.java │ │ ├── VByteDocVector.java │ │ └── WarcTrecIdMapping.java │ │ ├── dictionary │ │ ├── DefaultFrequencySortedDictionary.java │ │ ├── Dictionary.java │ │ ├── DictionaryTransformationStrategy.java │ │ ├── FrequencySortedDictionary.java │ │ ├── FrontCodedDictionary.java │ │ ├── LexicographicallySortedDictionary.java │ │ └── PorterAnalyzer.java │ │ └── util │ │ ├── AnalyzerFactory.java │ │ └── QuickSort.java └── resources │ └── log4j.properties └── test └── java └── org └── clueweb ├── data ├── PForDocVectorTest.java └── VByteDocVectorTest.java └── dictionary └── PorterAnalyzerTest.java /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .classpath 3 | .project 4 | target/ 5 | -------------------------------------------------------------------------------- /.settings/org.eclipse.jdt.ui.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | formatter_profile=_clueweb 3 | formatter_settings_version=12 4 | org.eclipse.jdt.ui.exception.name=e 5 | org.eclipse.jdt.ui.gettersetter.use.is=true 6 | org.eclipse.jdt.ui.keywordthis=false 7 | org.eclipse.jdt.ui.overrideannotation=true 8 | -------------------------------------------------------------------------------- /HISTORY.md: -------------------------------------------------------------------------------- 1 | Version 0.3 2 | =========== 3 | July 28, 2013 4 | 5 | + Added incomplete/untested support for ClueWeb09 (refactored package structure) 6 | + Added Basic LM Retrieval (currently hard-coded for PForDocVectors) 7 | 8 | Version 0.2 9 | =========== 10 | July 14, 2013 11 | 12 | + Refactored package layout, separating ClueWeb12-specific classes 13 | + Added PFor-compressed DocVector, in addition to the previous VByte-compressed version 14 | 15 | Version 0.1 16 | =========== 17 | July 10, 2013 18 | 19 | + Initial release 20 | + Ability to build global dictionary on ClueWeb12 21 | + Ability to convert ClueWeb12 into document vectors (with termid representation) 22 | 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ClueWeb Tools 2 | ============= 3 | 4 | Hadoop tools for manipulating ClueWeb collections, the most 
recent of which is the [ClueWeb12 collection](http://lemurproject.org/clueweb12/). 5 | 6 | Sign up for the mailing list at [the clueweb-list@cwi.nl mailman page](https://lists.cwi.nl/mailman/listinfo/clueweb-list). 7 | 8 | Getting Started 9 | -------------- 10 | 11 | You can clone the repo with the following command: 12 | 13 | ``` 14 | $ git clone git://github.com/lintool/clueweb.git 15 | ``` 16 | 17 | Once you've cloned the repository, build the package with Maven: 18 | 19 | ``` 20 | $ mvn clean package appassembler:assemble 21 | ``` 22 | 23 | Two notes: 24 | 25 | + `appassembler:assemble` automatically generates a few launch scripts for you. 26 | + In addition to the normal jar (`clueweb-tools-0.X-SNAPSHOT.jar`), this package uses the [Maven Shade plugin](http://maven.apache.org/plugins/maven-shade-plugin/) to create a "fat jar" (`clueweb-tools-0.X-SNAPSHOT-fatjar.jar`) that includes all dependencies except for Hadoop, so that the jar can be submitted directly via `hadoop jar ...`. 27 | 28 | To automatically generate project files for Eclipse: 29 | 30 | ``` 31 | $ mvn eclipse:clean 32 | $ mvn eclipse:eclipse 33 | ``` 34 | 35 | You can then use Eclipse's Import "Existing Projects into Workspace" functionality to import the project. 36 | 37 | Counting Records 38 | ---------------- 39 | 40 | For sanity checking and as a "template" for other Hadoop jobs, the package provides a simple program to count WARC records in ClueWeb12 (use `CountWarcRecordsOld` instead for the older `mapred` API): 41 | 42 | ``` 43 | hadoop jar target/clueweb-tools-0.X-SNAPSHOT-fatjar.jar \ 44 | org.clueweb.clueweb12.app.CountWarcRecordsNew -input /path/to/warc/files/ 45 | ``` 46 | 47 | Examples of `/path/to/warc/files/` are: 48 | 49 | + `/data/private/clueweb12/Disk1/ClueWeb12_00/*/*.warc.gz`: for a single ClueWeb12 segment 50 | + `/data/private/clueweb12/Disk1/ClueWeb12_*/*/*.warc.gz`: for an entire ClueWeb12 disk 51 | + `/data/private/clueweb12/Disk[1234]/ClueWeb12_*/*/*.warc.gz`: for all of ClueWeb12 52 | 53 | Building a Dictionary 54 | --------------------- 55 | 56 | The next step is to build a dictionary that provides three capabilities: 57 | 58 | + a bidirectional mapping from terms (strings) to termids (integers) 59 | + lookup of document frequency (*df*) by term or termid 60 | + lookup of collection frequency (*cf*) by term or termid 61 | 62 | To build the dictionary, we must first compute the term statistics: 63 | 64 | ``` 65 | hadoop jar target/clueweb-tools-0.X-SNAPSHOT-fatjar.jar \ 66 | org.clueweb.clueweb12.app.ComputeTermStatistics \ 67 | -input /data/private/clueweb12/Disk1/ClueWeb12_00/*/*.warc.gz \ 68 | -output term-stats/segment00 69 | ``` 70 | 71 | By default, the program throws away all terms with *df* less than 100, but this parameter can be set on the command line. The above command computes term statistics for a single segment of ClueWeb12. It's easier to compute term statistics segment by segment, which yields smaller and more manageable Hadoop jobs. 72 | 73 | Compute term statistics for all the other segments in the same manner, e.g., with a loop like the sketch below.
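For example, here is a minimal sketch that sweeps the segments on Disk1. It assumes, for illustration, that Disk1 holds segments `ClueWeb12_00` through `ClueWeb12_04`; adjust the segment list, disks, and paths to your copy of the collection:

```
for i in 00 01 02 03 04; do
  hadoop jar target/clueweb-tools-0.X-SNAPSHOT-fatjar.jar \
    org.clueweb.clueweb12.app.ComputeTermStatistics \
    -input /data/private/clueweb12/Disk1/ClueWeb12_${i}/*/*.warc.gz \
    -output term-stats/segment${i}
done
```

(As in the commands above, the wildcard is assumed to name HDFS paths, so the shell passes it through unexpanded for Hadoop to glob. The `term-stats/segment*` output naming matches the input pattern used by `MergeTermStatistics` in the next step.)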
74 | 75 | Next, merge all the segment statistics together: 76 | 77 | ``` 78 | hadoop jar target/clueweb-tools-0.X-SNAPSHOT-fatjar.jar \ 79 | org.clueweb.clueweb12.app.MergeTermStatistics \ 80 | -input term-stats/segment* -output term-stats-all 81 | ``` 82 | 83 | Finally, build the dictionary: 84 | 85 | ``` 86 | hadoop jar target/clueweb-tools-0.X-SNAPSHOT-fatjar.jar \ 87 | org.clueweb.clueweb12.app.BuildDictionary \ 88 | -input term-stats-all -output dictionary -count 7160086 89 | ``` 90 | 91 | You need to provide the number of terms in the dictionary via the `-count` option. That value is simply the number of records output by `MergeTermStatistics`. 92 | 93 | To explore the contents of the dictionary, use this little interactive program: 94 | 95 | ``` 96 | hadoop jar target/clueweb-tools-0.X-SNAPSHOT-fatjar.jar \ 97 | org.clueweb.dictionary.DefaultFrequencySortedDictionary dictionary 98 | ``` 99 | 100 | On ClueWeb12, following the above instructions will create a dictionary with 7,160,086 terms. 101 | 102 | 103 | **Implementation details:** Tokenization is performed by first using Jsoup to throw away all markup and then passing the resulting text through Lucene's `StandardAnalyzer`. 104 | 105 | The dictionary has two components: the terms are stored as a front-coded list (which necessarily means that the terms must be sorted), and a monotone minimal perfect hash function is used to hash terms (strings) into their lexicographic positions. Term-to-termid lookup is accomplished via the hash function (to avoid binary searching through the front-coded data structure, which is expensive). Termid-to-term lookup is accomplished by direct access into the front-coded list. An additional mapping table converts the lexicographic position into the (*df*-sorted) termid. 106 | 107 | Building Document Vectors 108 | ------------------------- 109 | 110 | With the dictionary, we can now convert the entire collection into a sequence of document vectors, where each document vector is represented by a sequence of termids; the termids map to the sequence of terms that comprise the document. These document vectors are much more compact and much faster to scan for processing purposes. 111 | 112 | The document vector is represented by the interface `org.clueweb.data.DocVector`. Currently, there are two concrete implementations: 113 | 114 | + `VByteDocVector`, which uses Hadoop's built-in utilities for writing variable-length integers (what Hadoop calls VInt). 115 | + `PForDocVector`, which uses PFor compression from Daniel Lemire's [JavaFastPFOR](https://github.com/lemire/JavaFastPFOR/) package. 116 | 117 | To build document vectors, use either `BuildVByteDocVectors` or `BuildPForDocVectors`: 118 | 119 | ``` 120 | hadoop jar target/clueweb-tools-0.X-SNAPSHOT-fatjar.jar \ 121 | org.clueweb.clueweb12.app.Build{VByte,PFor}DocVectors \ 122 | -input /data/private/clueweb12/Disk1/ClueWeb12_00/*/*.warc.gz \ 123 | -output /data/private/clueweb12/derived/docvectors/segment00 \ 124 | -dictionary /data/private/clueweb12/derived/dictionary \ 125 | -reducers 100 126 | ``` 127 | 128 | Note that the current code also requires a `-preprocessing` option that selects the analyzer (see `org.clueweb.util.AnalyzerFactory` for the accepted values). Once again, it's advisable to run on a segment at a time in order to keep the Hadoop job sizes manageable. The program runs identity reducers to repartition the document vectors into 100 parts (to avoid the small-files problem). 129 | 130 | The output directory will contain `SequenceFile`s, with a `Text` containing the WARC-TREC-ID as the key.
For VByte, the value will be a `BytesWritable` object; for PFor, the value will be an `IntArrayWritable` object. 131 | 132 | To process these document vectors, either use `ProcessVByteDocVectors` or `ProcessPForDocVectors` in the `org.clueweb.clueweb12.app` package, which provides sample code for consuming these document vectors and converting the termids back into terms. 133 | 134 | Size comparisons, on the entire ClueWeb12 collection: 135 | 136 | + 5.54 TB: original compressed WARC files 137 | + 1.08 TB: repackaged as `VByteDocVector`s 138 | + 0.86 TB: repackaged as `PForDocVector`s 139 | + ~1.6 TB: uncompressed termids (collection size is ~400 billion terms) 140 | 141 | License 142 | ------- 143 | 144 | Licensed under the Apache License, Version 2.0: http://www.apache.org/licenses/LICENSE-2.0 145 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | org.clueweb 4 | clueweb-tools 5 | jar 6 | 0.4-SNAPSHOT 7 | clueweb-tools 8 | Hadoop tools for working with the ClueWeb 2012 collection 9 | http://clueweb.org/ 10 | 11 | 12 | 13 | The Apache Software License, Version 2.0 14 | http://www.apache.org/licenses/LICENSE-2.0.txt 15 | repo 16 | 17 | 18 | 19 | 20 | scm:git:git@github.com:lintool/clueweb.git 21 | scm:git:git@github.com:lintool/clueweb.git 22 | git@github.com:lintool/clueweb.git 23 | 24 | 25 | 26 | 27 | lintool 28 | Jimmy Lin 29 | jimmylin@umd.edu 30 | 31 | 32 | 33 | 34 | org.sonatype.oss 35 | oss-parent 36 | 7 37 | 38 | 39 | 40 | 41 | default 42 | 43 | true 44 | 45 | 46 | 47 | 48 | org.codehaus.mojo 49 | appassembler-maven-plugin 50 | 1.3.1 51 | 52 | 53 | 54 | org.clueweb.clueweb12.app.BuildWarcTrecIdMapping 55 | BuildWarcTrecIdMapping 56 | 57 | 4g 58 | 59 | 60 | 61 | 62 | 63 | 64 | org.apache.maven.plugins 65 | maven-compiler-plugin 66 | 3.1 67 | 68 | 1.6 69 | 1.6 70 | 71 | 72 | 73 | org.apache.maven.plugins 74 | maven-shade-plugin 75 | 2.1 76 | 77 | 78 | package 79 | 80 | shade 81 | 82 | 83 | 84 | true 85 | fatjar 86 | 87 | 88 | org.apache.hadoop:* 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | deploy 100 | 101 | 102 | 103 | 104 | 105 | UTF-8 106 | UTF-8 107 | 108 | 109 | 110 | 111 | junit 112 | junit 113 | 4.11 114 | test 115 | 116 | 117 | org.apache.ant 118 | ant 119 | 1.9.1 120 | 121 | 122 | commons-cli 123 | commons-cli 124 | 1.2 125 | 126 | 127 | commons-io 128 | commons-io 129 | 2.4 130 | 131 | 132 | commons-lang 133 | commons-lang 134 | 2.6 135 | 136 | 137 | org.apache.lucene 138 | lucene-core 139 | 4.3.1 140 | 141 | 142 | org.apache.lucene 143 | lucene-queryparser 144 | 4.3.1 145 | 146 | 147 | org.apache.lucene 148 | lucene-analyzers-common 149 | 4.3.1 150 | 151 | 152 | log4j 153 | log4j 154 | 1.2.17 155 | 156 | 157 | org.apache.hadoop 158 | hadoop-core 159 | 1.1.2 160 | 161 | 162 | org.apache.hadoop 163 | hadoop-client 164 | 1.1.2 165 | 166 | 167 | org.jsoup 168 | jsoup 169 | 1.7.2 170 | 171 | 172 | com.google.guava 173 | guava 174 | 14.0.1 175 | 176 | 177 | it.unimi.dsi 178 | dsiutils 179 | 2.0.15 180 | true 181 | 182 | 183 | it.unimi.dsi 184 | sux4j 185 | 3.0.8 186 | true 187 | 188 | 189 | it.unimi.dsi 190 | fastutil 191 | 6.5.4 192 | true 193 | 194 | 195 | tl.lin 196 | lintools-datatypes 197 | 0.9.2 198 | 199 | 200 | tl.lin 201 | lintools-lucene 202 | 0.1.0 203 | 204 | 205 | me.lemire.integercompression 206 | JavaFastPFOR 207 | 0.0.3 208 | 209 | 210 | 211 | 
-------------------------------------------------------------------------------- /src/main/java/org/clueweb/clueweb09/app/CountWarcRecordsNew.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ClueWeb Tools: Hadoop tools for manipulating ClueWeb collections 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you 5 | * may not use this file except in compliance with the License. You may 6 | * obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing 14 | * permissions and limitations under the License. 15 | */ 16 | 17 | package org.clueweb.clueweb09.app; 18 | 19 | import java.io.IOException; 20 | import java.util.Arrays; 21 | 22 | import org.apache.commons.cli.CommandLine; 23 | import org.apache.commons.cli.CommandLineParser; 24 | import org.apache.commons.cli.GnuParser; 25 | import org.apache.commons.cli.HelpFormatter; 26 | import org.apache.commons.cli.OptionBuilder; 27 | import org.apache.commons.cli.Options; 28 | import org.apache.commons.cli.ParseException; 29 | import org.apache.hadoop.conf.Configured; 30 | import org.apache.hadoop.io.LongWritable; 31 | import org.apache.hadoop.io.NullWritable; 32 | import org.apache.hadoop.mapreduce.Counters; 33 | import org.apache.hadoop.mapreduce.Job; 34 | import org.apache.hadoop.mapreduce.Mapper; 35 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 36 | import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat; 37 | import org.apache.hadoop.util.Tool; 38 | import org.apache.hadoop.util.ToolRunner; 39 | import org.apache.log4j.Logger; 40 | import org.clueweb.clueweb09.ClueWeb09WarcRecord; 41 | import org.clueweb.clueweb09.mapreduce.ClueWeb09InputFormat; 42 | 43 | public class CountWarcRecordsNew extends Configured implements Tool { 44 | private static final Logger LOG = Logger.getLogger(CountWarcRecordsNew.class); 45 | 46 | private static enum Records { TOTAL, PAGES }; 47 | 48 | private static class MyMapper 49 | extends Mapper { 50 | @Override 51 | public void map(LongWritable key, ClueWeb09WarcRecord doc, Context context) 52 | throws IOException, InterruptedException { 53 | context.getCounter(Records.TOTAL).increment(1); 54 | 55 | String docid = doc.getHeaderMetadataItem("WARC-TREC-ID"); 56 | if (docid != null) { 57 | context.getCounter(Records.PAGES).increment(1); 58 | } 59 | } 60 | } 61 | 62 | public CountWarcRecordsNew() {} 63 | 64 | public static final String INPUT_OPTION = "input"; 65 | 66 | /** 67 | * Runs this tool. 
68 | */ 69 | @SuppressWarnings("static-access") 70 | public int run(String[] args) throws Exception { 71 | Options options = new Options(); 72 | 73 | options.addOption(OptionBuilder.withArgName("path").hasArg() 74 | .withDescription("input path").create(INPUT_OPTION)); 75 | 76 | CommandLine cmdline; 77 | CommandLineParser parser = new GnuParser(); 78 | try { 79 | cmdline = parser.parse(options, args); 80 | } catch (ParseException exp) { 81 | HelpFormatter formatter = new HelpFormatter(); 82 | formatter.printHelp(this.getClass().getName(), options); 83 | ToolRunner.printGenericCommandUsage(System.out); 84 | System.err.println("Error parsing command line: " + exp.getMessage()); 85 | return -1; 86 | } 87 | 88 | if (!cmdline.hasOption(INPUT_OPTION)) { 89 | HelpFormatter formatter = new HelpFormatter(); 90 | formatter.printHelp(this.getClass().getName(), options); 91 | ToolRunner.printGenericCommandUsage(System.out); 92 | return -1; 93 | } 94 | 95 | String input = cmdline.getOptionValue(INPUT_OPTION); 96 | 97 | LOG.info("Tool name: " + CountWarcRecordsNew.class.getSimpleName()); 98 | LOG.info(" - input: " + input); 99 | 100 | Job job = new Job(getConf(), CountWarcRecordsNew.class.getSimpleName() + ":" + input); 101 | job.setJarByClass(CountWarcRecordsNew.class); 102 | job.setNumReduceTasks(0); 103 | 104 | FileInputFormat.addInputPaths(job, input); 105 | 106 | job.setInputFormatClass(ClueWeb09InputFormat.class); 107 | job.setOutputFormatClass(NullOutputFormat.class); 108 | job.setMapperClass(MyMapper.class); 109 | 110 | job.waitForCompletion(true); 111 | 112 | Counters counters = job.getCounters(); 113 | int numDocs = (int) counters.findCounter(Records.PAGES).getValue(); 114 | LOG.info("Read " + numDocs + " docs."); 115 | 116 | return 0; 117 | } 118 | 119 | /** 120 | * Dispatches command-line arguments to the tool via the ToolRunner. 121 | */ 122 | public static void main(String[] args) throws Exception { 123 | LOG.info("Running " + CountWarcRecordsNew.class.getCanonicalName() + " with args " 124 | + Arrays.toString(args)); 125 | ToolRunner.run(new CountWarcRecordsNew(), args); 126 | } 127 | } 128 | -------------------------------------------------------------------------------- /src/main/java/org/clueweb/clueweb09/app/CountWarcRecordsOld.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ClueWeb Tools: Hadoop tools for manipulating ClueWeb collections 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you 5 | * may not use this file except in compliance with the License. You may 6 | * obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing 14 | * permissions and limitations under the License. 
15 | */ 16 | 17 | package org.clueweb.clueweb09.app; 18 | 19 | import java.io.IOException; 20 | import java.util.Arrays; 21 | 22 | import org.apache.commons.cli.CommandLine; 23 | import org.apache.commons.cli.CommandLineParser; 24 | import org.apache.commons.cli.GnuParser; 25 | import org.apache.commons.cli.HelpFormatter; 26 | import org.apache.commons.cli.OptionBuilder; 27 | import org.apache.commons.cli.Options; 28 | import org.apache.commons.cli.ParseException; 29 | import org.apache.hadoop.conf.Configured; 30 | import org.apache.hadoop.io.NullWritable; 31 | import org.apache.hadoop.io.Writable; 32 | import org.apache.hadoop.mapred.Counters; 33 | import org.apache.hadoop.mapred.FileInputFormat; 34 | import org.apache.hadoop.mapred.JobClient; 35 | import org.apache.hadoop.mapred.JobConf; 36 | import org.apache.hadoop.mapred.MapReduceBase; 37 | import org.apache.hadoop.mapred.Mapper; 38 | import org.apache.hadoop.mapred.OutputCollector; 39 | import org.apache.hadoop.mapred.Reporter; 40 | import org.apache.hadoop.mapred.RunningJob; 41 | import org.apache.hadoop.mapred.lib.NullOutputFormat; 42 | import org.apache.hadoop.util.Tool; 43 | import org.apache.hadoop.util.ToolRunner; 44 | import org.apache.log4j.Logger; 45 | import org.clueweb.clueweb09.ClueWeb09WarcRecord; 46 | import org.clueweb.clueweb09.mapred.ClueWeb09InputFormat; 47 | 48 | public class CountWarcRecordsOld extends Configured implements Tool { 49 | private static final Logger LOG = Logger.getLogger(CountWarcRecordsOld.class); 50 | 51 | private static enum Records { TOTAL, PAGES }; 52 | 53 | private static class MyMapper extends MapReduceBase implements 54 | Mapper { 55 | 56 | public void configure(JobConf job) {} 57 | 58 | public void map(Writable key, ClueWeb09WarcRecord doc, 59 | OutputCollector output, Reporter reporter) throws IOException { 60 | reporter.incrCounter(Records.TOTAL, 1); 61 | 62 | String docid = doc.getHeaderMetadataItem("WARC-TREC-ID"); 63 | if (docid != null) { 64 | reporter.incrCounter(Records.PAGES, 1); 65 | } 66 | } 67 | } 68 | 69 | public CountWarcRecordsOld() { 70 | } 71 | 72 | public static final String INPUT_OPTION = "input"; 73 | 74 | /** 75 | * Runs this tool. 
76 | */ 77 | @SuppressWarnings("static-access") 78 | public int run(String[] args) throws Exception { 79 | Options options = new Options(); 80 | 81 | options.addOption(OptionBuilder.withArgName("path").hasArg() 82 | .withDescription("input path").create(INPUT_OPTION)); 83 | 84 | CommandLine cmdline; 85 | CommandLineParser parser = new GnuParser(); 86 | try { 87 | cmdline = parser.parse(options, args); 88 | } catch (ParseException exp) { 89 | HelpFormatter formatter = new HelpFormatter(); 90 | formatter.printHelp(this.getClass().getName(), options); 91 | ToolRunner.printGenericCommandUsage(System.out); 92 | System.err.println("Error parsing command line: " + exp.getMessage()); 93 | return -1; 94 | } 95 | 96 | if (!cmdline.hasOption(INPUT_OPTION)) { 97 | HelpFormatter formatter = new HelpFormatter(); 98 | formatter.printHelp(this.getClass().getName(), options); 99 | ToolRunner.printGenericCommandUsage(System.out); 100 | return -1; 101 | } 102 | 103 | String input = cmdline.getOptionValue(INPUT_OPTION); 104 | 105 | LOG.info("Tool name: " + CountWarcRecordsOld.class.getSimpleName()); 106 | LOG.info(" - input: " + input); 107 | 108 | JobConf conf = new JobConf(getConf(), CountWarcRecordsOld.class); 109 | conf.setJobName(CountWarcRecordsOld.class.getSimpleName() + ":" + input); 110 | 111 | conf.setNumReduceTasks(0); 112 | 113 | FileInputFormat.addInputPaths(conf, input); 114 | 115 | conf.setInputFormat(ClueWeb09InputFormat.class); 116 | conf.setOutputFormat(NullOutputFormat.class); 117 | conf.setMapperClass(MyMapper.class); 118 | 119 | RunningJob job = JobClient.runJob(conf); 120 | Counters counters = job.getCounters(); 121 | int numDocs = (int) counters.findCounter(Records.PAGES).getCounter(); 122 | 123 | LOG.info("Read " + numDocs + " docs."); 124 | 125 | return 0; 126 | } 127 | 128 | /** 129 | * Dispatches command-line arguments to the tool via the ToolRunner. 130 | */ 131 | public static void main(String[] args) throws Exception { 132 | LOG.info("Running " + CountWarcRecordsOld.class.getCanonicalName() + " with args " 133 | + Arrays.toString(args)); 134 | ToolRunner.run(new CountWarcRecordsOld(), args); 135 | } 136 | } 137 | -------------------------------------------------------------------------------- /src/main/java/org/clueweb/clueweb09/mapred/ClueWeb09InputFormat.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ClueWeb Tools: Hadoop tools for manipulating ClueWeb collections 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you 5 | * may not use this file except in compliance with the License. You may 6 | * obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing 14 | * permissions and limitations under the License. 15 | */ 16 | 17 | /* 18 | * Hadoop FileInputFormat for reading WARC files 19 | * 20 | * (C) 2009 - Carnegie Mellon University 21 | * 22 | * 1. Redistributions of this source code must retain the above copyright 23 | * notice, this list of conditions and the following disclaimer. 24 | * 2. 
The names "Lemur", "Indri", "University of Massachusetts", 25 | * "Carnegie Mellon", and "lemurproject" must not be used to 26 | * endorse or promote products derived from this software without 27 | * prior written permission. To obtain permission, contact 28 | * license@lemurproject.org. 29 | * 30 | * 4. Products derived from this software may not be called "Lemur" or "Indri" 31 | * nor may "Lemur" or "Indri" appear in their names without prior written 32 | * permission of The Lemur Project. To obtain permission, 33 | * contact license@lemurproject.org. 34 | * 35 | * THIS SOFTWARE IS PROVIDED BY THE LEMUR PROJECT AS PART OF THE CLUEWEB09 36 | * PROJECT AND OTHER CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED 37 | * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 38 | * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN 39 | * NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY 40 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 41 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 42 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 43 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 44 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING 45 | * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 46 | * POSSIBILITY OF SUCH DAMAGE. 47 | * 48 | * @author mhoy@cs.cmu.edu (Mark J. Hoy) 49 | */ 50 | 51 | package org.clueweb.clueweb09.mapred; 52 | 53 | import java.io.DataInputStream; 54 | import java.io.IOException; 55 | 56 | import org.apache.hadoop.conf.Configuration; 57 | import org.apache.hadoop.fs.FileSystem; 58 | import org.apache.hadoop.fs.Path; 59 | import org.apache.hadoop.io.LongWritable; 60 | import org.apache.hadoop.io.compress.CompressionCodec; 61 | import org.apache.hadoop.io.compress.CompressionCodecFactory; 62 | import org.apache.hadoop.mapred.FileInputFormat; 63 | import org.apache.hadoop.mapred.FileSplit; 64 | import org.apache.hadoop.mapred.InputSplit; 65 | import org.apache.hadoop.mapred.JobConf; 66 | import org.apache.hadoop.mapred.RecordReader; 67 | import org.apache.hadoop.mapred.Reporter; 68 | import org.clueweb.clueweb09.ClueWeb09WarcRecord; 69 | 70 | public class ClueWeb09InputFormat extends FileInputFormat { 71 | 72 | /** 73 | * Don't allow the files to be split! 74 | */ 75 | @Override 76 | protected boolean isSplitable(FileSystem fs, Path filename) { 77 | // ensure the input files are not splittable! 
78 | return false; 79 | } 80 | 81 | /** 82 | * Just return the record reader 83 | */ 84 | public RecordReader getRecordReader(InputSplit split, JobConf conf, 85 | Reporter reporter) throws IOException { 86 | return new ClueWarcRecordReader(conf, (FileSplit) split); 87 | } 88 | 89 | public static class ClueWarcRecordReader implements RecordReader { 90 | private long recordCount = 1; 91 | private Path path = null; 92 | private DataInputStream input = null; 93 | 94 | private long totalNumBytesRead = 0; 95 | 96 | public ClueWarcRecordReader(Configuration conf, FileSplit split) throws IOException { 97 | FileSystem fs = FileSystem.get(conf); 98 | path = split.getPath(); 99 | 100 | CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(conf); 101 | CompressionCodec compressionCodec = compressionCodecs.getCodec(path); 102 | input = new DataInputStream(compressionCodec.createInputStream(fs.open(path))); 103 | } 104 | 105 | @Override 106 | public boolean next(LongWritable key, ClueWeb09WarcRecord value) throws IOException { 107 | DataInputStream whichStream = input; 108 | 109 | ClueWeb09WarcRecord newRecord = ClueWeb09WarcRecord.readNextWarcRecord(whichStream); 110 | if (newRecord == null) { 111 | return false; 112 | } 113 | 114 | totalNumBytesRead += (long) newRecord.getTotalRecordLength(); 115 | newRecord.setWarcFilePath(path.toString()); 116 | 117 | value.set(newRecord); 118 | key.set(recordCount); 119 | 120 | recordCount++; 121 | return true; 122 | } 123 | 124 | @Override 125 | public LongWritable createKey() { 126 | return new LongWritable(); 127 | } 128 | 129 | @Override 130 | public ClueWeb09WarcRecord createValue() { 131 | return new ClueWeb09WarcRecord(); 132 | } 133 | 134 | @Override 135 | public long getPos() throws IOException { 136 | return totalNumBytesRead; 137 | } 138 | 139 | @Override 140 | public void close() throws IOException { 141 | input.close(); 142 | } 143 | 144 | @Override 145 | public float getProgress() throws IOException { 146 | return (float) recordCount / 40000f; 147 | } 148 | } 149 | } 150 | -------------------------------------------------------------------------------- /src/main/java/org/clueweb/clueweb09/mapreduce/ClueWeb09InputFormat.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ClueWeb Tools: Hadoop tools for manipulating ClueWeb collections 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you 5 | * may not use this file except in compliance with the License. You may 6 | * obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing 14 | * permissions and limitations under the License. 
15 | */ 16 | 17 | package org.clueweb.clueweb09.mapreduce; 18 | 19 | import java.io.DataInputStream; 20 | import java.io.IOException; 21 | 22 | import org.apache.hadoop.conf.Configuration; 23 | import org.apache.hadoop.fs.FSDataInputStream; 24 | import org.apache.hadoop.fs.FileSystem; 25 | import org.apache.hadoop.fs.Path; 26 | import org.apache.hadoop.fs.Seekable; 27 | import org.apache.hadoop.io.LongWritable; 28 | import org.apache.hadoop.io.compress.CodecPool; 29 | import org.apache.hadoop.io.compress.CompressionCodec; 30 | import org.apache.hadoop.io.compress.CompressionCodecFactory; 31 | import org.apache.hadoop.io.compress.Decompressor; 32 | import org.apache.hadoop.mapreduce.InputSplit; 33 | import org.apache.hadoop.mapreduce.JobContext; 34 | import org.apache.hadoop.mapreduce.RecordReader; 35 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 36 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 37 | import org.apache.hadoop.mapreduce.lib.input.FileSplit; 38 | import org.clueweb.clueweb09.ClueWeb09WarcRecord; 39 | 40 | public class ClueWeb09InputFormat extends FileInputFormat { 41 | @Override 42 | public RecordReader createRecordReader(InputSplit split, 43 | TaskAttemptContext context) throws IOException, InterruptedException { 44 | return new ClueWarcRecordReader(); 45 | } 46 | 47 | @Override 48 | protected boolean isSplitable(JobContext context, Path filename) { 49 | return false; 50 | } 51 | 52 | public class ClueWarcRecordReader extends RecordReader { 53 | private CompressionCodecFactory compressionCodecs = null; 54 | private long start; 55 | private long pos; 56 | private long end; 57 | private LongWritable key = null; 58 | private ClueWeb09WarcRecord value = null; 59 | private Seekable filePosition; 60 | private CompressionCodec codec; 61 | private Decompressor decompressor; 62 | private DataInputStream in; 63 | 64 | public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException { 65 | FileSplit split = (FileSplit) genericSplit; 66 | Configuration job = context.getConfiguration(); 67 | start = split.getStart(); 68 | end = start + split.getLength(); 69 | final Path file = split.getPath(); 70 | compressionCodecs = new CompressionCodecFactory(job); 71 | codec = compressionCodecs.getCodec(file); 72 | 73 | // open the file and seek to the start of the split 74 | FileSystem fs = file.getFileSystem(job); 75 | FSDataInputStream fileIn = fs.open(split.getPath()); 76 | 77 | if (isCompressedInput()) { 78 | in = new DataInputStream(codec.createInputStream(fileIn, decompressor)); 79 | filePosition = fileIn; 80 | } else { 81 | fileIn.seek(start); 82 | in = fileIn; 83 | filePosition = fileIn; 84 | } 85 | 86 | this.pos = start; 87 | } 88 | 89 | private boolean isCompressedInput() { 90 | return (codec != null); 91 | } 92 | 93 | private long getFilePosition() throws IOException { 94 | long retVal; 95 | if (isCompressedInput() && null != filePosition) { 96 | retVal = filePosition.getPos(); 97 | } else { 98 | retVal = pos; 99 | } 100 | return retVal; 101 | } 102 | 103 | public boolean nextKeyValue() throws IOException { 104 | if (key == null) { 105 | key = new LongWritable(); 106 | } 107 | key.set(pos); 108 | 109 | value = ClueWeb09WarcRecord.readNextWarcRecord(in); 110 | if (value == null) { 111 | return false; 112 | } 113 | return true; 114 | } 115 | 116 | @Override 117 | public LongWritable getCurrentKey() { 118 | return key; 119 | } 120 | 121 | @Override 122 | public ClueWeb09WarcRecord getCurrentValue() { 123 | return value; 124 | } 125 | 
126 | /** 127 | * Get the progress within the split 128 | */ 129 | public float getProgress() throws IOException { 130 | if (start == end) { 131 | return 0.0f; 132 | } else { 133 | return Math.min(1.0f, (getFilePosition() - start) / (float) (end - start)); 134 | } 135 | } 136 | 137 | public synchronized void close() throws IOException { 138 | try { 139 | if (in != null) { 140 | in.close(); 141 | } 142 | } finally { 143 | if (decompressor != null) { 144 | CodecPool.returnDecompressor(decompressor); 145 | } 146 | } 147 | } 148 | } 149 | } 150 | -------------------------------------------------------------------------------- /src/main/java/org/clueweb/clueweb12/app/BuildDictionary.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ClueWeb Tools: Hadoop tools for manipulating ClueWeb collections 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you 5 | * may not use this file except in compliance with the License. You may 6 | * obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing 14 | * permissions and limitations under the License. 15 | */ 16 | 17 | package org.clueweb.clueweb12.app; 18 | 19 | import it.unimi.dsi.sux4j.mph.TwoStepsLcpMonotoneMinimalPerfectHashFunction; 20 | import it.unimi.dsi.util.FrontCodedStringList; 21 | import it.unimi.dsi.util.ShiftAddXorSignedStringMap; 22 | 23 | import java.io.ByteArrayOutputStream; 24 | import java.io.IOException; 25 | import java.io.ObjectOutputStream; 26 | import java.util.Arrays; 27 | import java.util.Iterator; 28 | import java.util.List; 29 | 30 | import org.apache.commons.cli.CommandLine; 31 | import org.apache.commons.cli.CommandLineParser; 32 | import org.apache.commons.cli.GnuParser; 33 | import org.apache.commons.cli.HelpFormatter; 34 | import org.apache.commons.cli.OptionBuilder; 35 | import org.apache.commons.cli.Options; 36 | import org.apache.commons.cli.ParseException; 37 | import org.apache.hadoop.conf.Configuration; 38 | import org.apache.hadoop.conf.Configured; 39 | import org.apache.hadoop.fs.FSDataOutputStream; 40 | import org.apache.hadoop.fs.FileSystem; 41 | import org.apache.hadoop.fs.Path; 42 | import org.apache.hadoop.io.NullWritable; 43 | import org.apache.hadoop.io.Text; 44 | import org.apache.hadoop.io.WritableUtils; 45 | import org.apache.hadoop.mapreduce.Job; 46 | import org.apache.hadoop.mapreduce.Mapper; 47 | import org.apache.hadoop.mapreduce.Reducer; 48 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 49 | import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; 50 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 51 | import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat; 52 | import org.apache.hadoop.util.Tool; 53 | import org.apache.hadoop.util.ToolRunner; 54 | import org.apache.log4j.Logger; 55 | import org.clueweb.dictionary.DictionaryTransformationStrategy; 56 | import org.clueweb.util.QuickSort; 57 | 58 | import tl.lin.data.pair.PairOfIntLong; 59 | 60 | import com.google.common.collect.Lists; 61 | 62 | public class BuildDictionary extends Configured implements Tool { 63 | private static final Logger LOG = Logger.getLogger(BuildDictionary.class); 64 | 65 | private 
static final String HADOOP_OUTPUT_OPTION = "dictionary.path"; 66 | private static final String HADOOP_TERMS_COUNT_OPTION = "terms.count"; 67 | 68 | protected static enum Terms { Total } 69 | 70 | public static final String TERMS_DATA = "dictionary.terms"; 71 | public static final String TERMS_ID_DATA = "dictionary.ids"; 72 | public static final String TERMS_ID_MAPPING_DATA = "dictionary.mapping"; 73 | 74 | public static final String DF_BY_TERM_DATA = "df.terms"; 75 | public static final String DF_BY_ID_DATA = "df.ids"; 76 | 77 | public static final String CF_BY_TERM_DATA = "cf.terms"; 78 | public static final String CF_BY_ID_DATA = "cf.ids"; 79 | 80 | private static class MyReducer 81 | extends Reducer { 82 | private FSDataOutputStream termsOut, idsOut, idsToTermOut, 83 | dfByTermOut, cfByTermOut, dfByIntOut, cfByIntOut; 84 | private int numTerms; 85 | private int[] seqNums = null; 86 | private int[] dfs = null; 87 | private long[] cfs = null; 88 | private int curKeyIndex = 0; 89 | 90 | private String[] terms; 91 | 92 | @Override 93 | public void setup(Reducer.Context context) 94 | throws IOException { 95 | LOG.info("Starting setup."); 96 | Configuration conf = context.getConfiguration(); 97 | FileSystem fs = FileSystem.get(conf); 98 | 99 | numTerms = conf.getInt(HADOOP_TERMS_COUNT_OPTION, 0); 100 | LOG.info(HADOOP_TERMS_COUNT_OPTION + ": " + numTerms); 101 | String basePath = conf.get(HADOOP_OUTPUT_OPTION); 102 | LOG.info(HADOOP_OUTPUT_OPTION + ": " + basePath); 103 | 104 | terms = new String[numTerms]; 105 | seqNums = new int[numTerms]; 106 | dfs = new int[numTerms]; 107 | cfs = new long[numTerms]; 108 | 109 | termsOut = fs.create(new Path(basePath, TERMS_DATA), true); 110 | 111 | idsOut = fs.create(new Path(basePath, TERMS_ID_DATA), true); 112 | idsOut.writeInt(numTerms); 113 | 114 | idsToTermOut = fs.create(new Path(basePath, TERMS_ID_MAPPING_DATA), true); 115 | idsToTermOut.writeInt(numTerms); 116 | 117 | dfByTermOut = fs.create(new Path(basePath, DF_BY_TERM_DATA), true); 118 | dfByTermOut.writeInt(numTerms); 119 | 120 | cfByTermOut = fs.create(new Path(basePath, CF_BY_TERM_DATA), true); 121 | cfByTermOut.writeInt(numTerms); 122 | 123 | dfByIntOut = fs.create(new Path(basePath, DF_BY_ID_DATA), true); 124 | dfByIntOut.writeInt(numTerms); 125 | 126 | cfByIntOut = fs.create(new Path(basePath, CF_BY_ID_DATA), true); 127 | cfByIntOut.writeInt(numTerms); 128 | LOG.info("Finished setup."); 129 | } 130 | 131 | @Override 132 | public void reduce(Text key, Iterable values, Context context) 133 | throws IOException, InterruptedException { 134 | String term = key.toString(); 135 | Iterator iter = values.iterator(); 136 | PairOfIntLong p = iter.next(); 137 | int df = p.getLeftElement(); 138 | long cf = p.getRightElement(); 139 | WritableUtils.writeVInt(dfByTermOut, df); 140 | WritableUtils.writeVLong(cfByTermOut, cf); 141 | 142 | if (iter.hasNext()) { 143 | throw new RuntimeException("More than one record for term: " + term); 144 | } 145 | 146 | terms[curKeyIndex] = term; 147 | seqNums[curKeyIndex] = curKeyIndex; 148 | dfs[curKeyIndex] = -df; 149 | cfs[curKeyIndex] = cf; 150 | curKeyIndex++; 151 | 152 | context.getCounter(Terms.Total).increment(1); 153 | } 154 | 155 | @Override 156 | public void cleanup( 157 | Reducer.Context context) 158 | throws IOException { 159 | LOG.info("Starting cleanup."); 160 | if (curKeyIndex != numTerms) { 161 | throw new RuntimeException("Total expected Terms: " + numTerms + 162 | ", Total observed terms: " + curKeyIndex + "!"); 163 | } 164 | // Sort based on df and 
change seqNums accordingly. 165 | QuickSort.quicksortWithSecondary(seqNums, dfs, cfs, 0, numTerms - 1); 166 | 167 | // Write sorted dfs and cfs by int here. 168 | for (int i = 0; i < numTerms; i++) { 169 | WritableUtils.writeVInt(dfByIntOut, -dfs[i]); 170 | WritableUtils.writeVLong(cfByIntOut, cfs[i]); 171 | } 172 | cfs = null; 173 | 174 | // Encode the sorted dfs into ids ==> df values erased and become ids instead. Note that first 175 | // term id is 1. 176 | for (int i = 0; i < numTerms; i++) { 177 | dfs[i] = i + 1; 178 | } 179 | 180 | // Write current seq nums to be index into the term array. 181 | for (int i = 0; i < numTerms; i++) 182 | idsToTermOut.writeInt(seqNums[i]); 183 | 184 | // Sort on seqNums to get the right writing order. 185 | QuickSort.quicksort(dfs, seqNums, 0, numTerms - 1); 186 | for (int i = 0; i < numTerms; i++) { 187 | idsOut.writeInt(dfs[i]); 188 | } 189 | 190 | ByteArrayOutputStream bytesOut; 191 | ObjectOutputStream objOut; 192 | byte[] bytes; 193 | 194 | List termList = Lists.newArrayList(terms); 195 | FrontCodedStringList frontcodedList = new FrontCodedStringList(termList, 8, true); 196 | 197 | bytesOut = new ByteArrayOutputStream(); 198 | objOut = new ObjectOutputStream(bytesOut); 199 | objOut.writeObject(frontcodedList); 200 | objOut.close(); 201 | 202 | bytes = bytesOut.toByteArray(); 203 | termsOut.writeInt(bytes.length); 204 | termsOut.write(bytes); 205 | 206 | ShiftAddXorSignedStringMap dict = new ShiftAddXorSignedStringMap(termList.iterator(), 207 | new TwoStepsLcpMonotoneMinimalPerfectHashFunction(termList, 208 | DictionaryTransformationStrategy.getStrategy())); 209 | 210 | bytesOut = new ByteArrayOutputStream(); 211 | objOut = new ObjectOutputStream(bytesOut); 212 | objOut.writeObject(dict); 213 | objOut.close(); 214 | 215 | bytes = bytesOut.toByteArray(); 216 | termsOut.writeInt(bytes.length); 217 | termsOut.write(bytes); 218 | 219 | termsOut.close(); 220 | idsOut.close(); 221 | idsToTermOut.close(); 222 | dfByTermOut.close(); 223 | cfByTermOut.close(); 224 | dfByIntOut.close(); 225 | cfByIntOut.close(); 226 | LOG.info("Finished cleanup."); 227 | } 228 | } 229 | 230 | public static final String INPUT_OPTION = "input"; 231 | public static final String OUTPUT_OPTION = "output"; 232 | public static final String COUNT_OPTION = "count"; 233 | 234 | /** 235 | * Runs this tool. 
236 | */ 237 | @SuppressWarnings("static-access") 238 | public int run(String[] args) throws Exception { 239 | Options options = new Options(); 240 | 241 | options.addOption(OptionBuilder.withArgName("path").hasArg() 242 | .withDescription("input path").create(INPUT_OPTION)); 243 | options.addOption(OptionBuilder.withArgName("path").hasArg() 244 | .withDescription("output path").create(OUTPUT_OPTION)); 245 | options.addOption(OptionBuilder.withArgName("num").hasArg() 246 | .withDescription("number of terms").create(COUNT_OPTION)); 247 | 248 | CommandLine cmdline; 249 | CommandLineParser parser = new GnuParser(); 250 | try { 251 | cmdline = parser.parse(options, args); 252 | } catch (ParseException exp) { 253 | HelpFormatter formatter = new HelpFormatter(); 254 | formatter.printHelp(this.getClass().getName(), options); 255 | ToolRunner.printGenericCommandUsage(System.out); 256 | System.err.println("Error parsing command line: " + exp.getMessage()); 257 | return -1; 258 | } 259 | 260 | if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION) || 261 | !cmdline.hasOption(COUNT_OPTION)) { 262 | HelpFormatter formatter = new HelpFormatter(); 263 | formatter.printHelp(this.getClass().getName(), options); 264 | ToolRunner.printGenericCommandUsage(System.out); 265 | return -1; 266 | } 267 | 268 | String input = cmdline.getOptionValue(INPUT_OPTION); 269 | String output = cmdline.getOptionValue(OUTPUT_OPTION); 270 | 271 | LOG.info("Tool name: " + ComputeTermStatistics.class.getSimpleName()); 272 | LOG.info(" - input: " + input); 273 | LOG.info(" - output: " + output); 274 | 275 | Configuration conf = getConf(); 276 | 277 | conf.set(HADOOP_OUTPUT_OPTION, output); 278 | conf.setInt(HADOOP_TERMS_COUNT_OPTION, 279 | Integer.parseInt(cmdline.getOptionValue(COUNT_OPTION))); 280 | conf.set("mapreduce.map.memory.mb", "2048"); 281 | conf.set("mapreduce.map.java.opts", "-Xmx2048m"); 282 | conf.set("mapreduce.reduce.memory.mb", "2048"); 283 | conf.set("mapreduce.reduce.java.opts", "-Xmx2048m"); 284 | 285 | Job job = new Job(conf, BuildDictionary.class.getSimpleName() + ":" + input); 286 | 287 | job.setJarByClass(BuildDictionary.class); 288 | job.setNumReduceTasks(1); 289 | 290 | FileInputFormat.setInputPaths(job, new Path(input)); 291 | FileOutputFormat.setOutputPath(job, new Path(output)); 292 | 293 | job.setInputFormatClass(SequenceFileInputFormat.class); 294 | job.setOutputFormatClass(NullOutputFormat.class); 295 | 296 | job.setMapOutputKeyClass(Text.class); 297 | job.setMapOutputValueClass(PairOfIntLong.class); 298 | job.setOutputKeyClass(Text.class); 299 | job.setSortComparatorClass(DictionaryTransformationStrategy.WritableComparator.class); 300 | 301 | job.setMapperClass(Mapper.class); 302 | job.setReducerClass(MyReducer.class); 303 | 304 | FileSystem.get(getConf()).delete(new Path(output), true); 305 | long startTime = System.currentTimeMillis(); 306 | job.waitForCompletion(true); 307 | LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); 308 | 309 | return 0; 310 | } 311 | 312 | /** 313 | * Dispatches command-line arguments to the tool via the ToolRunner. 
314 | */ 315 | public static void main(String[] args) throws Exception { 316 | LOG.info("Running " + BuildDictionary.class.getCanonicalName() + " with args " 317 | + Arrays.toString(args)); 318 | ToolRunner.run(new BuildDictionary(), args); 319 | } 320 | } 321 | -------------------------------------------------------------------------------- /src/main/java/org/clueweb/clueweb12/app/BuildPForDocVectors.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ClueWeb Tools: Hadoop tools for manipulating ClueWeb collections 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you 5 | * may not use this file except in compliance with the License. You may 6 | * obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing 14 | * permissions and limitations under the License. 15 | */ 16 | 17 | package org.clueweb.clueweb12.app; 18 | 19 | import java.io.IOException; 20 | import java.util.Arrays; 21 | import java.util.List; 22 | 23 | import org.apache.commons.cli.CommandLine; 24 | import org.apache.commons.cli.CommandLineParser; 25 | import org.apache.commons.cli.GnuParser; 26 | import org.apache.commons.cli.HelpFormatter; 27 | import org.apache.commons.cli.OptionBuilder; 28 | import org.apache.commons.cli.Options; 29 | import org.apache.commons.cli.ParseException; 30 | import org.apache.hadoop.conf.Configured; 31 | import org.apache.hadoop.fs.FileSystem; 32 | import org.apache.hadoop.fs.Path; 33 | import org.apache.hadoop.io.LongWritable; 34 | import org.apache.hadoop.io.Text; 35 | import org.apache.hadoop.mapreduce.Job; 36 | import org.apache.hadoop.mapreduce.Mapper; 37 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 38 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 39 | import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; 40 | import org.apache.hadoop.util.Tool; 41 | import org.apache.hadoop.util.ToolRunner; 42 | import org.apache.log4j.Logger; 43 | import org.apache.lucene.analysis.Analyzer; 44 | import org.clueweb.clueweb12.ClueWeb12WarcRecord; 45 | import org.clueweb.clueweb12.mapreduce.ClueWeb12InputFormat; 46 | import org.clueweb.data.PForDocVector; 47 | import org.clueweb.dictionary.DefaultFrequencySortedDictionary; 48 | import org.clueweb.util.AnalyzerFactory; 49 | import org.jsoup.Jsoup; 50 | 51 | import tl.lin.data.array.IntArrayWritable; 52 | import tl.lin.lucene.AnalyzerUtils; 53 | 54 | public class BuildPForDocVectors extends Configured implements Tool { 55 | private static final Logger LOG = Logger.getLogger(BuildPForDocVectors.class); 56 | 57 | private static enum Records { 58 | TOTAL, PAGES, ERRORS, TOO_LONG 59 | }; 60 | 61 | private static Analyzer ANALYZER; 62 | 63 | private static final int MAX_DOC_LENGTH = 512 * 1024; // Skip document if long than this. 
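// MyMapper (below) emits one output record per page that has a WARC-TREC-ID: the key is the docid
// (Text) and the value is the PFor-compressed termid sequence (IntArrayWritable) obtained by cleaning
// the content with Jsoup, tokenizing with the configured Analyzer, and mapping tokens to termids via
// the dictionary (out-of-dictionary tokens are dropped); overly long or unparseable documents are
// emitted as empty vectors and tallied in the TOO_LONG/ERRORS counters.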
64 | 65 | private static class MyMapper extends 66 | Mapper { 67 | private static final Text DOCID = new Text(); 68 | private static final IntArrayWritable DOC = new IntArrayWritable(); 69 | 70 | private DefaultFrequencySortedDictionary dictionary; 71 | 72 | @Override 73 | public void setup(Context context) throws IOException { 74 | FileSystem fs = FileSystem.get(context.getConfiguration()); 75 | String path = context.getConfiguration().get(DICTIONARY_OPTION); 76 | dictionary = new DefaultFrequencySortedDictionary(path, fs); 77 | 78 | String analyzerType = context.getConfiguration().get(PREPROCESSING); 79 | ANALYZER = AnalyzerFactory.getAnalyzer(analyzerType); 80 | if (ANALYZER == null) { 81 | LOG.error("Error: proprocessing type not recognized. Abort " + this.getClass().getName()); 82 | System.exit(1); 83 | } 84 | } 85 | 86 | @Override 87 | public void map(LongWritable key, ClueWeb12WarcRecord doc, Context context) 88 | throws IOException, InterruptedException { 89 | context.getCounter(Records.TOTAL).increment(1); 90 | 91 | String docid = doc.getHeaderMetadataItem("WARC-TREC-ID"); 92 | if (docid != null) { 93 | DOCID.set(docid); 94 | 95 | context.getCounter(Records.PAGES).increment(1); 96 | try { 97 | String content = doc.getContent(); 98 | 99 | // If the document is excessively long, it usually means that something is wrong (e.g., a 100 | // binary object). Skip so the parsing doesn't choke. 101 | // As an alternative, we might want to consider putting in a timeout, e.g., 102 | // http://stackoverflow.com/questions/2275443/how-to-timeout-a-thread 103 | if (content.length() > MAX_DOC_LENGTH) { 104 | LOG.info("Skipping " + docid + " due to excessive length: " + content.length()); 105 | context.getCounter(Records.TOO_LONG).increment(1); 106 | PForDocVector.toIntArrayWritable(DOC, new int[] {}, 0); 107 | context.write(DOCID, DOC); 108 | return; 109 | } 110 | 111 | String cleaned = Jsoup.parse(content).text(); 112 | List tokens = AnalyzerUtils.parse(ANALYZER, cleaned); 113 | 114 | int len = 0; 115 | int[] termids = new int[tokens.size()]; 116 | for (String token : tokens) { 117 | int id = dictionary.getId(token); 118 | if (id != -1) { 119 | termids[len] = id; 120 | len++; 121 | } 122 | } 123 | 124 | PForDocVector.toIntArrayWritable(DOC, termids, len); 125 | context.write(DOCID, DOC); 126 | } catch (Exception e) { 127 | // If Jsoup throws any exceptions, catch and move on, but emit empty doc. 128 | LOG.info("Error caught processing " + docid); 129 | DOC.setArray(new int[] {}); // Clean up possible corrupted data 130 | context.getCounter(Records.ERRORS).increment(1); 131 | PForDocVector.toIntArrayWritable(DOC, new int[] {}, 0); 132 | context.write(DOCID, DOC); 133 | } 134 | } 135 | } 136 | } 137 | 138 | public static final String INPUT_OPTION = "input"; 139 | public static final String OUTPUT_OPTION = "output"; 140 | public static final String DICTIONARY_OPTION = "dictionary"; 141 | public static final String REDUCERS_OPTION = "reducers"; 142 | public static final String PREPROCESSING = "preprocessing"; 143 | 144 | /** 145 | * Runs this tool. 
146 | */ 147 | @SuppressWarnings("static-access") 148 | public int run(String[] args) throws Exception { 149 | Options options = new Options(); 150 | 151 | options.addOption(OptionBuilder.withArgName("path").hasArg() 152 | .withDescription("input path").create(INPUT_OPTION)); 153 | options.addOption(OptionBuilder.withArgName("path").hasArg() 154 | .withDescription("output path").create(OUTPUT_OPTION)); 155 | options.addOption(OptionBuilder.withArgName("path").hasArg() 156 | .withDescription("dictionary").create(DICTIONARY_OPTION)); 157 | options.addOption(OptionBuilder.withArgName("num").hasArg() 158 | .withDescription("number of reducers").create(REDUCERS_OPTION)); 159 | options.addOption(OptionBuilder.withArgName("string " + AnalyzerFactory.getOptions()).hasArg() 160 | .withDescription("preprocessing").create(PREPROCESSING)); 161 | 162 | CommandLine cmdline; 163 | CommandLineParser parser = new GnuParser(); 164 | try { 165 | cmdline = parser.parse(options, args); 166 | } catch (ParseException exp) { 167 | HelpFormatter formatter = new HelpFormatter(); 168 | formatter.printHelp(this.getClass().getName(), options); 169 | ToolRunner.printGenericCommandUsage(System.out); 170 | System.err.println("Error parsing command line: " + exp.getMessage()); 171 | return -1; 172 | } 173 | 174 | if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION) 175 | || !cmdline.hasOption(DICTIONARY_OPTION) || !cmdline.hasOption(PREPROCESSING)) { 176 | HelpFormatter formatter = new HelpFormatter(); 177 | formatter.printHelp(this.getClass().getName(), options); 178 | ToolRunner.printGenericCommandUsage(System.out); 179 | return -1; 180 | } 181 | 182 | String input = cmdline.getOptionValue(INPUT_OPTION); 183 | String output = cmdline.getOptionValue(OUTPUT_OPTION); 184 | String dictionary = cmdline.getOptionValue(DICTIONARY_OPTION); 185 | String preprocessing = cmdline.getOptionValue(PREPROCESSING); 186 | 187 | Job job = new Job(getConf(), BuildPForDocVectors.class.getSimpleName() + ":" + input); 188 | job.setJarByClass(BuildPForDocVectors.class); 189 | 190 | LOG.info("Tool name: " + BuildPForDocVectors.class.getSimpleName()); 191 | LOG.info(" - input: " + input); 192 | LOG.info(" - output: " + output); 193 | LOG.info(" - dictionary: " + dictionary); 194 | LOG.info(" - preprocessing: " + preprocessing); 195 | 196 | if (cmdline.hasOption(REDUCERS_OPTION)) { 197 | int numReducers = Integer.parseInt(cmdline.getOptionValue(REDUCERS_OPTION)); 198 | LOG.info(" - reducers: " + numReducers); 199 | job.setNumReduceTasks(numReducers); 200 | } else { 201 | job.setNumReduceTasks(0); 202 | } 203 | 204 | FileInputFormat.setInputPaths(job, input); 205 | FileOutputFormat.setOutputPath(job, new Path(output)); 206 | 207 | job.getConfiguration().set(DICTIONARY_OPTION, dictionary); 208 | job.getConfiguration().set(PREPROCESSING, preprocessing); 209 | 210 | job.setInputFormatClass(ClueWeb12InputFormat.class); 211 | job.setOutputFormatClass(SequenceFileOutputFormat.class); 212 | 213 | job.setMapOutputKeyClass(Text.class); 214 | job.setMapOutputValueClass(IntArrayWritable.class); 215 | job.setOutputKeyClass(Text.class); 216 | job.setOutputValueClass(IntArrayWritable.class); 217 | 218 | job.setMapperClass(MyMapper.class); 219 | 220 | FileSystem.get(getConf()).delete(new Path(output), true); 221 | 222 | long startTime = System.currentTimeMillis(); 223 | job.waitForCompletion(true); 224 | LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); 225 | 226 | return 0; 227 | } 228 | 229 | 
/** 230 | * Dispatches command-line arguments to the tool via the ToolRunner. 231 | */ 232 | public static void main(String[] args) throws Exception { 233 | LOG.info("Running " + BuildPForDocVectors.class.getCanonicalName() + " with args " 234 | + Arrays.toString(args)); 235 | ToolRunner.run(new BuildPForDocVectors(), args); 236 | } 237 | } 238 | -------------------------------------------------------------------------------- /src/main/java/org/clueweb/clueweb12/app/BuildVByteDocVectors.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ClueWeb Tools: Hadoop tools for manipulating ClueWeb collections 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you 5 | * may not use this file except in compliance with the License. You may 6 | * obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing 14 | * permissions and limitations under the License. 15 | */ 16 | 17 | package org.clueweb.clueweb12.app; 18 | 19 | import java.io.IOException; 20 | import java.util.Arrays; 21 | import java.util.List; 22 | 23 | import org.apache.commons.cli.CommandLine; 24 | import org.apache.commons.cli.CommandLineParser; 25 | import org.apache.commons.cli.GnuParser; 26 | import org.apache.commons.cli.HelpFormatter; 27 | import org.apache.commons.cli.OptionBuilder; 28 | import org.apache.commons.cli.Options; 29 | import org.apache.commons.cli.ParseException; 30 | import org.apache.hadoop.conf.Configured; 31 | import org.apache.hadoop.fs.FileSystem; 32 | import org.apache.hadoop.fs.Path; 33 | import org.apache.hadoop.io.BytesWritable; 34 | import org.apache.hadoop.io.LongWritable; 35 | import org.apache.hadoop.io.Text; 36 | import org.apache.hadoop.mapreduce.Job; 37 | import org.apache.hadoop.mapreduce.Mapper; 38 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 39 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 40 | import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; 41 | import org.apache.hadoop.util.Tool; 42 | import org.apache.hadoop.util.ToolRunner; 43 | import org.apache.log4j.Logger; 44 | import org.apache.lucene.analysis.Analyzer; 45 | import org.clueweb.clueweb12.ClueWeb12WarcRecord; 46 | import org.clueweb.clueweb12.mapreduce.ClueWeb12InputFormat; 47 | import org.clueweb.data.VByteDocVector; 48 | import org.clueweb.dictionary.DefaultFrequencySortedDictionary; 49 | import org.clueweb.util.AnalyzerFactory; 50 | import org.jsoup.Jsoup; 51 | 52 | import tl.lin.lucene.AnalyzerUtils; 53 | 54 | public class BuildVByteDocVectors extends Configured implements Tool { 55 | private static final Logger LOG = Logger.getLogger(BuildVByteDocVectors.class); 56 | 57 | private static enum Records { 58 | TOTAL, PAGES, ERRORS, TOO_LONG 59 | }; 60 | 61 | private static Analyzer ANALYZER; 62 | 63 | private static final int MAX_DOC_LENGTH = 512 * 1024; // Skip document if long than this. 
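// MyMapper (below) mirrors the PFor version: for each page with a WARC-TREC-ID it emits the docid
// (Text) as key and the VByte-encoded termid sequence (BytesWritable) as value, using Jsoup for
// cleanup, the configured Analyzer for tokenization, and the dictionary for term-to-termid mapping;
// overly long or unparseable documents become empty vectors (TOO_LONG/ERRORS counters).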
64 | 65 | private static class MyMapper extends 66 | Mapper<LongWritable, ClueWeb12WarcRecord, Text, BytesWritable> { 67 | private static final Text DOCID = new Text(); 68 | private static final BytesWritable DOC = new BytesWritable(); 69 | 70 | private DefaultFrequencySortedDictionary dictionary; 71 | 72 | @Override 73 | public void setup(Context context) throws IOException { 74 | FileSystem fs = FileSystem.get(context.getConfiguration()); 75 | String path = context.getConfiguration().get(DICTIONARY_OPTION); 76 | dictionary = new DefaultFrequencySortedDictionary(path, fs); 77 | 78 | String analyzerType = context.getConfiguration().get(PREPROCESSING); 79 | ANALYZER = AnalyzerFactory.getAnalyzer(analyzerType); 80 | if (ANALYZER == null) { 81 | LOG.error("Error: preprocessing type not recognized. Abort " + this.getClass().getName()); 82 | System.exit(1); 83 | } 84 | } 85 | 86 | @Override 87 | public void map(LongWritable key, ClueWeb12WarcRecord doc, Context context) 88 | throws IOException, InterruptedException { 89 | context.getCounter(Records.TOTAL).increment(1); 90 | 91 | String docid = doc.getHeaderMetadataItem("WARC-TREC-ID"); 92 | if (docid != null) { 93 | DOCID.set(docid); 94 | 95 | context.getCounter(Records.PAGES).increment(1); 96 | try { 97 | String content = doc.getContent(); 98 | 99 | // If the document is excessively long, it usually means that something is wrong (e.g., a 100 | // binary object). Skip so the parsing doesn't choke. 101 | // As an alternative, we might want to consider putting in a timeout, e.g., 102 | // http://stackoverflow.com/questions/2275443/how-to-timeout-a-thread 103 | if (content.length() > MAX_DOC_LENGTH) { 104 | DOC.set(new byte[] {}, 0, 0); // Clean up possible corrupted data 105 | context.getCounter(Records.TOO_LONG).increment(1); 106 | VByteDocVector.toBytesWritable(DOC, new int[] {}, 0); 107 | context.write(DOCID, DOC); 108 | return; 109 | } 110 | 111 | String cleaned = Jsoup.parse(content).text(); 112 | List<String> tokens = AnalyzerUtils.parse(ANALYZER, cleaned); 113 | 114 | int len = 0; 115 | int[] termids = new int[tokens.size()]; 116 | for (String token : tokens) { 117 | int id = dictionary.getId(token); 118 | if (id != -1) { 119 | termids[len] = id; 120 | len++; 121 | } 122 | } 123 | 124 | VByteDocVector.toBytesWritable(DOC, termids, len); 125 | context.write(DOCID, DOC); 126 | } catch (Exception e) { 127 | // If Jsoup throws any exceptions, catch and move on, but emit empty doc. 128 | LOG.info("Error caught processing " + docid); 129 | context.getCounter(Records.ERRORS).increment(1); 130 | VByteDocVector.toBytesWritable(DOC, new int[] {}, 0); 131 | context.write(DOCID, DOC); 132 | } 133 | } 134 | } 135 | } 136 | 137 | public static final String INPUT_OPTION = "input"; 138 | public static final String OUTPUT_OPTION = "output"; 139 | public static final String DICTIONARY_OPTION = "dictionary"; 140 | public static final String REDUCERS_OPTION = "reducers"; 141 | public static final String PREPROCESSING = "preprocessing"; 142 | 143 | /** 144 | * Runs this tool.
145 | */ 146 | @SuppressWarnings("static-access") 147 | public int run(String[] args) throws Exception { 148 | Options options = new Options(); 149 | 150 | options.addOption(OptionBuilder.withArgName("path").hasArg() 151 | .withDescription("input path").create(INPUT_OPTION)); 152 | options.addOption(OptionBuilder.withArgName("path").hasArg() 153 | .withDescription("output path").create(OUTPUT_OPTION)); 154 | options.addOption(OptionBuilder.withArgName("path").hasArg() 155 | .withDescription("dictionary").create(DICTIONARY_OPTION)); 156 | options.addOption(OptionBuilder.withArgName("num").hasArg() 157 | .withDescription("number of reducers").create(REDUCERS_OPTION)); 158 | options.addOption(OptionBuilder.withArgName("string " + AnalyzerFactory.getOptions()).hasArg() 159 | .withDescription("preprocessing").create(PREPROCESSING)); 160 | 161 | CommandLine cmdline; 162 | CommandLineParser parser = new GnuParser(); 163 | try { 164 | cmdline = parser.parse(options, args); 165 | } catch (ParseException exp) { 166 | HelpFormatter formatter = new HelpFormatter(); 167 | formatter.printHelp(this.getClass().getName(), options); 168 | ToolRunner.printGenericCommandUsage(System.out); 169 | System.err.println("Error parsing command line: " + exp.getMessage()); 170 | return -1; 171 | } 172 | 173 | if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION) 174 | || !cmdline.hasOption(DICTIONARY_OPTION) || !cmdline.hasOption(PREPROCESSING)) { 175 | HelpFormatter formatter = new HelpFormatter(); 176 | formatter.printHelp(this.getClass().getName(), options); 177 | ToolRunner.printGenericCommandUsage(System.out); 178 | return -1; 179 | } 180 | 181 | String input = cmdline.getOptionValue(INPUT_OPTION); 182 | String output = cmdline.getOptionValue(OUTPUT_OPTION); 183 | String dictionary = cmdline.getOptionValue(DICTIONARY_OPTION); 184 | String preprocessing = cmdline.getOptionValue(PREPROCESSING); 185 | 186 | Job job = new Job(getConf(), BuildVByteDocVectors.class.getSimpleName() + ":" + input); 187 | job.setJarByClass(BuildVByteDocVectors.class); 188 | 189 | LOG.info("Tool name: " + BuildVByteDocVectors.class.getSimpleName()); 190 | LOG.info(" - input: " + input); 191 | LOG.info(" - output: " + output); 192 | LOG.info(" - dictionary: " + dictionary); 193 | LOG.info(" - preprocessing: " + preprocessing); 194 | 195 | if (cmdline.hasOption(REDUCERS_OPTION)) { 196 | int numReducers = Integer.parseInt(cmdline.getOptionValue(REDUCERS_OPTION)); 197 | LOG.info(" - reducers: " + numReducers); 198 | job.setNumReduceTasks(numReducers); 199 | } else { 200 | job.setNumReduceTasks(0); 201 | } 202 | 203 | FileInputFormat.setInputPaths(job, input); 204 | FileOutputFormat.setOutputPath(job, new Path(output)); 205 | 206 | job.getConfiguration().set(DICTIONARY_OPTION, dictionary); 207 | job.getConfiguration().set(PREPROCESSING, preprocessing); 208 | 209 | job.setInputFormatClass(ClueWeb12InputFormat.class); 210 | job.setOutputFormatClass(SequenceFileOutputFormat.class); 211 | 212 | job.setMapOutputKeyClass(Text.class); 213 | job.setMapOutputValueClass(BytesWritable.class); 214 | job.setOutputKeyClass(Text.class); 215 | job.setOutputValueClass(BytesWritable.class); 216 | 217 | job.setMapperClass(MyMapper.class); 218 | 219 | FileSystem.get(getConf()).delete(new Path(output), true); 220 | 221 | long startTime = System.currentTimeMillis(); 222 | job.waitForCompletion(true); 223 | LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); 224 | 225 | return 0; 226 | } 227 | 228 | /** 
229 | * Dispatches command-line arguments to the tool via the ToolRunner. 230 | */ 231 | public static void main(String[] args) throws Exception { 232 | LOG.info("Running " + BuildVByteDocVectors.class.getCanonicalName() + " with args " 233 | + Arrays.toString(args)); 234 | ToolRunner.run(new BuildVByteDocVectors(), args); 235 | } 236 | } 237 | -------------------------------------------------------------------------------- /src/main/java/org/clueweb/clueweb12/app/BuildWarcTrecIdMapping.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ClueWeb Tools: Hadoop tools for manipulating ClueWeb collections 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you 5 | * may not use this file except in compliance with the License. You may 6 | * obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing 14 | * permissions and limitations under the License. 15 | */ 16 | 17 | package org.clueweb.clueweb12.app; 18 | 19 | import java.io.BufferedReader; 20 | import java.io.File; 21 | import java.io.FileInputStream; 22 | import java.io.IOException; 23 | import java.io.InputStreamReader; 24 | import java.io.PrintStream; 25 | import java.util.concurrent.ExecutorService; 26 | import java.util.concurrent.Executors; 27 | 28 | import org.apache.commons.cli.CommandLine; 29 | import org.apache.commons.cli.CommandLineParser; 30 | import org.apache.commons.cli.GnuParser; 31 | import org.apache.commons.cli.HelpFormatter; 32 | import org.apache.commons.cli.Option; 33 | import org.apache.commons.cli.OptionBuilder; 34 | import org.apache.commons.cli.Options; 35 | import org.apache.commons.cli.ParseException; 36 | import org.apache.log4j.Logger; 37 | import org.apache.lucene.analysis.Analyzer; 38 | import org.apache.lucene.analysis.standard.StandardAnalyzer; 39 | import org.apache.lucene.document.Document; 40 | import org.apache.lucene.document.Field; 41 | import org.apache.lucene.document.FieldType; 42 | import org.apache.lucene.index.FieldInfo.IndexOptions; 43 | import org.apache.lucene.index.IndexWriter; 44 | import org.apache.lucene.index.IndexWriterConfig; 45 | import org.apache.lucene.index.IndexWriterConfig.OpenMode; 46 | import org.apache.lucene.store.Directory; 47 | import org.apache.lucene.store.FSDirectory; 48 | import org.apache.lucene.util.Version; 49 | import org.apache.tools.bzip2.CBZip2InputStream; 50 | import org.clueweb.data.WarcTrecIdMapping; 51 | 52 | public class BuildWarcTrecIdMapping { 53 | private static final Logger LOG = Logger.getLogger(BuildWarcTrecIdMapping.class); 54 | 55 | public static final Analyzer ANALYZER = new StandardAnalyzer(Version.LUCENE_43); 56 | 57 | static final FieldType FIELD_OPTIONS = new FieldType(); 58 | 59 | static { 60 | FIELD_OPTIONS.setIndexed(true); 61 | FIELD_OPTIONS.setIndexOptions(IndexOptions.DOCS_ONLY); 62 | FIELD_OPTIONS.setStored(true); 63 | FIELD_OPTIONS.setTokenized(false); 64 | } 65 | 66 | private static final int DEFAULT_NUM_THREADS = 4; 67 | 68 | private static final String INPUT_OPTION = "input"; 69 | private static final String INDEX_OPTION = "index"; 70 | private static final String MAX_OPTION = "max"; 71 | private static final String OPTIMIZE_OPTION = "optimize"; 72 
| private static final String THREADS_OPTION = "threads"; 73 | 74 | @SuppressWarnings("static-access") 75 | public static void main(String[] args) throws Exception { 76 | Options options = new Options(); 77 | options.addOption(OptionBuilder.withArgName("path").hasArg() 78 | .withDescription("bz2-compressed WARC-TREC-ID mapping data file").create(INPUT_OPTION)); 79 | options.addOption(OptionBuilder.withArgName("dir").hasArg() 80 | .withDescription("index location").create(INDEX_OPTION)); 81 | options.addOption(OptionBuilder.withArgName("num").hasArg() 82 | .withDescription("maximum number of documents to index").create(MAX_OPTION)); 83 | options.addOption(OptionBuilder.withArgName("num").hasArg() 84 | .withDescription("number of indexing threads").create(THREADS_OPTION)); 85 | 86 | options.addOption(new Option(OPTIMIZE_OPTION, "merge indexes into a single segment")); 87 | 88 | CommandLine cmdline = null; 89 | CommandLineParser parser = new GnuParser(); 90 | try { 91 | cmdline = parser.parse(options, args); 92 | } catch (ParseException exp) { 93 | System.err.println("Error parsing command line: " + exp.getMessage()); 94 | System.exit(-1); 95 | } 96 | 97 | if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(INDEX_OPTION)) { 98 | HelpFormatter formatter = new HelpFormatter(); 99 | formatter.printHelp(BuildWarcTrecIdMapping.class.getCanonicalName(), options); 100 | System.exit(-1); 101 | } 102 | 103 | String indexPath = cmdline.getOptionValue(INDEX_OPTION); 104 | int maxdocs = cmdline.hasOption(MAX_OPTION) ? 105 | Integer.parseInt(cmdline.getOptionValue(MAX_OPTION)) : Integer.MAX_VALUE; 106 | int threads = cmdline.hasOption(THREADS_OPTION) ? 107 | Integer.parseInt(cmdline.getOptionValue(THREADS_OPTION)) : DEFAULT_NUM_THREADS; 108 | 109 | long startTime = System.currentTimeMillis(); 110 | 111 | String path = cmdline.getOptionValue(INPUT_OPTION); 112 | PrintStream out = new PrintStream(System.out, true, "UTF-8"); 113 | 114 | Directory dir = FSDirectory.open(new File(indexPath)); 115 | IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_43, ANALYZER); 116 | config.setOpenMode(OpenMode.CREATE); 117 | 118 | IndexWriter writer = new IndexWriter(dir, config); 119 | LOG.info("Creating index at " + indexPath); 120 | LOG.info("Indexing with " + threads + " threads"); 121 | 122 | FileInputStream fis = null; 123 | BufferedReader br = null; 124 | 125 | try { 126 | fis = new FileInputStream(new File(path)); 127 | byte[] ignoreBytes = new byte[2]; 128 | fis.read(ignoreBytes); // Skip the "BZ" magic bytes, which CBZip2InputStream does not expect 129 | br = new BufferedReader(new InputStreamReader(new CBZip2InputStream(fis), "UTF8")); 130 | 131 | ExecutorService executor = Executors.newFixedThreadPool(threads); 132 | int cnt = 0; 133 | String s; 134 | while ((s = br.readLine()) != null) { 135 | Runnable worker = new AddDocumentRunnable(writer, s); 136 | executor.execute(worker); 137 | 138 | cnt++; 139 | if (cnt % 1000000 == 0) { 140 | LOG.info(cnt + " records added"); 141 | } 142 | if (cnt >= maxdocs) { 143 | break; 144 | } 145 | } 146 | 147 | executor.shutdown(); 148 | // Wait until all threads are finished 149 | while (!executor.isTerminated()) {} 150 | 151 | LOG.info("Total of " + cnt + " records indexed."); 152 | 153 | if (cmdline.hasOption(OPTIMIZE_OPTION)) { 154 | LOG.info("Merging segments..."); 155 | writer.forceMerge(1); 156 | LOG.info("Done!"); 157 | } 158 | 159 | LOG.info("Total elapsed time: " + (System.currentTimeMillis() - startTime) + "ms"); 160 | } catch (Exception e) { 161 | e.printStackTrace(); 162 | } finally { 163 |
writer.close(); 164 | dir.close(); 165 | out.close(); 166 | br.close(); 167 | fis.close(); 168 | } 169 | } 170 | 171 | private static class AddDocumentRunnable implements Runnable { 172 | private final IndexWriter writer; 173 | private final String s; 174 | 175 | AddDocumentRunnable(IndexWriter writer, String s) { 176 | this.writer = writer; 177 | this.s = s.split(",")[0]; 178 | } 179 | 180 | @Override 181 | public void run() { 182 | Document doc = new Document(); 183 | doc.add(new Field(WarcTrecIdMapping.IndexField.WARC_TREC_ID.name, s, FIELD_OPTIONS)); 184 | 185 | try { 186 | writer.addDocument(doc); 187 | } catch (IOException e) { 188 | e.printStackTrace(); 189 | } 190 | } 191 | } 192 | } 193 | -------------------------------------------------------------------------------- /src/main/java/org/clueweb/clueweb12/app/ComputeTermStatistics.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ClueWeb Tools: Hadoop tools for manipulating ClueWeb collections 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you 5 | * may not use this file except in compliance with the License. You may 6 | * obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing 14 | * permissions and limitations under the License. 15 | */ 16 | 17 | package org.clueweb.clueweb12.app; 18 | 19 | import java.io.IOException; 20 | import java.util.Arrays; 21 | import java.util.Map; 22 | 23 | import org.apache.commons.cli.CommandLine; 24 | import org.apache.commons.cli.CommandLineParser; 25 | import org.apache.commons.cli.GnuParser; 26 | import org.apache.commons.cli.HelpFormatter; 27 | import org.apache.commons.cli.OptionBuilder; 28 | import org.apache.commons.cli.Options; 29 | import org.apache.commons.cli.ParseException; 30 | import org.apache.hadoop.conf.Configured; 31 | import org.apache.hadoop.fs.FileSystem; 32 | import org.apache.hadoop.fs.Path; 33 | import org.apache.hadoop.io.LongWritable; 34 | import org.apache.hadoop.io.Text; 35 | import org.apache.hadoop.mapreduce.Job; 36 | import org.apache.hadoop.mapreduce.Mapper; 37 | import org.apache.hadoop.mapreduce.Reducer; 38 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 39 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 40 | import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; 41 | import org.apache.hadoop.util.Tool; 42 | import org.apache.hadoop.util.ToolRunner; 43 | import org.apache.log4j.Logger; 44 | import org.apache.lucene.analysis.Analyzer; 45 | import org.clueweb.clueweb12.ClueWeb12WarcRecord; 46 | import org.clueweb.clueweb12.mapreduce.ClueWeb12InputFormat; 47 | import org.clueweb.util.AnalyzerFactory; 48 | import org.jsoup.Jsoup; 49 | 50 | import tl.lin.data.pair.PairOfIntLong; 51 | import tl.lin.lucene.AnalyzerUtils; 52 | 53 | import com.google.common.collect.Maps; 54 | 55 | public class ComputeTermStatistics extends Configured implements Tool { 56 | private static final Logger LOG = Logger.getLogger(ComputeTermStatistics.class); 57 | 58 | private static enum Records { 59 | TOTAL, PAGES, ERRORS, SKIPPED 60 | }; 61 | 62 | private static Analyzer ANALYZER; 63 | 64 | private static final String HADOOP_DF_MIN_OPTION = "df.min"; 
65 | private static final String HADOOP_DF_MAX_OPTION = "df.max"; 66 | 67 | private static final int MAX_TOKEN_LENGTH = 64; // Throw away tokens longer than this. 68 | private static final int MIN_DF_DEFAULT = 100; // Throw away terms with df less than this. 69 | private static final int MAX_DOC_LENGTH = 512 * 1024; // Skip document if longer than this. 70 | 71 | private static class MyMapper extends 72 | Mapper<LongWritable, ClueWeb12WarcRecord, Text, PairOfIntLong> { 73 | private static final Text term = new Text(); 74 | private static final PairOfIntLong pair = new PairOfIntLong(); 75 | 76 | @Override 77 | public void setup(Context context) throws IOException { 78 | 79 | String analyzerType = context.getConfiguration().get(PREPROCESSING); 80 | ANALYZER = AnalyzerFactory.getAnalyzer(analyzerType); 81 | if (ANALYZER == null) { 82 | LOG.error("Error: preprocessing type not recognized. Abort " + this.getClass().getName()); 83 | System.exit(1); 84 | } 85 | } 86 | 87 | @Override 88 | public void map(LongWritable key, ClueWeb12WarcRecord doc, Context context) throws IOException, 89 | InterruptedException { 90 | 91 | context.getCounter(Records.TOTAL).increment(1); 92 | 93 | String docid = doc.getHeaderMetadataItem("WARC-TREC-ID"); 94 | if (docid != null) { 95 | context.getCounter(Records.PAGES).increment(1); 96 | try { 97 | String content = doc.getContent(); 98 | 99 | // If the document is excessively long, it usually means that something is wrong (e.g., a 100 | // binary object). Skip so the parsing doesn't choke. 101 | // As an alternative, we might want to consider putting in a timeout, e.g., 102 | // http://stackoverflow.com/questions/2275443/how-to-timeout-a-thread 103 | if (content.length() > MAX_DOC_LENGTH) { 104 | LOG.info("Skipping " + docid + " due to excessive length: " + content.length()); 105 | context.getCounter(Records.SKIPPED).increment(1); 106 | return; 107 | } 108 | 109 | String cleaned = Jsoup.parse(content).text(); 110 | Map<String, Integer> map = Maps.newHashMap(); 111 | for (String term : AnalyzerUtils.parse(ANALYZER, cleaned)) { 112 | if (term.length() > MAX_TOKEN_LENGTH) { 113 | continue; 114 | } 115 | 116 | if (map.containsKey(term)) { 117 | map.put(term, map.get(term) + 1); 118 | } else { 119 | map.put(term, 1); 120 | } 121 | } 122 | 123 | for (Map.Entry<String, Integer> entry : map.entrySet()) { 124 | term.set(entry.getKey()); 125 | pair.set(1, entry.getValue()); 126 | context.write(term, pair); 127 | } 128 | } catch (Exception e) { 129 | // If Jsoup throws any exceptions, catch and move on.
130 | LOG.info("Error caught processing " + docid); 131 | context.getCounter(Records.ERRORS).increment(1); 132 | } 133 | } 134 | } 135 | } 136 | 137 | private static class MyCombiner extends Reducer { 138 | private static final PairOfIntLong output = new PairOfIntLong(); 139 | 140 | @Override 141 | public void reduce(Text key, Iterable values, Context context) 142 | throws IOException, InterruptedException { 143 | int df = 0; 144 | long cf = 0; 145 | for (PairOfIntLong pair : values) { 146 | df += pair.getLeftElement(); 147 | cf += pair.getRightElement(); 148 | } 149 | 150 | output.set(df, cf); 151 | context.write(key, output); 152 | } 153 | } 154 | 155 | private static class MyReducer extends Reducer { 156 | private static final PairOfIntLong output = new PairOfIntLong(); 157 | private int dfMin, dfMax; 158 | 159 | @Override 160 | public void setup(Reducer.Context context) { 161 | dfMin = context.getConfiguration().getInt(HADOOP_DF_MIN_OPTION, MIN_DF_DEFAULT); 162 | dfMax = context.getConfiguration().getInt(HADOOP_DF_MAX_OPTION, Integer.MAX_VALUE); 163 | LOG.info("dfMin = " + dfMin); 164 | } 165 | 166 | @Override 167 | public void reduce(Text key, Iterable values, Context context) 168 | throws IOException, InterruptedException { 169 | int df = 0; 170 | long cf = 0; 171 | for (PairOfIntLong pair : values) { 172 | df += pair.getLeftElement(); 173 | cf += pair.getRightElement(); 174 | } 175 | if (df < dfMin || df > dfMax) { 176 | return; 177 | } 178 | output.set(df, cf); 179 | context.write(key, output); 180 | } 181 | } 182 | 183 | public static final String INPUT_OPTION = "input"; 184 | public static final String OUTPUT_OPTION = "output"; 185 | public static final String DF_MIN_OPTION = "dfMin"; 186 | public static final String PREPROCESSING = "preprocessing"; 187 | 188 | /** 189 | * Runs this tool. 
190 | */ 191 | @SuppressWarnings("static-access") 192 | public int run(String[] args) throws Exception { 193 | Options options = new Options(); 194 | 195 | options.addOption(OptionBuilder.withArgName("path").hasArg() 196 | .withDescription("input path").create(INPUT_OPTION)); 197 | options.addOption(OptionBuilder.withArgName("path").hasArg() 198 | .withDescription("output path").create(OUTPUT_OPTION)); 199 | options.addOption(OptionBuilder.withArgName("num").hasArg() 200 | .withDescription("minimum df").create(DF_MIN_OPTION)); 201 | options.addOption(OptionBuilder.withArgName("string " + AnalyzerFactory.getOptions()).hasArg() 202 | .withDescription("preprocessing").create(PREPROCESSING)); 203 | 204 | CommandLine cmdline; 205 | CommandLineParser parser = new GnuParser(); 206 | try { 207 | cmdline = parser.parse(options, args); 208 | } catch (ParseException exp) { 209 | HelpFormatter formatter = new HelpFormatter(); 210 | formatter.printHelp(this.getClass().getName(), options); 211 | ToolRunner.printGenericCommandUsage(System.out); 212 | System.err.println("Error parsing command line: " + exp.getMessage()); 213 | return -1; 214 | } 215 | 216 | if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION) 217 | || !cmdline.hasOption(PREPROCESSING)) { 218 | HelpFormatter formatter = new HelpFormatter(); 219 | formatter.printHelp(this.getClass().getName(), options); 220 | ToolRunner.printGenericCommandUsage(System.out); 221 | return -1; 222 | } 223 | 224 | String input = cmdline.getOptionValue(INPUT_OPTION); 225 | String output = cmdline.getOptionValue(OUTPUT_OPTION); 226 | String preprocessing = cmdline.getOptionValue(PREPROCESSING); 227 | 228 | LOG.info("Tool name: " + ComputeTermStatistics.class.getSimpleName()); 229 | LOG.info(" - input: " + input); 230 | LOG.info(" - output: " + output); 231 | LOG.info(" - preprocessing: " + preprocessing); 232 | 233 | getConf().set(PREPROCESSING, preprocessing); 234 | 235 | Job job = new Job(getConf(), ComputeTermStatistics.class.getSimpleName() + ":" + input); 236 | job.setJarByClass(ComputeTermStatistics.class); 237 | 238 | job.setNumReduceTasks(100); 239 | 240 | if (cmdline.hasOption(DF_MIN_OPTION)) { 241 | int dfMin = Integer.parseInt(cmdline.getOptionValue(DF_MIN_OPTION)); 242 | LOG.info(" - dfMin: " + dfMin); 243 | job.getConfiguration().setInt(HADOOP_DF_MIN_OPTION, dfMin); 244 | } 245 | 246 | FileInputFormat.setInputPaths(job, input); 247 | FileOutputFormat.setOutputPath(job, new Path(output)); 248 | 249 | job.setInputFormatClass(ClueWeb12InputFormat.class); 250 | job.setOutputFormatClass(SequenceFileOutputFormat.class); 251 | 252 | job.setMapOutputKeyClass(Text.class); 253 | job.setMapOutputValueClass(PairOfIntLong.class); 254 | job.setOutputKeyClass(Text.class); 255 | job.setOutputValueClass(PairOfIntLong.class); 256 | 257 | job.setMapperClass(MyMapper.class); 258 | job.setCombinerClass(MyCombiner.class); 259 | job.setReducerClass(MyReducer.class); 260 | 261 | FileSystem.get(getConf()).delete(new Path(output), true); 262 | 263 | long startTime = System.currentTimeMillis(); 264 | job.waitForCompletion(true); 265 | LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); 266 | 267 | return 0; 268 | } 269 | 270 | /** 271 | * Dispatches command-line arguments to the tool via the ToolRunner. 
272 | */ 273 | public static void main(String[] args) throws Exception { 274 | LOG.info("Running " + ComputeTermStatistics.class.getCanonicalName() + " with args " 275 | + Arrays.toString(args)); 276 | ToolRunner.run(new ComputeTermStatistics(), args); 277 | } 278 | } 279 | -------------------------------------------------------------------------------- /src/main/java/org/clueweb/clueweb12/app/CountWarcRecordsNew.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ClueWeb Tools: Hadoop tools for manipulating ClueWeb collections 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you 5 | * may not use this file except in compliance with the License. You may 6 | * obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing 14 | * permissions and limitations under the License. 15 | */ 16 | 17 | package org.clueweb.clueweb12.app; 18 | 19 | import java.io.IOException; 20 | import java.util.Arrays; 21 | 22 | import org.apache.commons.cli.CommandLine; 23 | import org.apache.commons.cli.CommandLineParser; 24 | import org.apache.commons.cli.GnuParser; 25 | import org.apache.commons.cli.HelpFormatter; 26 | import org.apache.commons.cli.OptionBuilder; 27 | import org.apache.commons.cli.Options; 28 | import org.apache.commons.cli.ParseException; 29 | import org.apache.hadoop.conf.Configured; 30 | import org.apache.hadoop.io.LongWritable; 31 | import org.apache.hadoop.io.NullWritable; 32 | import org.apache.hadoop.mapreduce.Counters; 33 | import org.apache.hadoop.mapreduce.Job; 34 | import org.apache.hadoop.mapreduce.Mapper; 35 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 36 | import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat; 37 | import org.apache.hadoop.util.Tool; 38 | import org.apache.hadoop.util.ToolRunner; 39 | import org.apache.log4j.Logger; 40 | import org.clueweb.clueweb12.ClueWeb12WarcRecord; 41 | import org.clueweb.clueweb12.mapreduce.ClueWeb12InputFormat; 42 | 43 | public class CountWarcRecordsNew extends Configured implements Tool { 44 | private static final Logger LOG = Logger.getLogger(CountWarcRecordsNew.class); 45 | 46 | private static enum Records { TOTAL, PAGES }; 47 | 48 | private static class MyMapper 49 | extends Mapper { 50 | @Override 51 | public void map(LongWritable key, ClueWeb12WarcRecord doc, Context context) 52 | throws IOException, InterruptedException { 53 | context.getCounter(Records.TOTAL).increment(1); 54 | 55 | String docid = doc.getHeaderMetadataItem("WARC-TREC-ID"); 56 | if (docid != null) { 57 | context.getCounter(Records.PAGES).increment(1); 58 | } 59 | } 60 | } 61 | 62 | public CountWarcRecordsNew() {} 63 | 64 | public static final String INPUT_OPTION = "input"; 65 | 66 | /** 67 | * Runs this tool. 
68 | */ 69 | @SuppressWarnings("static-access") 70 | public int run(String[] args) throws Exception { 71 | Options options = new Options(); 72 | 73 | options.addOption(OptionBuilder.withArgName("path").hasArg() 74 | .withDescription("input path").create(INPUT_OPTION)); 75 | 76 | CommandLine cmdline; 77 | CommandLineParser parser = new GnuParser(); 78 | try { 79 | cmdline = parser.parse(options, args); 80 | } catch (ParseException exp) { 81 | HelpFormatter formatter = new HelpFormatter(); 82 | formatter.printHelp(this.getClass().getName(), options); 83 | ToolRunner.printGenericCommandUsage(System.out); 84 | System.err.println("Error parsing command line: " + exp.getMessage()); 85 | return -1; 86 | } 87 | 88 | if (!cmdline.hasOption(INPUT_OPTION)) { 89 | HelpFormatter formatter = new HelpFormatter(); 90 | formatter.printHelp(this.getClass().getName(), options); 91 | ToolRunner.printGenericCommandUsage(System.out); 92 | return -1; 93 | } 94 | 95 | String input = cmdline.getOptionValue(INPUT_OPTION); 96 | 97 | LOG.info("Tool name: " + CountWarcRecordsNew.class.getSimpleName()); 98 | LOG.info(" - input: " + input); 99 | 100 | Job job = new Job(getConf(), CountWarcRecordsNew.class.getSimpleName() + ":" + input); 101 | job.setJarByClass(CountWarcRecordsNew.class); 102 | job.setNumReduceTasks(0); 103 | 104 | FileInputFormat.addInputPaths(job, input); 105 | 106 | job.setInputFormatClass(ClueWeb12InputFormat.class); 107 | job.setOutputFormatClass(NullOutputFormat.class); 108 | job.setMapperClass(MyMapper.class); 109 | 110 | job.waitForCompletion(true); 111 | 112 | Counters counters = job.getCounters(); 113 | int numDocs = (int) counters.findCounter(Records.PAGES).getValue(); 114 | LOG.info("Read " + numDocs + " docs."); 115 | 116 | return 0; 117 | } 118 | 119 | /** 120 | * Dispatches command-line arguments to the tool via the ToolRunner. 121 | */ 122 | public static void main(String[] args) throws Exception { 123 | LOG.info("Running " + CountWarcRecordsNew.class.getCanonicalName() + " with args " 124 | + Arrays.toString(args)); 125 | ToolRunner.run(new CountWarcRecordsNew(), args); 126 | } 127 | } 128 | -------------------------------------------------------------------------------- /src/main/java/org/clueweb/clueweb12/app/CountWarcRecordsOld.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ClueWeb Tools: Hadoop tools for manipulating ClueWeb collections 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you 5 | * may not use this file except in compliance with the License. You may 6 | * obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing 14 | * permissions and limitations under the License. 
15 | */ 16 | 17 | package org.clueweb.clueweb12.app; 18 | 19 | import java.io.IOException; 20 | import java.util.Arrays; 21 | 22 | import org.apache.commons.cli.CommandLine; 23 | import org.apache.commons.cli.CommandLineParser; 24 | import org.apache.commons.cli.GnuParser; 25 | import org.apache.commons.cli.HelpFormatter; 26 | import org.apache.commons.cli.OptionBuilder; 27 | import org.apache.commons.cli.Options; 28 | import org.apache.commons.cli.ParseException; 29 | import org.apache.hadoop.conf.Configured; 30 | import org.apache.hadoop.io.NullWritable; 31 | import org.apache.hadoop.io.Writable; 32 | import org.apache.hadoop.mapred.Counters; 33 | import org.apache.hadoop.mapred.FileInputFormat; 34 | import org.apache.hadoop.mapred.JobClient; 35 | import org.apache.hadoop.mapred.JobConf; 36 | import org.apache.hadoop.mapred.MapReduceBase; 37 | import org.apache.hadoop.mapred.Mapper; 38 | import org.apache.hadoop.mapred.OutputCollector; 39 | import org.apache.hadoop.mapred.Reporter; 40 | import org.apache.hadoop.mapred.RunningJob; 41 | import org.apache.hadoop.mapred.lib.NullOutputFormat; 42 | import org.apache.hadoop.util.Tool; 43 | import org.apache.hadoop.util.ToolRunner; 44 | import org.apache.log4j.Logger; 45 | import org.clueweb.clueweb12.ClueWeb12WarcRecord; 46 | import org.clueweb.clueweb12.mapred.ClueWeb12InputFormat; 47 | 48 | public class CountWarcRecordsOld extends Configured implements Tool { 49 | private static final Logger LOG = Logger.getLogger(CountWarcRecordsOld.class); 50 | 51 | private static enum Records { TOTAL, PAGES }; 52 | 53 | private static class MyMapper extends MapReduceBase implements 54 | Mapper { 55 | 56 | public void configure(JobConf job) {} 57 | 58 | public void map(Writable key, ClueWeb12WarcRecord doc, 59 | OutputCollector output, Reporter reporter) throws IOException { 60 | reporter.incrCounter(Records.TOTAL, 1); 61 | 62 | String docid = doc.getHeaderMetadataItem("WARC-TREC-ID"); 63 | if (docid != null) { 64 | reporter.incrCounter(Records.PAGES, 1); 65 | } 66 | } 67 | } 68 | 69 | public CountWarcRecordsOld() { 70 | } 71 | 72 | public static final String INPUT_OPTION = "input"; 73 | 74 | /** 75 | * Runs this tool. 
76 | */ 77 | @SuppressWarnings("static-access") 78 | public int run(String[] args) throws Exception { 79 | Options options = new Options(); 80 | 81 | options.addOption(OptionBuilder.withArgName("path").hasArg() 82 | .withDescription("input path").create(INPUT_OPTION)); 83 | 84 | CommandLine cmdline; 85 | CommandLineParser parser = new GnuParser(); 86 | try { 87 | cmdline = parser.parse(options, args); 88 | } catch (ParseException exp) { 89 | HelpFormatter formatter = new HelpFormatter(); 90 | formatter.printHelp(this.getClass().getName(), options); 91 | ToolRunner.printGenericCommandUsage(System.out); 92 | System.err.println("Error parsing command line: " + exp.getMessage()); 93 | return -1; 94 | } 95 | 96 | if (!cmdline.hasOption(INPUT_OPTION)) { 97 | HelpFormatter formatter = new HelpFormatter(); 98 | formatter.printHelp(this.getClass().getName(), options); 99 | ToolRunner.printGenericCommandUsage(System.out); 100 | return -1; 101 | } 102 | 103 | String input = cmdline.getOptionValue(INPUT_OPTION); 104 | 105 | LOG.info("Tool name: " + CountWarcRecordsOld.class.getSimpleName()); 106 | LOG.info(" - input: " + input); 107 | 108 | JobConf conf = new JobConf(getConf(), CountWarcRecordsOld.class); 109 | conf.setJobName(CountWarcRecordsOld.class.getSimpleName() + ":" + input); 110 | 111 | conf.setNumReduceTasks(0); 112 | 113 | FileInputFormat.addInputPaths(conf, input); 114 | 115 | conf.setInputFormat(ClueWeb12InputFormat.class); 116 | conf.setOutputFormat(NullOutputFormat.class); 117 | conf.setMapperClass(MyMapper.class); 118 | 119 | RunningJob job = JobClient.runJob(conf); 120 | Counters counters = job.getCounters(); 121 | int numDocs = (int) counters.findCounter(Records.PAGES).getCounter(); 122 | 123 | LOG.info("Read " + numDocs + " docs."); 124 | 125 | return 0; 126 | } 127 | 128 | /** 129 | * Dispatches command-line arguments to the tool via the ToolRunner. 130 | */ 131 | public static void main(String[] args) throws Exception { 132 | LOG.info("Running " + CountWarcRecordsOld.class.getCanonicalName() + " with args " 133 | + Arrays.toString(args)); 134 | ToolRunner.run(new CountWarcRecordsOld(), args); 135 | } 136 | } 137 | -------------------------------------------------------------------------------- /src/main/java/org/clueweb/clueweb12/app/DumpWarcRecordsToPlainText.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ClueWeb Tools: Hadoop tools for manipulating ClueWeb collections 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you 5 | * may not use this file except in compliance with the License. You may 6 | * obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing 14 | * permissions and limitations under the License. 
15 | */ 16 | 17 | package org.clueweb.clueweb12.app; 18 | 19 | import java.io.IOException; 20 | import java.util.Arrays; 21 | 22 | import org.apache.commons.cli.CommandLine; 23 | import org.apache.commons.cli.CommandLineParser; 24 | import org.apache.commons.cli.GnuParser; 25 | import org.apache.commons.cli.HelpFormatter; 26 | import org.apache.commons.cli.OptionBuilder; 27 | import org.apache.commons.cli.Options; 28 | import org.apache.commons.cli.ParseException; 29 | import org.apache.hadoop.conf.Configured; 30 | import org.apache.hadoop.fs.Path; 31 | import org.apache.hadoop.io.Text; 32 | import org.apache.hadoop.io.Writable; 33 | import org.apache.hadoop.mapred.Counters; 34 | import org.apache.hadoop.mapred.FileInputFormat; 35 | import org.apache.hadoop.mapred.FileOutputFormat; 36 | import org.apache.hadoop.mapred.JobClient; 37 | import org.apache.hadoop.mapred.JobConf; 38 | import org.apache.hadoop.mapred.MapReduceBase; 39 | import org.apache.hadoop.mapred.Mapper; 40 | import org.apache.hadoop.mapred.OutputCollector; 41 | import org.apache.hadoop.mapred.Reporter; 42 | import org.apache.hadoop.mapred.RunningJob; 43 | import org.apache.hadoop.mapred.TextOutputFormat; 44 | import org.apache.hadoop.util.Tool; 45 | import org.apache.hadoop.util.ToolRunner; 46 | import org.apache.log4j.Logger; 47 | import org.apache.lucene.analysis.Analyzer; 48 | import org.apache.lucene.analysis.standard.StandardAnalyzer; 49 | import org.apache.lucene.util.Version; 50 | import org.clueweb.clueweb12.ClueWeb12WarcRecord; 51 | import org.clueweb.clueweb12.mapred.ClueWeb12InputFormat; 52 | import org.jsoup.Jsoup; 53 | 54 | import tl.lin.lucene.AnalyzerUtils; 55 | 56 | import com.google.common.base.Joiner; 57 | 58 | public class DumpWarcRecordsToPlainText extends Configured implements Tool { 59 | private static final Logger LOG = Logger.getLogger(DumpWarcRecordsToPlainText.class); 60 | 61 | private static enum Records { TOTAL, PAGES, ERRORS }; 62 | private static final Analyzer ANALYZER = new StandardAnalyzer(Version.LUCENE_43); 63 | private static final Joiner JOINER = Joiner.on("|"); 64 | 65 | private static class MyMapper extends MapReduceBase implements 66 | Mapper { 67 | private static final Text KEY = new Text(); 68 | private static final Text VALUE = new Text(); 69 | 70 | public void configure(JobConf job) {} 71 | 72 | public void map(Writable key, ClueWeb12WarcRecord doc, OutputCollector output, 73 | Reporter reporter) throws IOException { 74 | reporter.incrCounter(Records.TOTAL, 1); 75 | 76 | String docid = doc.getHeaderMetadataItem("WARC-TREC-ID"); 77 | if (docid != null) { 78 | reporter.incrCounter(Records.PAGES, 1); 79 | try { 80 | KEY.set(docid); 81 | String cleaned = Jsoup.parse(doc.getContent()).text().replaceAll("[\\r\\n]+", " "); 82 | cleaned = JOINER.join(AnalyzerUtils.parse(ANALYZER, cleaned)); 83 | VALUE.set(cleaned); 84 | output.collect(KEY, VALUE); 85 | } catch (Exception e) { 86 | // If Jsoup throws any exceptions, catch and move on. 87 | reporter.incrCounter(Records.ERRORS, 1); 88 | } 89 | } 90 | } 91 | } 92 | 93 | public DumpWarcRecordsToPlainText() {} 94 | 95 | public static final String INPUT_OPTION = "input"; 96 | public static final String OUTPUT_OPTION = "output"; 97 | 98 | /** 99 | * Runs this tool. 
100 | */ 101 | @SuppressWarnings("static-access") 102 | public int run(String[] args) throws Exception { 103 | Options options = new Options(); 104 | 105 | options.addOption(OptionBuilder.withArgName("path").hasArg() 106 | .withDescription("input path").create(INPUT_OPTION)); 107 | options.addOption(OptionBuilder.withArgName("path").hasArg() 108 | .withDescription("output path").create(OUTPUT_OPTION)); 109 | 110 | CommandLine cmdline; 111 | CommandLineParser parser = new GnuParser(); 112 | try { 113 | cmdline = parser.parse(options, args); 114 | } catch (ParseException exp) { 115 | HelpFormatter formatter = new HelpFormatter(); 116 | formatter.printHelp(this.getClass().getName(), options); 117 | ToolRunner.printGenericCommandUsage(System.out); 118 | System.err.println("Error parsing command line: " + exp.getMessage()); 119 | return -1; 120 | } 121 | 122 | if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION)) { 123 | HelpFormatter formatter = new HelpFormatter(); 124 | formatter.printHelp(this.getClass().getName(), options); 125 | ToolRunner.printGenericCommandUsage(System.out); 126 | return -1; 127 | } 128 | 129 | String input = cmdline.getOptionValue(INPUT_OPTION); 130 | String output = cmdline.getOptionValue(OUTPUT_OPTION); 131 | 132 | LOG.info("Tool name: " + DumpWarcRecordsToPlainText.class.getSimpleName()); 133 | LOG.info(" - input: " + input); 134 | LOG.info(" - output: " + output); 135 | 136 | JobConf conf = new JobConf(getConf(), DumpWarcRecordsToPlainText.class); 137 | conf.setJobName(DumpWarcRecordsToPlainText.class.getSimpleName() + ":" + input); 138 | 139 | conf.setNumReduceTasks(0); 140 | 141 | FileInputFormat.addInputPaths(conf, input); 142 | FileOutputFormat.setOutputPath(conf, new Path(output)); 143 | 144 | conf.setInputFormat(ClueWeb12InputFormat.class); 145 | conf.setOutputFormat(TextOutputFormat.class); 146 | conf.setMapperClass(MyMapper.class); 147 | 148 | RunningJob job = JobClient.runJob(conf); 149 | Counters counters = job.getCounters(); 150 | int numDocs = (int) counters.findCounter(Records.PAGES).getCounter(); 151 | 152 | LOG.info("Read " + numDocs + " docs."); 153 | 154 | return 0; 155 | } 156 | 157 | /** 158 | * Dispatches command-line arguments to the tool via the ToolRunner. 159 | */ 160 | public static void main(String[] args) throws Exception { 161 | LOG.info("Running " + DumpWarcRecordsToPlainText.class.getCanonicalName() + " with args " 162 | + Arrays.toString(args)); 163 | ToolRunner.run(new DumpWarcRecordsToPlainText(), args); 164 | } 165 | } -------------------------------------------------------------------------------- /src/main/java/org/clueweb/clueweb12/app/DumpWarcRecordsToTermIds.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ClueWeb Tools: Hadoop tools for manipulating ClueWeb collections 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you 5 | * may not use this file except in compliance with the License. You may 6 | * obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing 14 | * permissions and limitations under the License. 
15 | */ 16 | 17 | package org.clueweb.clueweb12.app; 18 | 19 | import java.io.IOException; 20 | import java.util.Arrays; 21 | import java.util.List; 22 | 23 | import org.apache.commons.cli.CommandLine; 24 | import org.apache.commons.cli.CommandLineParser; 25 | import org.apache.commons.cli.GnuParser; 26 | import org.apache.commons.cli.HelpFormatter; 27 | import org.apache.commons.cli.OptionBuilder; 28 | import org.apache.commons.cli.Options; 29 | import org.apache.commons.cli.ParseException; 30 | import org.apache.hadoop.conf.Configured; 31 | import org.apache.hadoop.fs.FileSystem; 32 | import org.apache.hadoop.fs.Path; 33 | import org.apache.hadoop.io.LongWritable; 34 | import org.apache.hadoop.io.Text; 35 | import org.apache.hadoop.mapreduce.Job; 36 | import org.apache.hadoop.mapreduce.Mapper; 37 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 38 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 39 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 40 | import org.apache.hadoop.util.Tool; 41 | import org.apache.hadoop.util.ToolRunner; 42 | import org.apache.log4j.Logger; 43 | import org.apache.lucene.analysis.Analyzer; 44 | import org.apache.lucene.analysis.standard.StandardAnalyzer; 45 | import org.apache.lucene.util.Version; 46 | import org.clueweb.clueweb12.ClueWeb12WarcRecord; 47 | import org.clueweb.clueweb12.mapreduce.ClueWeb12InputFormat; 48 | import org.clueweb.dictionary.DefaultFrequencySortedDictionary; 49 | import org.jsoup.Jsoup; 50 | 51 | import tl.lin.lucene.AnalyzerUtils; 52 | 53 | public class DumpWarcRecordsToTermIds extends Configured implements Tool { 54 | private static final Logger LOG = Logger.getLogger(DumpWarcRecordsToTermIds.class); 55 | 56 | private static enum Records { TOTAL, PAGES, ERRORS, TOO_LONG }; 57 | 58 | private static final Analyzer ANALYZER = new StandardAnalyzer(Version.LUCENE_43); 59 | 60 | private static final int MAX_DOC_LENGTH = 512 * 1024; // Skip document if longer than this. 61 | 62 | private static class MyMapper extends Mapper<LongWritable, ClueWeb12WarcRecord, Text, Text> { 63 | private static final Text DOCID = new Text(); 64 | private static final Text DOC = new Text(); 65 | private static final Text EMPTY = new Text(); 66 | 67 | private DefaultFrequencySortedDictionary dictionary; 68 | 69 | @Override 70 | public void setup(Context context) throws IOException { 71 | FileSystem fs = FileSystem.get(context.getConfiguration()); 72 | String path = context.getConfiguration().get(DICTIONARY_OPTION); 73 | dictionary = new DefaultFrequencySortedDictionary(path, fs); 74 | } 75 | 76 | @Override 77 | public void map(LongWritable key, ClueWeb12WarcRecord doc, Context context) 78 | throws IOException, InterruptedException { 79 | 80 | context.getCounter(Records.TOTAL).increment(1); 81 | 82 | String docid = doc.getHeaderMetadataItem("WARC-TREC-ID"); 83 | if (docid != null) { 84 | DOCID.set(docid); 85 | 86 | context.getCounter(Records.PAGES).increment(1); 87 | try { 88 | String content = doc.getContent(); 89 | 90 | // If the document is excessively long, it usually means that something is wrong (e.g., a 91 | // binary object). Skip so the parsing doesn't choke.
92 | // As an alternative, we might want to consider putting in a timeout, e.g., 93 | // http://stackoverflow.com/questions/2275443/how-to-timeout-a-thread 94 | if ( content.length() > MAX_DOC_LENGTH ) { 95 | LOG.info("Skipping " + docid + " due to excessive length: " + content.length()); 96 | context.getCounter(Records.TOO_LONG).increment(1); 97 | context.write(DOCID, EMPTY); 98 | return; 99 | } 100 | 101 | String cleaned = Jsoup.parse(content).text(); 102 | List tokens = AnalyzerUtils.parse(ANALYZER, cleaned); 103 | 104 | int len = 0; 105 | int[] termids = new int[tokens.size()]; 106 | for (String token : tokens) { 107 | int id = dictionary.getId(token); 108 | if (id != -1) { 109 | termids[len] = id; 110 | len++; 111 | } 112 | } 113 | 114 | int[] copy = new int[len]; 115 | System.arraycopy(termids, 0, copy, 0, len); 116 | DOC.set(Arrays.toString(copy)); 117 | context.write(DOCID, DOC); 118 | } 119 | catch (Exception e) { 120 | // If Jsoup throws any exceptions, catch and move on, but emit empty doc. 121 | LOG.info("Error caught processing " + docid); 122 | context.getCounter(Records.ERRORS).increment(1); 123 | context.write(DOCID, EMPTY); 124 | } 125 | } 126 | } 127 | } 128 | 129 | public static final String INPUT_OPTION = "input"; 130 | public static final String OUTPUT_OPTION = "output"; 131 | public static final String DICTIONARY_OPTION = "dictionary"; 132 | public static final String REDUCERS_OPTION = "reducers"; 133 | 134 | /** 135 | * Runs this tool. 136 | */ 137 | @SuppressWarnings("static-access") 138 | public int run(String[] args) throws Exception { 139 | Options options = new Options(); 140 | 141 | options.addOption(OptionBuilder.withArgName("path").hasArg() 142 | .withDescription("input path").create(INPUT_OPTION)); 143 | options.addOption(OptionBuilder.withArgName("path").hasArg() 144 | .withDescription("output path").create(OUTPUT_OPTION)); 145 | options.addOption(OptionBuilder.withArgName("path").hasArg() 146 | .withDescription("dictionary").create(DICTIONARY_OPTION)); 147 | options.addOption(OptionBuilder.withArgName("num").hasArg() 148 | .withDescription("number of reducers").create(REDUCERS_OPTION)); 149 | 150 | CommandLine cmdline; 151 | CommandLineParser parser = new GnuParser(); 152 | try { 153 | cmdline = parser.parse(options, args); 154 | } catch (ParseException exp) { 155 | HelpFormatter formatter = new HelpFormatter(); 156 | formatter.printHelp(this.getClass().getName(), options); 157 | ToolRunner.printGenericCommandUsage(System.out); 158 | System.err.println("Error parsing command line: " + exp.getMessage()); 159 | return -1; 160 | } 161 | 162 | if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION) || 163 | !cmdline.hasOption(DICTIONARY_OPTION)) { 164 | HelpFormatter formatter = new HelpFormatter(); 165 | formatter.printHelp(this.getClass().getName(), options); 166 | ToolRunner.printGenericCommandUsage(System.out); 167 | return -1; 168 | } 169 | 170 | String input = cmdline.getOptionValue(INPUT_OPTION); 171 | String output = cmdline.getOptionValue(OUTPUT_OPTION); 172 | String dictionary = cmdline.getOptionValue(DICTIONARY_OPTION); 173 | 174 | Job job = new Job(getConf(), DumpWarcRecordsToTermIds.class.getSimpleName() + ":" + input); 175 | job.setJarByClass(DumpWarcRecordsToTermIds.class); 176 | 177 | LOG.info("Tool name: " + DumpWarcRecordsToTermIds.class.getSimpleName()); 178 | LOG.info(" - input: " + input); 179 | LOG.info(" - output: " + output); 180 | LOG.info(" - dictionary: " + dictionary); 181 | 182 | if 
(cmdline.hasOption(REDUCERS_OPTION)) { 183 | int numReducers = Integer.parseInt(cmdline.getOptionValue(REDUCERS_OPTION)); 184 | LOG.info(" - reducers: " + numReducers); 185 | job.setNumReduceTasks(numReducers); 186 | } else { 187 | job.setNumReduceTasks(0); 188 | } 189 | 190 | FileInputFormat.setInputPaths(job, input); 191 | FileOutputFormat.setOutputPath(job, new Path(output)); 192 | 193 | job.getConfiguration().set(DICTIONARY_OPTION, dictionary); 194 | 195 | job.setInputFormatClass(ClueWeb12InputFormat.class); 196 | job.setOutputFormatClass(TextOutputFormat.class); 197 | 198 | job.setMapOutputKeyClass(Text.class); 199 | job.setMapOutputValueClass(Text.class); 200 | job.setOutputKeyClass(Text.class); 201 | job.setOutputValueClass(Text.class); 202 | 203 | job.setMapperClass(MyMapper.class); 204 | 205 | FileSystem.get(getConf()).delete(new Path(output), true); 206 | 207 | long startTime = System.currentTimeMillis(); 208 | job.waitForCompletion(true); 209 | LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); 210 | 211 | return 0; 212 | } 213 | 214 | /** 215 | * Dispatches command-line arguments to the tool via the ToolRunner. 216 | */ 217 | public static void main(String[] args) throws Exception { 218 | LOG.info("Running " + DumpWarcRecordsToTermIds.class.getCanonicalName() + " with args " 219 | + Arrays.toString(args)); 220 | ToolRunner.run(new DumpWarcRecordsToTermIds(), args); 221 | } 222 | } 223 | -------------------------------------------------------------------------------- /src/main/java/org/clueweb/clueweb12/app/LookupWarcTrecIdMapping.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ClueWeb Tools: Hadoop tools for manipulating ClueWeb collections 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you 5 | * may not use this file except in compliance with the License. You may 6 | * obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing 14 | * permissions and limitations under the License. 
15 | */ 16 | 17 | package org.clueweb.clueweb12.app; 18 | 19 | import org.apache.commons.cli.CommandLine; 20 | import org.apache.commons.cli.CommandLineParser; 21 | import org.apache.commons.cli.GnuParser; 22 | import org.apache.commons.cli.HelpFormatter; 23 | import org.apache.commons.cli.OptionBuilder; 24 | import org.apache.commons.cli.Options; 25 | import org.apache.commons.cli.ParseException; 26 | import org.apache.hadoop.conf.Configured; 27 | import org.apache.hadoop.fs.Path; 28 | import org.apache.hadoop.util.Tool; 29 | import org.apache.hadoop.util.ToolRunner; 30 | import org.clueweb.data.WarcTrecIdMapping; 31 | 32 | public class LookupWarcTrecIdMapping extends Configured implements Tool { 33 | private static final String INDEX_OPTION = "index"; 34 | private static final String DOCID_OPTION = "docid"; 35 | private static final String DOCNO_OPTION = "docno"; 36 | 37 | @SuppressWarnings("static-access") 38 | public int run(String[] args) throws Exception { 39 | Options options = new Options(); 40 | options.addOption(OptionBuilder.withArgName("dir").hasArg() 41 | .withDescription("index location").create(INDEX_OPTION)); 42 | options.addOption(OptionBuilder.withArgName("id").hasArg() 43 | .withDescription("WARC-TREC-ID").create(DOCID_OPTION)); 44 | options.addOption(OptionBuilder.withArgName("num").hasArg() 45 | .withDescription("docno").create(DOCNO_OPTION)); 46 | 47 | CommandLine cmdline = null; 48 | CommandLineParser parser = new GnuParser(); 49 | try { 50 | cmdline = parser.parse(options, args); 51 | } catch (ParseException exp) { 52 | System.err.println("Error parsing command line: " + exp.getMessage()); 53 | System.exit(-1); 54 | } 55 | 56 | if (!cmdline.hasOption(INDEX_OPTION) || 57 | !(cmdline.hasOption(DOCID_OPTION) || cmdline.hasOption(DOCNO_OPTION))) { 58 | HelpFormatter formatter = new HelpFormatter(); 59 | formatter.printHelp(LookupWarcTrecIdMapping.class.getCanonicalName(), options); 60 | System.exit(-1); 61 | } 62 | 63 | String indexPath = cmdline.getOptionValue(INDEX_OPTION); 64 | 65 | WarcTrecIdMapping mapping = new WarcTrecIdMapping(new Path(indexPath), getConf()); 66 | if (cmdline.hasOption(DOCID_OPTION)) { 67 | System.out.println(mapping.getDocno(cmdline.getOptionValue(DOCID_OPTION))); 68 | } 69 | 70 | if (cmdline.hasOption(DOCNO_OPTION)) { 71 | System.out.println(mapping.getDocid(Integer.parseInt(cmdline.getOptionValue(DOCNO_OPTION)))); 72 | } 73 | 74 | return 0; 75 | } 76 | 77 | public static void main(String[] args) throws Exception { 78 | ToolRunner.run(new LookupWarcTrecIdMapping(), args); 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /src/main/java/org/clueweb/clueweb12/app/MergeTermStatistics.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ClueWeb Tools: Hadoop tools for manipulating ClueWeb collections 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you 5 | * may not use this file except in compliance with the License. You may 6 | * obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing 14 | * permissions and limitations under the License. 
15 | */ 16 | 17 | package org.clueweb.clueweb12.app; 18 | 19 | import java.io.IOException; 20 | import java.util.Arrays; 21 | 22 | import org.apache.commons.cli.CommandLine; 23 | import org.apache.commons.cli.CommandLineParser; 24 | import org.apache.commons.cli.GnuParser; 25 | import org.apache.commons.cli.HelpFormatter; 26 | import org.apache.commons.cli.OptionBuilder; 27 | import org.apache.commons.cli.Options; 28 | import org.apache.commons.cli.ParseException; 29 | import org.apache.hadoop.conf.Configured; 30 | import org.apache.hadoop.fs.FileSystem; 31 | import org.apache.hadoop.fs.Path; 32 | import org.apache.hadoop.io.Text; 33 | import org.apache.hadoop.mapreduce.Job; 34 | import org.apache.hadoop.mapreduce.Reducer; 35 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 36 | import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; 37 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 38 | import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; 39 | import org.apache.hadoop.util.Tool; 40 | import org.apache.hadoop.util.ToolRunner; 41 | import org.apache.log4j.Logger; 42 | 43 | import tl.lin.data.pair.PairOfIntLong; 44 | 45 | public class MergeTermStatistics extends Configured implements Tool { 46 | private static final Logger LOG = Logger.getLogger(MergeTermStatistics.class); 47 | 48 | private static final String HADOOP_DF_MIN_OPTION = "df.min"; 49 | private static final String HADOOP_DF_MAX_OPTION = "df.max"; 50 | 51 | private static final int MIN_DF_DEFAULT = 100; // Throw away terms with df less than this. 52 | 53 | private static class MyCombiner extends Reducer { 54 | private static final PairOfIntLong output = new PairOfIntLong(); 55 | 56 | @Override 57 | public void reduce(Text key, Iterable values, Context context) 58 | throws IOException, InterruptedException { 59 | int df = 0; 60 | long cf = 0; 61 | for (PairOfIntLong pair : values) { 62 | df += pair.getLeftElement(); 63 | cf += pair.getRightElement(); 64 | } 65 | 66 | output.set(df, cf); 67 | context.write(key, output); 68 | } 69 | } 70 | 71 | private static class MyReducer extends Reducer { 72 | private static final PairOfIntLong output = new PairOfIntLong(); 73 | private int dfMin, dfMax; 74 | 75 | @Override 76 | public void setup(Reducer.Context context) { 77 | dfMin = context.getConfiguration().getInt(HADOOP_DF_MIN_OPTION, MIN_DF_DEFAULT); 78 | dfMax = context.getConfiguration().getInt(HADOOP_DF_MAX_OPTION, Integer.MAX_VALUE); 79 | LOG.info("dfMin = " + dfMin); 80 | } 81 | 82 | @Override 83 | public void reduce(Text key, Iterable values, Context context) 84 | throws IOException, InterruptedException { 85 | int df = 0; 86 | long cf = 0; 87 | for (PairOfIntLong pair : values) { 88 | df += pair.getLeftElement(); 89 | cf += pair.getRightElement(); 90 | } 91 | if (df < dfMin || df > dfMax) { 92 | return; 93 | } 94 | output.set(df, cf); 95 | context.write(key, output); 96 | } 97 | } 98 | 99 | public static final String INPUT_OPTION = "input"; 100 | public static final String OUTPUT_OPTION = "output"; 101 | public static final String DF_MIN_OPTION = "dfMin"; 102 | 103 | /** 104 | * Runs this tool. 
105 | */ 106 | @SuppressWarnings("static-access") 107 | public int run(String[] args) throws Exception { 108 | Options options = new Options(); 109 | 110 | options.addOption(OptionBuilder.withArgName("path").hasArg() 111 | .withDescription("input path").create(INPUT_OPTION)); 112 | options.addOption(OptionBuilder.withArgName("path").hasArg() 113 | .withDescription("output path").create(OUTPUT_OPTION)); 114 | options.addOption(OptionBuilder.withArgName("num").hasArg() 115 | .withDescription("minimum df").create(DF_MIN_OPTION)); 116 | 117 | CommandLine cmdline; 118 | CommandLineParser parser = new GnuParser(); 119 | try { 120 | cmdline = parser.parse(options, args); 121 | } catch (ParseException exp) { 122 | HelpFormatter formatter = new HelpFormatter(); 123 | formatter.printHelp(this.getClass().getName(), options); 124 | ToolRunner.printGenericCommandUsage(System.out); 125 | System.err.println("Error parsing command line: " + exp.getMessage()); 126 | return -1; 127 | } 128 | 129 | if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION)) { 130 | HelpFormatter formatter = new HelpFormatter(); 131 | formatter.printHelp(this.getClass().getName(), options); 132 | ToolRunner.printGenericCommandUsage(System.out); 133 | return -1; 134 | } 135 | 136 | String input = cmdline.getOptionValue(INPUT_OPTION); 137 | String output = cmdline.getOptionValue(OUTPUT_OPTION); 138 | 139 | LOG.info("Tool name: " + MergeTermStatistics.class.getSimpleName()); 140 | LOG.info(" - input: " + input); 141 | LOG.info(" - output: " + output); 142 | 143 | Job job = new Job(getConf(), MergeTermStatistics.class.getSimpleName() + ":" + input); 144 | job.setJarByClass(MergeTermStatistics.class); 145 | 146 | job.setNumReduceTasks(100); 147 | 148 | if (cmdline.hasOption(DF_MIN_OPTION)) { 149 | int dfMin = Integer.parseInt(cmdline.getOptionValue(DF_MIN_OPTION)); 150 | LOG.info(" - dfMin: " + dfMin); 151 | job.getConfiguration().setInt(HADOOP_DF_MIN_OPTION, dfMin); 152 | } 153 | 154 | FileInputFormat.setInputPaths(job, input); 155 | FileOutputFormat.setOutputPath(job, new Path(output)); 156 | 157 | job.setInputFormatClass(SequenceFileInputFormat.class); 158 | job.setOutputFormatClass(SequenceFileOutputFormat.class); 159 | 160 | job.setMapOutputKeyClass(Text.class); 161 | job.setMapOutputValueClass(PairOfIntLong.class); 162 | job.setOutputKeyClass(Text.class); 163 | job.setOutputValueClass(PairOfIntLong.class); 164 | 165 | job.setCombinerClass(MyCombiner.class); 166 | job.setReducerClass(MyReducer.class); 167 | 168 | FileSystem.get(getConf()).delete(new Path(output), true); 169 | 170 | long startTime = System.currentTimeMillis(); 171 | job.waitForCompletion(true); 172 | LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); 173 | 174 | return 0; 175 | } 176 | 177 | /** 178 | * Dispatches command-line arguments to the tool via the ToolRunner. 
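 *
 * A hypothetical invocation (the jar name and HDFS paths below are placeholders,
 * not taken from this project):
 *
 *   hadoop jar clueweb-tools.jar org.clueweb.clueweb12.app.MergeTermStatistics \
 *     -input /data/term-stats/segment-* -output /data/term-stats-merged -dfMin 100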
179 | */ 180 | public static void main(String[] args) throws Exception { 181 | LOG.info("Running " + MergeTermStatistics.class.getCanonicalName() + " with args " 182 | + Arrays.toString(args)); 183 | ToolRunner.run(new MergeTermStatistics(), args); 184 | } 185 | } 186 | -------------------------------------------------------------------------------- /src/main/java/org/clueweb/clueweb12/app/ProcessPForDocVectors.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ClueWeb Tools: Hadoop tools for manipulating ClueWeb collections 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you 5 | * may not use this file except in compliance with the License. You may 6 | * obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing 14 | * permissions and limitations under the License. 15 | */ 16 | 17 | package org.clueweb.clueweb12.app; 18 | 19 | import java.io.IOException; 20 | import java.util.Arrays; 21 | import java.util.List; 22 | 23 | import org.apache.commons.cli.CommandLine; 24 | import org.apache.commons.cli.CommandLineParser; 25 | import org.apache.commons.cli.GnuParser; 26 | import org.apache.commons.cli.HelpFormatter; 27 | import org.apache.commons.cli.OptionBuilder; 28 | import org.apache.commons.cli.Options; 29 | import org.apache.commons.cli.ParseException; 30 | import org.apache.hadoop.conf.Configured; 31 | import org.apache.hadoop.fs.FileSystem; 32 | import org.apache.hadoop.fs.Path; 33 | import org.apache.hadoop.io.Text; 34 | import org.apache.hadoop.mapreduce.Job; 35 | import org.apache.hadoop.mapreduce.Mapper; 36 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 37 | import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; 38 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 39 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 40 | import org.apache.hadoop.util.Tool; 41 | import org.apache.hadoop.util.ToolRunner; 42 | import org.apache.log4j.Logger; 43 | import org.clueweb.data.PForDocVector; 44 | import org.clueweb.dictionary.DefaultFrequencySortedDictionary; 45 | 46 | import tl.lin.data.array.IntArrayWritable; 47 | 48 | import com.google.common.base.Joiner; 49 | import com.google.common.collect.Lists; 50 | 51 | public class ProcessPForDocVectors extends Configured implements Tool { 52 | private static final Logger LOG = Logger.getLogger(ProcessPForDocVectors.class); 53 | 54 | private static final Joiner JOINER = Joiner.on("|"); 55 | 56 | private static class MyMapper extends Mapper { 57 | private static final PForDocVector DOC = new PForDocVector(); 58 | 59 | private DefaultFrequencySortedDictionary dictionary; 60 | 61 | @Override 62 | public void setup(Context context) throws IOException { 63 | FileSystem fs = FileSystem.get(context.getConfiguration()); 64 | String path = context.getConfiguration().get(DICTIONARY_OPTION); 65 | dictionary = new DefaultFrequencySortedDictionary(path, fs); 66 | } 67 | 68 | @Override 69 | public void map(Text key, IntArrayWritable ints, Context context) 70 | throws IOException, InterruptedException { 71 | PForDocVector.fromIntArrayWritable(ints, DOC); 72 | 73 | List terms = Lists.newArrayList(); 74 | for 
(int termid : DOC.getTermIds()) { 75 | terms.add(dictionary.getTerm(termid)); 76 | } 77 | 78 | context.write(key, new Text(JOINER.join(terms))); 79 | } 80 | } 81 | 82 | public static final String INPUT_OPTION = "input"; 83 | public static final String OUTPUT_OPTION = "output"; 84 | public static final String DICTIONARY_OPTION = "dictionary"; 85 | 86 | /** 87 | * Runs this tool. 88 | */ 89 | @SuppressWarnings("static-access") 90 | public int run(String[] args) throws Exception { 91 | Options options = new Options(); 92 | 93 | options.addOption(OptionBuilder.withArgName("path").hasArg() 94 | .withDescription("input path").create(INPUT_OPTION)); 95 | options.addOption(OptionBuilder.withArgName("path").hasArg() 96 | .withDescription("output path").create(OUTPUT_OPTION)); 97 | options.addOption(OptionBuilder.withArgName("path").hasArg() 98 | .withDescription("dictionary").create(DICTIONARY_OPTION)); 99 | 100 | CommandLine cmdline; 101 | CommandLineParser parser = new GnuParser(); 102 | try { 103 | cmdline = parser.parse(options, args); 104 | } catch (ParseException exp) { 105 | HelpFormatter formatter = new HelpFormatter(); 106 | formatter.printHelp(this.getClass().getName(), options); 107 | ToolRunner.printGenericCommandUsage(System.out); 108 | System.err.println("Error parsing command line: " + exp.getMessage()); 109 | return -1; 110 | } 111 | 112 | if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION) || 113 | !cmdline.hasOption(DICTIONARY_OPTION)) { 114 | HelpFormatter formatter = new HelpFormatter(); 115 | formatter.printHelp(this.getClass().getName(), options); 116 | ToolRunner.printGenericCommandUsage(System.out); 117 | return -1; 118 | } 119 | 120 | String input = cmdline.getOptionValue(INPUT_OPTION); 121 | String output = cmdline.getOptionValue(OUTPUT_OPTION); 122 | String dictionary = cmdline.getOptionValue(DICTIONARY_OPTION); 123 | 124 | LOG.info("Tool name: " + ProcessPForDocVectors.class.getSimpleName()); 125 | LOG.info(" - input: " + input); 126 | LOG.info(" - output: " + output); 127 | LOG.info(" - dictionary: " + dictionary); 128 | 129 | Job job = new Job(getConf(), ProcessPForDocVectors.class.getSimpleName() + ":" + input); 130 | job.setJarByClass(ProcessPForDocVectors.class); 131 | 132 | job.setNumReduceTasks(0); 133 | 134 | FileInputFormat.setInputPaths(job, input); 135 | FileOutputFormat.setOutputPath(job, new Path(output)); 136 | 137 | job.getConfiguration().set(DICTIONARY_OPTION, dictionary); 138 | 139 | job.setInputFormatClass(SequenceFileInputFormat.class); 140 | job.setOutputFormatClass(TextOutputFormat.class); 141 | 142 | job.setMapOutputKeyClass(Text.class); 143 | job.setMapOutputValueClass(Text.class); 144 | 145 | job.setMapperClass(MyMapper.class); 146 | 147 | FileSystem.get(getConf()).delete(new Path(output), true); 148 | 149 | long startTime = System.currentTimeMillis(); 150 | job.waitForCompletion(true); 151 | LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); 152 | 153 | return 0; 154 | } 155 | 156 | /** 157 | * Dispatches command-line arguments to the tool via the ToolRunner. 
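 *
 * A hypothetical invocation (jar name and paths are placeholders): the tool reads
 * PFor-compressed document vectors from SequenceFiles, maps each termid back to
 * its term through the dictionary, and writes plain-text lines of the form
 * docid [tab] term1|term2|...
 *
 *   hadoop jar clueweb-tools.jar org.clueweb.clueweb12.app.ProcessPForDocVectors \
 *     -input /data/docvectors-pfor -output /data/docvectors-text -dictionary /data/dictionary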
158 | */ 159 | public static void main(String[] args) throws Exception { 160 | LOG.info("Running " + ProcessPForDocVectors.class.getCanonicalName() + " with args " 161 | + Arrays.toString(args)); 162 | ToolRunner.run(new ProcessPForDocVectors(), args); 163 | } 164 | } 165 | -------------------------------------------------------------------------------- /src/main/java/org/clueweb/clueweb12/app/ProcessVByteDocVectors.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ClueWeb Tools: Hadoop tools for manipulating ClueWeb collections 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you 5 | * may not use this file except in compliance with the License. You may 6 | * obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing 14 | * permissions and limitations under the License. 15 | */ 16 | 17 | package org.clueweb.clueweb12.app; 18 | 19 | import java.io.IOException; 20 | import java.util.Arrays; 21 | import java.util.List; 22 | 23 | import org.apache.commons.cli.CommandLine; 24 | import org.apache.commons.cli.CommandLineParser; 25 | import org.apache.commons.cli.GnuParser; 26 | import org.apache.commons.cli.HelpFormatter; 27 | import org.apache.commons.cli.OptionBuilder; 28 | import org.apache.commons.cli.Options; 29 | import org.apache.commons.cli.ParseException; 30 | import org.apache.hadoop.conf.Configured; 31 | import org.apache.hadoop.fs.FileSystem; 32 | import org.apache.hadoop.fs.Path; 33 | import org.apache.hadoop.io.BytesWritable; 34 | import org.apache.hadoop.io.Text; 35 | import org.apache.hadoop.mapreduce.Job; 36 | import org.apache.hadoop.mapreduce.Mapper; 37 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 38 | import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; 39 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 40 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 41 | import org.apache.hadoop.util.Tool; 42 | import org.apache.hadoop.util.ToolRunner; 43 | import org.apache.log4j.Logger; 44 | import org.clueweb.data.VByteDocVector; 45 | import org.clueweb.dictionary.DefaultFrequencySortedDictionary; 46 | 47 | import com.google.common.base.Joiner; 48 | import com.google.common.collect.Lists; 49 | 50 | public class ProcessVByteDocVectors extends Configured implements Tool { 51 | private static final Logger LOG = Logger.getLogger(ProcessVByteDocVectors.class); 52 | 53 | private static final Joiner JOINER = Joiner.on("|"); 54 | 55 | private static class MyMapper extends Mapper { 56 | private static final VByteDocVector DOC = new VByteDocVector(); 57 | 58 | private DefaultFrequencySortedDictionary dictionary; 59 | 60 | @Override 61 | public void setup(Context context) throws IOException { 62 | FileSystem fs = FileSystem.get(context.getConfiguration()); 63 | String path = context.getConfiguration().get(DICTIONARY_OPTION); 64 | dictionary = new DefaultFrequencySortedDictionary(path, fs); 65 | } 66 | 67 | @Override 68 | public void map(Text key, BytesWritable bytes, Context context) 69 | throws IOException, InterruptedException { 70 | VByteDocVector.fromBytesWritable(bytes, DOC); 71 | 72 | List terms = Lists.newArrayList(); 73 | for 
(int termid : DOC.getTermIds()) { 74 | terms.add(dictionary.getTerm(termid)); 75 | } 76 | 77 | context.write(key, new Text(JOINER.join(terms))); 78 | } 79 | } 80 | 81 | public static final String INPUT_OPTION = "input"; 82 | public static final String OUTPUT_OPTION = "output"; 83 | public static final String DICTIONARY_OPTION = "dictionary"; 84 | 85 | /** 86 | * Runs this tool. 87 | */ 88 | @SuppressWarnings("static-access") 89 | public int run(String[] args) throws Exception { 90 | Options options = new Options(); 91 | 92 | options.addOption(OptionBuilder.withArgName("path").hasArg() 93 | .withDescription("input path").create(INPUT_OPTION)); 94 | options.addOption(OptionBuilder.withArgName("path").hasArg() 95 | .withDescription("output path").create(OUTPUT_OPTION)); 96 | options.addOption(OptionBuilder.withArgName("path").hasArg() 97 | .withDescription("dictionary").create(DICTIONARY_OPTION)); 98 | 99 | CommandLine cmdline; 100 | CommandLineParser parser = new GnuParser(); 101 | try { 102 | cmdline = parser.parse(options, args); 103 | } catch (ParseException exp) { 104 | HelpFormatter formatter = new HelpFormatter(); 105 | formatter.printHelp(this.getClass().getName(), options); 106 | ToolRunner.printGenericCommandUsage(System.out); 107 | System.err.println("Error parsing command line: " + exp.getMessage()); 108 | return -1; 109 | } 110 | 111 | if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION) || 112 | !cmdline.hasOption(DICTIONARY_OPTION)) { 113 | HelpFormatter formatter = new HelpFormatter(); 114 | formatter.printHelp(this.getClass().getName(), options); 115 | ToolRunner.printGenericCommandUsage(System.out); 116 | return -1; 117 | } 118 | 119 | String input = cmdline.getOptionValue(INPUT_OPTION); 120 | String output = cmdline.getOptionValue(OUTPUT_OPTION); 121 | String dictionary = cmdline.getOptionValue(DICTIONARY_OPTION); 122 | 123 | LOG.info("Tool name: " + ProcessVByteDocVectors.class.getSimpleName()); 124 | LOG.info(" - input: " + input); 125 | LOG.info(" - output: " + output); 126 | LOG.info(" - dictionary: " + dictionary); 127 | 128 | Job job = new Job(getConf(), ProcessVByteDocVectors.class.getSimpleName() + ":" + input); 129 | job.setJarByClass(ProcessVByteDocVectors.class); 130 | 131 | job.setNumReduceTasks(0); 132 | 133 | FileInputFormat.setInputPaths(job, input); 134 | FileOutputFormat.setOutputPath(job, new Path(output)); 135 | 136 | job.getConfiguration().set(DICTIONARY_OPTION, dictionary); 137 | 138 | job.setInputFormatClass(SequenceFileInputFormat.class); 139 | job.setOutputFormatClass(TextOutputFormat.class); 140 | 141 | job.setMapOutputKeyClass(Text.class); 142 | job.setMapOutputValueClass(Text.class); 143 | 144 | job.setMapperClass(MyMapper.class); 145 | 146 | FileSystem.get(getConf()).delete(new Path(output), true); 147 | 148 | long startTime = System.currentTimeMillis(); 149 | job.waitForCompletion(true); 150 | LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); 151 | 152 | return 0; 153 | } 154 | 155 | /** 156 | * Dispatches command-line arguments to the tool via the ToolRunner. 
157 | */ 158 | public static void main(String[] args) throws Exception { 159 | LOG.info("Running " + ProcessVByteDocVectors.class.getCanonicalName() + " with args " 160 | + Arrays.toString(args)); 161 | ToolRunner.run(new ProcessVByteDocVectors(), args); 162 | } 163 | } 164 | -------------------------------------------------------------------------------- /src/main/java/org/clueweb/clueweb12/mapred/ClueWeb12InputFormat.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ClueWeb Tools: Hadoop tools for manipulating ClueWeb collections 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you 5 | * may not use this file except in compliance with the License. You may 6 | * obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing 14 | * permissions and limitations under the License. 15 | */ 16 | 17 | /* 18 | * Hadoop FileInputFormat for reading WARC files 19 | * 20 | * (C) 2009 - Carnegie Mellon University 21 | * 22 | * 1. Redistributions of this source code must retain the above copyright 23 | * notice, this list of conditions and the following disclaimer. 24 | * 2. The names "Lemur", "Indri", "University of Massachusetts", 25 | * "Carnegie Mellon", and "lemurproject" must not be used to 26 | * endorse or promote products derived from this software without 27 | * prior written permission. To obtain permission, contact 28 | * license@lemurproject.org. 29 | * 30 | * 4. Products derived from this software may not be called "Lemur" or "Indri" 31 | * nor may "Lemur" or "Indri" appear in their names without prior written 32 | * permission of The Lemur Project. To obtain permission, 33 | * contact license@lemurproject.org. 34 | * 35 | * THIS SOFTWARE IS PROVIDED BY THE LEMUR PROJECT AS PART OF THE CLUEWEB09 36 | * PROJECT AND OTHER CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED 37 | * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 38 | * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN 39 | * NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY 40 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 41 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 42 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 43 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 44 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING 45 | * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 46 | * POSSIBILITY OF SUCH DAMAGE. 47 | * 48 | * @author mhoy@cs.cmu.edu (Mark J. 
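 *
 * Usage mirrors ProcessPForDocVectors (same -input, -output, and -dictionary
 * options and the same plain-text output); the difference is that the input
 * SequenceFiles hold VByte-encoded BytesWritable vectors rather than
 * PFor-compressed IntArrayWritable vectors.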
Hoy) 49 | */ 50 | 51 | package org.clueweb.clueweb12.mapred; 52 | 53 | import java.io.DataInputStream; 54 | import java.io.IOException; 55 | 56 | import org.apache.hadoop.conf.Configuration; 57 | import org.apache.hadoop.fs.FileSystem; 58 | import org.apache.hadoop.fs.Path; 59 | import org.apache.hadoop.io.LongWritable; 60 | import org.apache.hadoop.io.compress.CompressionCodec; 61 | import org.apache.hadoop.io.compress.CompressionCodecFactory; 62 | import org.apache.hadoop.mapred.FileInputFormat; 63 | import org.apache.hadoop.mapred.FileSplit; 64 | import org.apache.hadoop.mapred.InputSplit; 65 | import org.apache.hadoop.mapred.JobConf; 66 | import org.apache.hadoop.mapred.RecordReader; 67 | import org.apache.hadoop.mapred.Reporter; 68 | import org.clueweb.clueweb12.ClueWeb12WarcRecord; 69 | 70 | public class ClueWeb12InputFormat extends FileInputFormat { 71 | 72 | /** 73 | * Don't allow the files to be split! 74 | */ 75 | @Override 76 | protected boolean isSplitable(FileSystem fs, Path filename) { 77 | // ensure the input files are not splittable! 78 | return false; 79 | } 80 | 81 | /** 82 | * Just return the record reader 83 | */ 84 | public RecordReader getRecordReader(InputSplit split, JobConf conf, 85 | Reporter reporter) throws IOException { 86 | return new ClueWarcRecordReader(conf, (FileSplit) split); 87 | } 88 | 89 | public static class ClueWarcRecordReader implements RecordReader { 90 | private long recordCount = 1; 91 | private Path path = null; 92 | private DataInputStream input = null; 93 | 94 | private long totalNumBytesRead = 0; 95 | 96 | public ClueWarcRecordReader(Configuration conf, FileSplit split) throws IOException { 97 | FileSystem fs = FileSystem.get(conf); 98 | path = split.getPath(); 99 | 100 | CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(conf); 101 | CompressionCodec compressionCodec = compressionCodecs.getCodec(path); 102 | input = new DataInputStream(compressionCodec.createInputStream(fs.open(path))); 103 | } 104 | 105 | @Override 106 | public boolean next(LongWritable key, ClueWeb12WarcRecord value) throws IOException { 107 | DataInputStream whichStream = input; 108 | 109 | ClueWeb12WarcRecord newRecord = ClueWeb12WarcRecord.readNextWarcRecord(whichStream); 110 | if (newRecord == null) { 111 | return false; 112 | } 113 | 114 | totalNumBytesRead += (long) newRecord.getTotalRecordLength(); 115 | newRecord.setWarcFilePath(path.toString()); 116 | 117 | value.set(newRecord); 118 | key.set(recordCount); 119 | 120 | recordCount++; 121 | return true; 122 | } 123 | 124 | @Override 125 | public LongWritable createKey() { 126 | return new LongWritable(); 127 | } 128 | 129 | @Override 130 | public ClueWeb12WarcRecord createValue() { 131 | return new ClueWeb12WarcRecord(); 132 | } 133 | 134 | @Override 135 | public long getPos() throws IOException { 136 | return totalNumBytesRead; 137 | } 138 | 139 | @Override 140 | public void close() throws IOException { 141 | input.close(); 142 | } 143 | 144 | @Override 145 | public float getProgress() throws IOException { 146 | return (float) recordCount / 40000f; 147 | } 148 | } 149 | } 150 | 151 | -------------------------------------------------------------------------------- /src/main/java/org/clueweb/clueweb12/mapreduce/ClueWeb12InputFormat.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ClueWeb Tools: Hadoop tools for manipulating ClueWeb collections 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you 5 | * may not use this 
file except in compliance with the License. You may 6 | * obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing 14 | * permissions and limitations under the License. 15 | */ 16 | 17 | package org.clueweb.clueweb12.mapreduce; 18 | 19 | import java.io.DataInputStream; 20 | import java.io.IOException; 21 | 22 | import org.apache.hadoop.conf.Configuration; 23 | import org.apache.hadoop.fs.FSDataInputStream; 24 | import org.apache.hadoop.fs.FileSystem; 25 | import org.apache.hadoop.fs.Path; 26 | import org.apache.hadoop.fs.Seekable; 27 | import org.apache.hadoop.io.LongWritable; 28 | import org.apache.hadoop.io.compress.CodecPool; 29 | import org.apache.hadoop.io.compress.CompressionCodec; 30 | import org.apache.hadoop.io.compress.CompressionCodecFactory; 31 | import org.apache.hadoop.io.compress.Decompressor; 32 | import org.apache.hadoop.mapreduce.InputSplit; 33 | import org.apache.hadoop.mapreduce.JobContext; 34 | import org.apache.hadoop.mapreduce.RecordReader; 35 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 36 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 37 | import org.apache.hadoop.mapreduce.lib.input.FileSplit; 38 | import org.clueweb.clueweb12.ClueWeb12WarcRecord; 39 | 40 | public class ClueWeb12InputFormat extends FileInputFormat { 41 | @Override 42 | public RecordReader createRecordReader(InputSplit split, 43 | TaskAttemptContext context) throws IOException, InterruptedException { 44 | return new ClueWarcRecordReader(); 45 | } 46 | 47 | @Override 48 | protected boolean isSplitable(JobContext context, Path filename) { 49 | return false; 50 | } 51 | 52 | public class ClueWarcRecordReader extends RecordReader { 53 | private CompressionCodecFactory compressionCodecs = null; 54 | private long start; 55 | private long pos; 56 | private long end; 57 | private LongWritable key = null; 58 | private ClueWeb12WarcRecord value = null; 59 | private Seekable filePosition; 60 | private CompressionCodec codec; 61 | private Decompressor decompressor; 62 | private DataInputStream in; 63 | 64 | public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException { 65 | FileSplit split = (FileSplit) genericSplit; 66 | Configuration job = context.getConfiguration(); 67 | start = split.getStart(); 68 | end = start + split.getLength(); 69 | final Path file = split.getPath(); 70 | compressionCodecs = new CompressionCodecFactory(job); 71 | codec = compressionCodecs.getCodec(file); 72 | 73 | // open the file and seek to the start of the split 74 | FileSystem fs = file.getFileSystem(job); 75 | FSDataInputStream fileIn = fs.open(split.getPath()); 76 | 77 | if (isCompressedInput()) { 78 | in = new DataInputStream(codec.createInputStream(fileIn, decompressor)); 79 | filePosition = fileIn; 80 | } else { 81 | fileIn.seek(start); 82 | in = fileIn; 83 | filePosition = fileIn; 84 | } 85 | 86 | this.pos = start; 87 | } 88 | 89 | private boolean isCompressedInput() { 90 | return (codec != null); 91 | } 92 | 93 | private long getFilePosition() throws IOException { 94 | long retVal; 95 | if (isCompressedInput() && null != filePosition) { 96 | retVal = filePosition.getPos(); 97 | } else { 98 | retVal = pos; 99 | } 100 | return retVal; 101 | } 102 
| 103 | public boolean nextKeyValue() throws IOException { 104 | if (key == null) { 105 | key = new LongWritable(); 106 | } 107 | key.set(pos); 108 | 109 | value = ClueWeb12WarcRecord.readNextWarcRecord(in); 110 | if (value == null) { 111 | return false; 112 | } 113 | return true; 114 | } 115 | 116 | @Override 117 | public LongWritable getCurrentKey() { 118 | return key; 119 | } 120 | 121 | @Override 122 | public ClueWeb12WarcRecord getCurrentValue() { 123 | return value; 124 | } 125 | 126 | /** 127 | * Get the progress within the split 128 | */ 129 | public float getProgress() throws IOException { 130 | if (start == end) { 131 | return 0.0f; 132 | } else { 133 | return Math.min(1.0f, (getFilePosition() - start) / (float) (end - start)); 134 | } 135 | } 136 | 137 | public synchronized void close() throws IOException { 138 | try { 139 | if (in != null) { 140 | in.close(); 141 | } 142 | } finally { 143 | if (decompressor != null) { 144 | CodecPool.returnDecompressor(decompressor); 145 | } 146 | } 147 | } 148 | } 149 | } 150 | -------------------------------------------------------------------------------- /src/main/java/org/clueweb/data/DocVector.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ClueWeb Tools: Hadoop tools for manipulating ClueWeb collections 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you 5 | * may not use this file except in compliance with the License. You may 6 | * obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing 14 | * permissions and limitations under the License. 15 | */ 16 | 17 | package org.clueweb.data; 18 | 19 | public interface DocVector { 20 | int[] getTermIds(); 21 | int getLength(); 22 | } 23 | -------------------------------------------------------------------------------- /src/main/java/org/clueweb/data/Indexable.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ClueWeb Tools: Hadoop tools for manipulating ClueWeb collections 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you 5 | * may not use this file except in compliance with the License. You may 6 | * obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing 14 | * permissions and limitations under the License. 15 | */ 16 | 17 | package org.clueweb.data; 18 | 19 | import org.apache.hadoop.io.Writable; 20 | 21 | /** 22 | * A document that can be indexed. 23 | */ 24 | public abstract class Indexable implements Writable { 25 | 26 | /** 27 | * Returns the globally-unique String identifier of the document within the collection. 28 | * 29 | * @return docid of the document 30 | */ 31 | public abstract String getDocid(); 32 | 33 | /** 34 | * Returns the content of the document. 
35 | * 36 | * @return content of the document 37 | */ 38 | public abstract String getContent(); 39 | 40 | /** 41 | * Returns the content of the document for display to a human. 42 | * 43 | * @return displayable content 44 | */ 45 | public String getDisplayContent() { 46 | return getContent(); 47 | } 48 | 49 | /** 50 | * Returns the type of the display content, per IANA MIME Media Type (e.g., "text/html"). 51 | * See {@code http://www.iana.org/assignments/media-types/index.html} 52 | * 53 | * @return IANA MIME Media Type 54 | */ 55 | public String getDisplayContentType() { 56 | return "text/plain"; 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/main/java/org/clueweb/data/PForDocVector.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ClueWeb Tools: Hadoop tools for manipulating ClueWeb collections 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you 5 | * may not use this file except in compliance with the License. You may 6 | * obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing 14 | * permissions and limitations under the License. 15 | */ 16 | 17 | package org.clueweb.data; 18 | 19 | import me.lemire.integercompression.FastPFOR; 20 | import me.lemire.integercompression.IntWrapper; 21 | import me.lemire.integercompression.VariableByte; 22 | import tl.lin.data.array.IntArrayWritable; 23 | 24 | public class PForDocVector { 25 | private static final FastPFOR P4 = new FastPFOR(); 26 | private static final VariableByte VB = new VariableByte(); 27 | 28 | private int[] termids; 29 | 30 | public PForDocVector() {} 31 | 32 | public int[] getTermIds() { 33 | return termids; 34 | } 35 | 36 | public int getLength() { 37 | return termids.length; 38 | } 39 | 40 | public static void fromIntArrayWritable(IntArrayWritable in, PForDocVector doc) { 41 | try { 42 | int[] compressed = in.getArray(); 43 | IntWrapper inPos = new IntWrapper(1); 44 | IntWrapper outPos = new IntWrapper(0); 45 | doc.termids = new int[compressed[0]]; 46 | 47 | if (doc.termids.length == 0) { 48 | return; 49 | } 50 | 51 | if (doc.termids.length < 128) { 52 | VB.uncompress(compressed, inPos, in.size()-1, doc.termids, outPos); 53 | return; 54 | } 55 | 56 | // For this, the zero doesn't matter. 57 | P4.uncompress(compressed, inPos, 0, doc.termids, outPos); 58 | 59 | if (doc.termids.length % 128 == 0) { 60 | return; 61 | } 62 | 63 | // Decode whatever is left over. 64 | VB.uncompress(compressed, inPos, in.size() - inPos.get(), doc.termids, outPos); 65 | } catch (Exception e) { 66 | e.printStackTrace(); 67 | doc.termids = new int[0]; 68 | } 69 | } 70 | 71 | public static void toIntArrayWritable(IntArrayWritable ints, int[] termids, int length) { 72 | // Remember, the number of terms to serialize is length; the array might be longer. 
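    // Layout produced below (and consumed by fromIntArrayWritable above):
    //   out[0]   holds the number of termids encoded;
    //   out[1..] holds VByte-compressed ids when length < 128, otherwise a
    //            FastPFOR block covering the largest multiple of 128 ids
    //            followed by a VByte-compressed remainder.
    //
    // Minimal round-trip sketch (illustrative; assumes only the methods of this
    // class plus IntArrayWritable's no-arg constructor):
    //
    //   IntArrayWritable ints = new IntArrayWritable();
    //   int[] ids = {3, 1, 4, 1, 5, 9, 2, 6};
    //   PForDocVector.toIntArrayWritable(ints, ids, ids.length);
    //   PForDocVector doc = new PForDocVector();
    //   PForDocVector.fromIntArrayWritable(ints, doc);
    //   // doc.getTermIds() now holds the original eight ids.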
73 | try { 74 | if (termids == null) { 75 | termids = new int[] {}; 76 | length = 0; 77 | } 78 | 79 | IntWrapper inPos = new IntWrapper(0); 80 | IntWrapper outPos = new IntWrapper(1); 81 | 82 | int[] out = new int[length + 1]; 83 | out[0] = length; 84 | 85 | if (length < 128) { 86 | VB.compress(termids, inPos, length, out, outPos); 87 | ints.setArray(out, outPos.get()); 88 | 89 | return; 90 | } 91 | 92 | P4.compress(termids, inPos, (length/128)*128, out, outPos); 93 | 94 | if (length % 128 == 0) { 95 | ints.setArray(out, outPos.get()); 96 | return; 97 | } 98 | 99 | VB.compress(termids, inPos, length % 128, out, outPos); 100 | ints.setArray(out, outPos.get()); 101 | } catch (Exception e) { 102 | e.printStackTrace(); 103 | ints.setArray(new int[] {}, 0); 104 | } 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /src/main/java/org/clueweb/data/TermStatistics.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ClueWeb Tools: Hadoop tools for manipulating ClueWeb collections 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you 5 | * may not use this file except in compliance with the License. You may 6 | * obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing 14 | * permissions and limitations under the License. 15 | */ 16 | 17 | package org.clueweb.data; 18 | 19 | import java.io.IOException; 20 | 21 | import org.apache.hadoop.conf.Configuration; 22 | import org.apache.hadoop.fs.FSDataInputStream; 23 | import org.apache.hadoop.fs.FileSystem; 24 | import org.apache.hadoop.fs.Path; 25 | import org.apache.hadoop.io.WritableUtils; 26 | import org.clueweb.clueweb12.app.BuildDictionary; 27 | 28 | import com.google.common.base.Preconditions; 29 | 30 | public class TermStatistics { 31 | private final int numTerms; 32 | private final long[] cfs; 33 | private final int[] dfs; 34 | 35 | private long collectionSize; 36 | 37 | private long maxCf = 0; 38 | private int maxCfTerm; 39 | 40 | private int maxDf = 0; 41 | private int maxDfTerm; 42 | 43 | /** 44 | * Creates a {@code CfTable} object. 45 | * 46 | * @param file collection frequency data file 47 | * @throws IOException 48 | */ 49 | public TermStatistics(Path file) throws IOException { 50 | this(file, FileSystem.get(new Configuration())); 51 | } 52 | 53 | /** 54 | * Creates a {@code CfTable} object. 
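 * Loads both the collection-frequency data ({@code BuildDictionary.CF_BY_ID_DATA})
 * and the document-frequency data ({@code BuildDictionary.DF_BY_ID_DATA}) found
 * under the given directory; the two files must contain the same number of
 * entries or an {@code IOException} is thrown.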
55 | * 56 | * @param file collection frequency data file 57 | * @param fs FileSystem to read from 58 | * @throws IOException 59 | */ 60 | public TermStatistics(Path file, FileSystem fs) throws IOException { 61 | Preconditions.checkNotNull(file); 62 | Preconditions.checkNotNull(fs); 63 | 64 | FSDataInputStream in = fs.open(new Path(file, BuildDictionary.CF_BY_ID_DATA)); 65 | this.numTerms = in.readInt(); 66 | 67 | cfs = new long[numTerms]; 68 | 69 | for (int i = 0; i < numTerms; i++) { 70 | long cf = WritableUtils.readVLong(in); 71 | 72 | cfs[i] = cf; 73 | collectionSize += cf; 74 | 75 | if (cf > maxCf) { 76 | maxCf = cf; 77 | maxCfTerm = i + 1; 78 | } 79 | } 80 | 81 | in.close(); 82 | 83 | in = fs.open(new Path(file, BuildDictionary.DF_BY_ID_DATA)); 84 | if (numTerms != in.readInt() ) { 85 | throw new IOException("df data and cf data should have the same number of entries!"); 86 | } 87 | 88 | dfs = new int[numTerms]; 89 | 90 | for (int i = 0; i < numTerms; i++) { 91 | int df = WritableUtils.readVInt(in); 92 | 93 | dfs[i] = df; 94 | 95 | if (df > maxDf) { 96 | maxDf = df; 97 | maxDfTerm = i + 1; 98 | } 99 | } 100 | 101 | in.close(); 102 | } 103 | 104 | public int getDf(int term) { 105 | if (term <= 0 || term > numTerms) { 106 | return 0; 107 | } 108 | return dfs[term - 1]; 109 | } 110 | 111 | public long getCf(int term) { 112 | if (term <= 0 || term > numTerms) { 113 | return 0; 114 | } 115 | 116 | return cfs[term - 1]; 117 | } 118 | 119 | public long getCollectionSize() { 120 | return collectionSize; 121 | } 122 | 123 | public int getVocabularySize() { 124 | return numTerms; 125 | } 126 | 127 | public int getMaxDf() { 128 | return maxDf; 129 | } 130 | 131 | public long getMaxCf() { 132 | return maxCf; 133 | } 134 | 135 | public int getMaxDfTerm() { 136 | return maxDfTerm; 137 | } 138 | 139 | public int getMaxCfTerm() { 140 | return maxCfTerm; 141 | } 142 | 143 | } 144 | -------------------------------------------------------------------------------- /src/main/java/org/clueweb/data/VByteDocVector.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ClueWeb Tools: Hadoop tools for manipulating ClueWeb collections 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you 5 | * may not use this file except in compliance with the License. You may 6 | * obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing 14 | * permissions and limitations under the License. 
15 | */ 16 | 17 | package org.clueweb.data; 18 | 19 | import java.io.ByteArrayInputStream; 20 | import java.io.ByteArrayOutputStream; 21 | import java.io.DataInputStream; 22 | import java.io.DataOutputStream; 23 | import java.io.IOException; 24 | 25 | import org.apache.hadoop.io.BytesWritable; 26 | import org.apache.hadoop.io.WritableUtils; 27 | 28 | public class VByteDocVector implements DocVector { 29 | private int[] termids; 30 | 31 | public VByteDocVector() {} 32 | 33 | public int[] getTermIds() { 34 | return termids; 35 | } 36 | 37 | public int getLength() { 38 | return termids.length; 39 | } 40 | 41 | public static void fromBytesWritable(BytesWritable bytes, VByteDocVector doc) { 42 | try { 43 | ByteArrayInputStream bytesIn = new ByteArrayInputStream(bytes.getBytes()); 44 | DataInputStream data = new DataInputStream(bytesIn); 45 | 46 | int length = WritableUtils.readVInt(data); 47 | doc.termids = new int[length]; 48 | for (int i = 0; i < length; i++) { 49 | doc.termids[i] = WritableUtils.readVInt(data); 50 | } 51 | } catch (IOException e) { 52 | doc.termids = new int[0]; 53 | } 54 | } 55 | 56 | public static void toBytesWritable(BytesWritable bytes, int[] termids, int length) { 57 | try { 58 | if (termids == null) { 59 | termids = new int[] {}; 60 | length = 0; 61 | } 62 | 63 | ByteArrayOutputStream bytesOut = new ByteArrayOutputStream(); 64 | DataOutputStream dataOut = new DataOutputStream(bytesOut); 65 | 66 | WritableUtils.writeVInt(dataOut, length); 67 | for (int i = 0; i < length; i++) { 68 | WritableUtils.writeVInt(dataOut, termids[i]); 69 | } 70 | 71 | byte[] raw = bytesOut.toByteArray(); 72 | bytes.set(raw, 0, raw.length); 73 | } catch (IOException e) { 74 | bytes.set(new byte[] {}, 0, 0); 75 | } 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /src/main/java/org/clueweb/data/WarcTrecIdMapping.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ClueWeb Tools: Hadoop tools for manipulating ClueWeb collections 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you 5 | * may not use this file except in compliance with the License. You may 6 | * obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing 14 | * permissions and limitations under the License. 
15 | */ 16 | 17 | package org.clueweb.data; 18 | 19 | import java.io.IOException; 20 | 21 | import org.apache.hadoop.conf.Configuration; 22 | import org.apache.hadoop.fs.FileSystem; 23 | import org.apache.hadoop.fs.Path; 24 | import org.apache.log4j.Logger; 25 | import org.apache.lucene.document.Document; 26 | import org.apache.lucene.index.DirectoryReader; 27 | import org.apache.lucene.index.IndexReader; 28 | import org.apache.lucene.index.Term; 29 | import org.apache.lucene.search.IndexSearcher; 30 | import org.apache.lucene.search.Query; 31 | import org.apache.lucene.search.TermQuery; 32 | import org.apache.lucene.search.TopDocs; 33 | import org.apache.lucene.store.Directory; 34 | 35 | import tl.lin.lucene.FileSystemDirectory; 36 | 37 | public class WarcTrecIdMapping { 38 | private static final Logger LOG = Logger.getLogger(WarcTrecIdMapping.class); 39 | 40 | public static enum IndexField { 41 | WARC_TREC_ID("WARC-TREC-ID"); 42 | 43 | public final String name; 44 | 45 | IndexField(String s) { 46 | name = s; 47 | } 48 | }; 49 | 50 | private IndexReader reader; 51 | private IndexSearcher searcher; 52 | 53 | public WarcTrecIdMapping(Path indexLocation, Configuration conf) throws IOException { 54 | FileSystem fs = FileSystem.getLocal(conf); 55 | Directory directory = new FileSystemDirectory(fs, indexLocation, false, conf); 56 | 57 | LOG.info("Opening index " + indexLocation); 58 | reader = DirectoryReader.open(directory); 59 | searcher = new IndexSearcher(reader); 60 | } 61 | 62 | public int getDocno(String id) { 63 | Query query = new TermQuery(new Term(IndexField.WARC_TREC_ID.name, id)); 64 | 65 | TopDocs rs; 66 | try { 67 | rs = searcher.search(query, 1); 68 | if (rs.totalHits != 1) { 69 | return -1; 70 | } 71 | 72 | return rs.scoreDocs[0].doc; 73 | } catch (IOException e) { 74 | e.printStackTrace(); 75 | } 76 | 77 | return -1; 78 | } 79 | 80 | public String getDocid(int docno) { 81 | if (docno >= reader.maxDoc()) { 82 | return null; 83 | } 84 | try { 85 | Document d = reader.document(docno); 86 | if (d == null) { 87 | return null; 88 | } 89 | return d.getField(IndexField.WARC_TREC_ID.name).stringValue(); 90 | } catch (IOException e) { 91 | e.printStackTrace(); 92 | } 93 | return null; 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /src/main/java/org/clueweb/dictionary/DefaultFrequencySortedDictionary.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ClueWeb Tools: Hadoop tools for manipulating ClueWeb collections 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you 5 | * may not use this file except in compliance with the License. You may 6 | * obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing 14 | * permissions and limitations under the License. 
15 | */ 16 | 17 | package org.clueweb.dictionary; 18 | 19 | import java.io.BufferedReader; 20 | import java.io.IOException; 21 | import java.io.InputStreamReader; 22 | import java.io.PrintStream; 23 | import java.util.Iterator; 24 | 25 | import org.apache.hadoop.conf.Configuration; 26 | import org.apache.hadoop.fs.FSDataInputStream; 27 | import org.apache.hadoop.fs.FileSystem; 28 | import org.apache.hadoop.fs.Path; 29 | import org.clueweb.clueweb12.app.BuildDictionary; 30 | import org.clueweb.data.TermStatistics; 31 | 32 | /** 33 | * An implementation of {@link FrequencySortedDictionary}. Term ids start at 1, which corresponds to 34 | * the most frequent term. Term id 2 is the second most frequent term, etc. 35 | * 36 | * @author Jimmy Lin 37 | */ 38 | public class DefaultFrequencySortedDictionary implements FrequencySortedDictionary { 39 | private FrontCodedDictionary dictionary = new FrontCodedDictionary(); 40 | private int[] ids; 41 | private int[] idsToTerm; 42 | 43 | /** 44 | * Constructs an instance of this dictionary from serialized data files. 45 | */ 46 | public DefaultFrequencySortedDictionary(String basePath, FileSystem fs) throws IOException { 47 | FSDataInputStream in; 48 | 49 | in = fs.open(new Path(basePath, BuildDictionary.TERMS_DATA)); 50 | dictionary.readFields(in); 51 | in.close(); 52 | 53 | int l = 0; 54 | 55 | in = fs.open(new Path(basePath, BuildDictionary.TERMS_ID_DATA)); 56 | l = in.readInt(); 57 | ids = new int[l]; 58 | for (int i = 0; i < l; i++) { 59 | ids[i] = in.readInt(); 60 | } 61 | in.close(); 62 | 63 | in = fs.open(new Path(basePath, BuildDictionary.TERMS_ID_MAPPING_DATA)); 64 | l = in.readInt(); 65 | idsToTerm = new int[l]; 66 | for (int i = 0; i < l; i++) { 67 | idsToTerm[i] = in.readInt(); 68 | } 69 | in.close(); 70 | } 71 | 72 | @Override 73 | public int size() { 74 | return ids.length; 75 | } 76 | 77 | @Override 78 | public int getId(String term) { 79 | int index = dictionary.getId(term); 80 | 81 | if (index < 0) { 82 | return -1; 83 | } 84 | 85 | return ids[index]; 86 | } 87 | 88 | @Override 89 | public String getTerm(int id) { 90 | if (id > ids.length || id == 0 || idsToTerm == null) { 91 | return null; 92 | } 93 | String term = dictionary.getTerm(idsToTerm[id - 1]); 94 | 95 | return term; 96 | } 97 | 98 | /** 99 | * Returns an iterator over the dictionary in order of term id. 100 | */ 101 | @Override 102 | public Iterator iterator() { 103 | return new Iterator() { 104 | private int cur = 1; 105 | final private int end = dictionary.size(); 106 | 107 | @Override 108 | public boolean hasNext() { 109 | return cur < end + 1; 110 | } 111 | 112 | @Override 113 | public String next() { 114 | return getTerm(cur++); 115 | } 116 | 117 | @Override 118 | public void remove() { 119 | throw new UnsupportedOperationException(); 120 | } 121 | }; 122 | } 123 | 124 | /** 125 | * Simple demo program for looking up terms and term ids. 
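 *
 * A hypothetical session (the jar name and dictionary path are placeholders;
 * the "term" and "termid" commands follow the lookup loop below):
 *
 *   hadoop jar clueweb-tools.jar org.clueweb.dictionary.DefaultFrequencySortedDictionary /data/dictionary
 *   lookup > term retrieval
 *   lookup > termid 42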
126 | */ 127 | public static void main(String[] args) throws Exception { 128 | if (args.length != 1) { 129 | System.err.println("usage: [index-path]"); 130 | System.exit(-1); 131 | } 132 | 133 | String path = args[0]; 134 | 135 | PrintStream out = new PrintStream(System.out, true, "UTF-8"); 136 | 137 | Configuration conf = new Configuration(); 138 | FileSystem fs = FileSystem.get(conf); 139 | 140 | DefaultFrequencySortedDictionary dictionary = 141 | new DefaultFrequencySortedDictionary(path, fs); 142 | 143 | int nTerms = dictionary.size(); 144 | out.println("number of terms: " + nTerms); 145 | 146 | TermStatistics stats = new TermStatistics(new Path(path), fs); 147 | out.println("max df = " + stats.getMaxDf() + ", termid " + stats.getMaxDfTerm()); 148 | out.println("max cf = " + stats.getMaxCf() + ", termid " + stats.getMaxCfTerm()); 149 | out.println("collection size = " + stats.getCollectionSize()); 150 | out.println(""); 151 | 152 | out.println(" \"term word\" to lookup termid; \"termid 234\" to lookup term"); 153 | String cmd = null; 154 | BufferedReader stdin = new BufferedReader(new InputStreamReader(System.in)); 155 | out.print("lookup > "); 156 | while ((cmd = stdin.readLine()) != null) { 157 | 158 | String[] tokens = cmd.split("\\s+"); 159 | 160 | if (tokens.length != 2) { 161 | out.println("Error: unrecognized command!"); 162 | out.print("lookup > "); 163 | 164 | continue; 165 | } 166 | 167 | if (tokens[0].equals("termid")) { 168 | int termid; 169 | try { 170 | termid = Integer.parseInt(tokens[1]); 171 | } catch (Exception e) { 172 | out.println("Error: invalid termid!"); 173 | out.print("lookup > "); 174 | 175 | continue; 176 | } 177 | 178 | out.println("termid=" + termid + ", term=" + dictionary.getTerm(termid)); 179 | out.println(" df = " + stats.getDf(termid) + ", cf = " + stats.getCf(termid)); 180 | } else if (tokens[0].equals("term")) { 181 | String term = tokens[1]; 182 | 183 | out.println("term=" + term + ", termid=" + dictionary.getId(term)); 184 | out.println(" df = " + stats.getDf(dictionary.getId(term)) + 185 | ", cf = " + stats.getCf(dictionary.getId(term))); 186 | } else { 187 | out.println("Error: unrecognized command!"); 188 | out.print("lookup > "); 189 | continue; 190 | } 191 | 192 | out.print("lookup > "); 193 | } 194 | out.close(); 195 | } 196 | } 197 | -------------------------------------------------------------------------------- /src/main/java/org/clueweb/dictionary/Dictionary.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ClueWeb Tools: Hadoop tools for manipulating ClueWeb collections 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you 5 | * may not use this file except in compliance with the License. You may 6 | * obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing 14 | * permissions and limitations under the License. 15 | */ 16 | 17 | package org.clueweb.dictionary; 18 | 19 | /** 20 | * A dictionary provides a bidirectional mapping terms (Strings) and term ids (integers). The 21 | * semantics of the mapping is left unspecified, but the iteration order is always in 22 | * increasing term id. 
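 * For a frequency-sorted implementation such as DefaultFrequencySortedDictionary,
 * getTerm(1) returns the most frequent term in the collection, and looking that
 * term back up with getId should return 1.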
23 | * 24 | * @author Jimmy Lin 25 | */ 26 | public interface Dictionary extends Iterable { 27 | /** 28 | * Returns the term associated with this term id. 29 | * 30 | * @param id term id 31 | * @return term associated with this term id 32 | */ 33 | String getTerm(int id); 34 | 35 | /** 36 | * Returns the id associated with this term. 37 | * 38 | * @param term term 39 | * @return id associated with this term 40 | */ 41 | int getId(String term); 42 | 43 | /** 44 | * Returns the size of this dictionary. 45 | * 46 | * @return number of terms in this dictionary 47 | */ 48 | int size(); 49 | } 50 | -------------------------------------------------------------------------------- /src/main/java/org/clueweb/dictionary/DictionaryTransformationStrategy.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ClueWeb Tools: Hadoop tools for manipulating ClueWeb collections 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you 5 | * may not use this file except in compliance with the License. You may 6 | * obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing 14 | * permissions and limitations under the License. 15 | */ 16 | 17 | package org.clueweb.dictionary; 18 | 19 | import it.unimi.dsi.bits.TransformationStrategies; 20 | import it.unimi.dsi.bits.TransformationStrategy; 21 | 22 | import java.nio.charset.CharacterCodingException; 23 | 24 | import org.apache.hadoop.io.Text; 25 | import org.apache.hadoop.io.WritableUtils; 26 | 27 | public class DictionaryTransformationStrategy { 28 | public static TransformationStrategy getStrategy() { 29 | return TransformationStrategies.prefixFreeUtf16(); 30 | } 31 | 32 | public static class WritableComparator extends org.apache.hadoop.io.WritableComparator { 33 | private final TransformationStrategy strategy = 34 | DictionaryTransformationStrategy.getStrategy(); 35 | 36 | public WritableComparator() { 37 | super(Text.class); 38 | } 39 | 40 | public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) { 41 | int n1 = WritableUtils.decodeVIntSize(b1[s1]); 42 | int n2 = WritableUtils.decodeVIntSize(b2[s2]); 43 | 44 | String t1=null, t2=null; 45 | try { 46 | t1 = Text.decode(b1, s1+n1, l1-n1); 47 | t2 = Text.decode(b2, s2+n2, l2-n2); 48 | } catch (CharacterCodingException e) { 49 | throw new RuntimeException(e); 50 | } 51 | 52 | return strategy.toBitVector(t1).compareTo(strategy.toBitVector(t2)); 53 | } 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /src/main/java/org/clueweb/dictionary/FrequencySortedDictionary.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ClueWeb Tools: Hadoop tools for manipulating ClueWeb collections 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you 5 | * may not use this file except in compliance with the License. 
You may 6 | * obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing 14 | * permissions and limitations under the License. 15 | */ 16 | 17 | package org.clueweb.dictionary; 18 | 19 | /** 20 | * A frequency-sorted dictionary. That is, smaller term ids are assigned to more 21 | * frequently occurring terms. 22 | * 23 | * @author Jimmy Lin 24 | */ 25 | public interface FrequencySortedDictionary extends Dictionary {} 26 | -------------------------------------------------------------------------------- /src/main/java/org/clueweb/dictionary/FrontCodedDictionary.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ClueWeb Tools: Hadoop tools for manipulating ClueWeb collections 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you 5 | * may not use this file except in compliance with the License. You may 6 | * obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing 14 | * permissions and limitations under the License. 15 | */ 16 | 17 | package org.clueweb.dictionary; 18 | 19 | import it.unimi.dsi.util.FrontCodedStringList; 20 | import it.unimi.dsi.util.ShiftAddXorSignedStringMap; 21 | 22 | import java.io.BufferedReader; 23 | import java.io.ByteArrayInputStream; 24 | import java.io.DataInput; 25 | import java.io.DataOutput; 26 | import java.io.IOException; 27 | import java.io.InputStreamReader; 28 | import java.io.ObjectInputStream; 29 | import java.util.Iterator; 30 | 31 | import org.apache.hadoop.conf.Configuration; 32 | import org.apache.hadoop.fs.FileSystem; 33 | import org.apache.hadoop.fs.Path; 34 | import org.apache.hadoop.io.Writable; 35 | import org.apache.log4j.Logger; 36 | 37 | public class FrontCodedDictionary implements Writable, LexicographicallySortedDictionary { 38 | private static final Logger LOG = Logger.getLogger(FrontCodedDictionary.class); 39 | 40 | private FrontCodedStringList stringList; 41 | private ShiftAddXorSignedStringMap dictionary; 42 | 43 | public FrontCodedDictionary() {} 44 | 45 | @Override 46 | public int getId(String term) { 47 | return (int) dictionary.getLong(term); 48 | } 49 | 50 | @Override 51 | public String getTerm(int id) { 52 | return stringList.get(id).toString(); 53 | } 54 | 55 | @Override 56 | public int size() { 57 | return stringList.size(); 58 | } 59 | 60 | @Override 61 | public Iterator iterator() { 62 | return null; 63 | } 64 | 65 | @Override 66 | public void readFields(final DataInput in) throws IOException { 67 | byte[] bytes; 68 | ObjectInputStream obj; 69 | 70 | bytes = new byte[in.readInt()]; 71 | LOG.info("Loading front-coded list of terms: " + bytes.length + " bytes."); 72 | in.readFully(bytes); 73 | obj = new ObjectInputStream(new ByteArrayInputStream(bytes)); 74 | try { 75 | stringList = (FrontCodedStringList) obj.readObject(); 76 | } catch (ClassNotFoundException e) { 77 | throw new RuntimeException(e); 78 | } 79 | obj.close(); 
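    // The on-disk format read here is two length-prefixed, Java-serialized blobs:
    // the FrontCodedStringList just loaded (termid -> term), followed by the
    // ShiftAddXorSignedStringMap loaded next (term -> termid). The write() method
    // below is a no-op; the file is presumably produced by BuildDictionary.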
80 | 81 | bytes = new byte[in.readInt()]; 82 | LOG.info("Loading dictionary hash: " + bytes.length + " bytes."); 83 | in.readFully(bytes); 84 | obj = new ObjectInputStream(new ByteArrayInputStream(bytes)); 85 | try { 86 | dictionary = (ShiftAddXorSignedStringMap) obj.readObject(); 87 | } catch (ClassNotFoundException e) { 88 | throw new RuntimeException(e); 89 | } 90 | obj.close(); 91 | LOG.info("Finished loading."); 92 | } 93 | 94 | @Override 95 | public void write(DataOutput out) throws IOException { 96 | } 97 | 98 | /** 99 | * Simple demo program for looking up terms and term ids. 100 | */ 101 | public static void main(String[] args) throws Exception { 102 | if (args.length != 1) { 103 | System.out.println("usage: [index-path]"); 104 | System.exit(-1); 105 | } 106 | 107 | String indexPath = args[0]; 108 | 109 | Configuration conf = new Configuration(); 110 | FileSystem fs = FileSystem.get(conf); 111 | 112 | FrontCodedDictionary dictionary = new FrontCodedDictionary(); 113 | dictionary.readFields(fs.open(new Path(indexPath))); 114 | 115 | int nTerms = dictionary.size(); 116 | System.out.println("nTerms: " + nTerms); 117 | 118 | System.out.println(" \"term word\" to lookup termid; \"termid 234\" to lookup term"); 119 | String cmd = null; 120 | BufferedReader stdin = new BufferedReader(new InputStreamReader(System.in)); 121 | System.out.print("lookup > "); 122 | while ((cmd = stdin.readLine()) != null) { 123 | 124 | String[] tokens = cmd.split("\\s+"); 125 | 126 | if (tokens.length != 2) { 127 | System.out.println("Error: unrecognized command!"); 128 | System.out.print("lookup > "); 129 | 130 | continue; 131 | } 132 | 133 | if (tokens[0].equals("termid")) { 134 | int termid; 135 | try { 136 | termid = Integer.parseInt(tokens[1]); 137 | } catch (Exception e) { 138 | System.out.println("Error: invalid termid!"); 139 | System.out.print("lookup > "); 140 | 141 | continue; 142 | } 143 | 144 | System.out.println("termid=" + termid + ", term=" + dictionary.getTerm(termid)); 145 | } else if (tokens[0].equals("term")) { 146 | String term = tokens[1]; 147 | 148 | System.out.println("term=" + term + ", termid=" + dictionary.getId(term)); 149 | } else { 150 | System.out.println("Error: unrecognized command!"); 151 | System.out.print("lookup > "); 152 | continue; 153 | } 154 | 155 | System.out.print("lookup > "); 156 | } 157 | } 158 | } 159 | -------------------------------------------------------------------------------- /src/main/java/org/clueweb/dictionary/LexicographicallySortedDictionary.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ClueWeb Tools: Hadoop tools for manipulating ClueWeb collections 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you 5 | * may not use this file except in compliance with the License. You may 6 | * obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing 14 | * permissions and limitations under the License. 15 | */ 16 | 17 | package org.clueweb.dictionary; 18 | 19 | /** 20 | * A lexicographically-sorted dictionary. That is, smaller term ids correspond to terms 21 | * that are sorted lexicographically earlier. 
22 | * 23 | * @author Jimmy Lin 24 | */ 25 | public interface LexicographicallySortedDictionary extends Dictionary {} 26 | -------------------------------------------------------------------------------- /src/main/java/org/clueweb/dictionary/PorterAnalyzer.java: -------------------------------------------------------------------------------- 1 | package org.clueweb.dictionary; 2 | 3 | import java.io.IOException; 4 | import java.io.Reader; 5 | 6 | import org.apache.lucene.analysis.TokenStream; 7 | import org.apache.lucene.analysis.core.LowerCaseFilter; 8 | import org.apache.lucene.analysis.core.StopFilter; 9 | import org.apache.lucene.analysis.en.PorterStemFilter; 10 | import org.apache.lucene.analysis.standard.StandardFilter; 11 | import org.apache.lucene.analysis.standard.StandardTokenizer; 12 | import org.apache.lucene.analysis.util.CharArraySet; 13 | import org.apache.lucene.analysis.util.StopwordAnalyzerBase; 14 | import org.apache.lucene.util.Version; 15 | 16 | import com.google.common.collect.Lists; 17 | 18 | /** 19 | * Filters {@link StandardTokenizer} with {@link StandardFilter}, {@link LowerCaseFilter}, 20 | * {@link StopFilter}, and {@link PorterStemFilter}. 21 | */ 22 | public final class PorterAnalyzer extends StopwordAnalyzerBase { 23 | 24 | // Stopwords from Terrier v3.5. 25 | static final String[] STOPWORDS = { 26 | "a", 27 | "abaft", 28 | "abafter", 29 | "abaftest", 30 | "about", 31 | "abouter", 32 | "aboutest", 33 | "above", 34 | "abover", 35 | "abovest", 36 | "accordingly", 37 | "aer", 38 | "aest", 39 | "afore", 40 | "after", 41 | "afterer", 42 | "afterest", 43 | "afterward", 44 | "afterwards", 45 | "again", 46 | "against", 47 | "aid", 48 | "ain", 49 | "albeit", 50 | "all", 51 | "aller", 52 | "allest", 53 | "alls", 54 | "allyou", 55 | "almost", 56 | "along", 57 | "alongside", 58 | "already", 59 | "also", 60 | "although", 61 | "always", 62 | "amid", 63 | "amidst", 64 | "among", 65 | "amongst", 66 | "an", 67 | "and", 68 | "andor", 69 | "anear", 70 | "anent", 71 | "another", 72 | "any", 73 | "anybody", 74 | "anyhow", 75 | "anyone", 76 | "anything", 77 | "anywhere", 78 | "apart", 79 | "aparter", 80 | "apartest", 81 | "appear", 82 | "appeared", 83 | "appearing", 84 | "appears", 85 | "appropriate", 86 | "appropriated", 87 | "appropriater", 88 | "appropriates", 89 | "appropriatest", 90 | "appropriating", 91 | "are", 92 | "ares", 93 | "around", 94 | "as", 95 | "ases", 96 | "aside", 97 | "asides", 98 | "aslant", 99 | "astraddle", 100 | "astraddler", 101 | "astraddlest", 102 | "astride", 103 | "astrider", 104 | "astridest", 105 | "at", 106 | "athwart", 107 | "atop", 108 | "atween", 109 | "aught", 110 | "aughts", 111 | "available", 112 | "availabler", 113 | "availablest", 114 | "awfully", 115 | "b", 116 | "be", 117 | "became", 118 | "because", 119 | "become", 120 | "becomes", 121 | "becoming", 122 | "becominger", 123 | "becomingest", 124 | "becomings", 125 | "been", 126 | "before", 127 | "beforehand", 128 | "beforehander", 129 | "beforehandest", 130 | "behind", 131 | "behinds", 132 | "below", 133 | "beneath", 134 | "beside", 135 | "besides", 136 | "better", 137 | "bettered", 138 | "bettering", 139 | "betters", 140 | "between", 141 | "betwixt", 142 | "beyond", 143 | "bist", 144 | "both", 145 | "but", 146 | "buts", 147 | "by", 148 | "by-and-by", 149 | "byandby", 150 | "c", 151 | "cannot", 152 | "canst", 153 | "cant", 154 | "canted", 155 | "cantest", 156 | "canting", 157 | "cants", 158 | "cer", 159 | "certain", 160 | "certainer", 161 | "certainest", 162 | "cest", 163 | "chez", 164 | 
"circa", 165 | "co", 166 | "come-on", 167 | "come-ons", 168 | "comeon", 169 | "comeons", 170 | "concerning", 171 | "concerninger", 172 | "concerningest", 173 | "consequently", 174 | "considering", 175 | "could", 176 | "couldst", 177 | "cum", 178 | "d", 179 | "dday", 180 | "ddays", 181 | "describe", 182 | "described", 183 | "describes", 184 | "describing", 185 | "despite", 186 | "despited", 187 | "despites", 188 | "despiting", 189 | "did", 190 | "different", 191 | "differenter", 192 | "differentest", 193 | "do", 194 | "doe", 195 | "does", 196 | "doing", 197 | "doings", 198 | "done", 199 | "doner", 200 | "dones", 201 | "donest", 202 | "dos", 203 | "dost", 204 | "doth", 205 | "downs", 206 | "downward", 207 | "downwarder", 208 | "downwardest", 209 | "downwards", 210 | "during", 211 | "e", 212 | "each", 213 | "eg", 214 | "eight", 215 | "either", 216 | "else", 217 | "elsewhere", 218 | "enough", 219 | "ere", 220 | "et", 221 | "etc", 222 | "even", 223 | "evened", 224 | "evenest", 225 | "evens", 226 | "evenser", 227 | "evensest", 228 | "ever", 229 | "every", 230 | "everybody", 231 | "everyone", 232 | "everything", 233 | "everywhere", 234 | "ex", 235 | "except", 236 | "excepted", 237 | "excepting", 238 | "excepts", 239 | "exes", 240 | "f", 241 | "fact", 242 | "facts", 243 | "failing", 244 | "failings", 245 | "few", 246 | "fewer", 247 | "fewest", 248 | "figupon", 249 | "figuponed", 250 | "figuponing", 251 | "figupons", 252 | "five", 253 | "followthrough", 254 | "for", 255 | "forby", 256 | "forbye", 257 | "fore", 258 | "forer", 259 | "fores", 260 | "forever", 261 | "former", 262 | "formerer", 263 | "formerest", 264 | "formerly", 265 | "formers", 266 | "fornenst", 267 | "forwhy", 268 | "four", 269 | "fourscore", 270 | "frae", 271 | "from", 272 | "fs", 273 | "further", 274 | "furthered", 275 | "furtherer", 276 | "furtherest", 277 | "furthering", 278 | "furthermore", 279 | "furthers", 280 | "g", 281 | "get", 282 | "gets", 283 | "getting", 284 | "go", 285 | "gone", 286 | "good", 287 | "got", 288 | "gotta", 289 | "gotten", 290 | "h", 291 | "had", 292 | "hadst", 293 | "hae", 294 | "hardly", 295 | "has", 296 | "hast", 297 | "hath", 298 | "have", 299 | "haves", 300 | "having", 301 | "he", 302 | "hence", 303 | "her", 304 | "hereafter", 305 | "hereafters", 306 | "hereby", 307 | "herein", 308 | "hereupon", 309 | "hers", 310 | "herself", 311 | "him", 312 | "himself", 313 | "his", 314 | "hither", 315 | "hitherer", 316 | "hitherest", 317 | "hoo", 318 | "hoos", 319 | "how", 320 | "how-do-you-do", 321 | "howbeit", 322 | "howdoyoudo", 323 | "however", 324 | "huh", 325 | "humph", 326 | "i", 327 | "idem", 328 | "idemer", 329 | "idemest", 330 | "ie", 331 | "if", 332 | "ifs", 333 | "immediate", 334 | "immediately", 335 | "immediater", 336 | "immediatest", 337 | "in", 338 | "inasmuch", 339 | "inc", 340 | "indeed", 341 | "indicate", 342 | "indicated", 343 | "indicates", 344 | "indicating", 345 | "info", 346 | "information", 347 | "insofar", 348 | "instead", 349 | "into", 350 | "inward", 351 | "inwarder", 352 | "inwardest", 353 | "inwards", 354 | "is", 355 | "it", 356 | "its", 357 | "itself", 358 | "j", 359 | "k", 360 | "l", 361 | "latter", 362 | "latterer", 363 | "latterest", 364 | "latterly", 365 | "latters", 366 | "layabout", 367 | "layabouts", 368 | "less", 369 | "lest", 370 | "lot", 371 | "lots", 372 | "lotted", 373 | "lotting", 374 | "m", 375 | "main", 376 | "make", 377 | "many", 378 | "mauger", 379 | "maugre", 380 | "mayest", 381 | "me", 382 | "meanwhile", 383 | "meanwhiles", 384 | "midst", 385 | "midsts", 386 | 
"might", 387 | "mights", 388 | "more", 389 | "moreover", 390 | "most", 391 | "mostly", 392 | "much", 393 | "mucher", 394 | "muchest", 395 | "must", 396 | "musth", 397 | "musths", 398 | "musts", 399 | "my", 400 | "myself", 401 | "n", 402 | "natheless", 403 | "nathless", 404 | "neath", 405 | "neaths", 406 | "necessarier", 407 | "necessariest", 408 | "necessary", 409 | "neither", 410 | "nethe", 411 | "nethermost", 412 | "never", 413 | "nevertheless", 414 | "nigh", 415 | "nigher", 416 | "nighest", 417 | "nine", 418 | "no", 419 | "no-one", 420 | "nobodies", 421 | "nobody", 422 | "noes", 423 | "none", 424 | "noone", 425 | "nor", 426 | "nos", 427 | "not", 428 | "nothing", 429 | "nothings", 430 | "notwithstanding", 431 | "nowhere", 432 | "nowheres", 433 | "o", 434 | "of", 435 | "off", 436 | "offest", 437 | "offs", 438 | "often", 439 | "oftener", 440 | "oftenest", 441 | "oh", 442 | "on", 443 | "one", 444 | "oneself", 445 | "onest", 446 | "ons", 447 | "onto", 448 | "or", 449 | "orer", 450 | "orest", 451 | "other", 452 | "others", 453 | "otherwise", 454 | "otherwiser", 455 | "otherwisest", 456 | "ought", 457 | "oughts", 458 | "our", 459 | "ours", 460 | "ourself", 461 | "ourselves", 462 | "out", 463 | "outed", 464 | "outest", 465 | "outs", 466 | "outside", 467 | "outwith", 468 | "over", 469 | "overall", 470 | "overaller", 471 | "overallest", 472 | "overalls", 473 | "overs", 474 | "own", 475 | "owned", 476 | "owning", 477 | "owns", 478 | "owt", 479 | "p", 480 | "particular", 481 | "particularer", 482 | "particularest", 483 | "particularly", 484 | "particulars", 485 | "per", 486 | "perhaps", 487 | "plaintiff", 488 | "please", 489 | "pleased", 490 | "pleases", 491 | "plenties", 492 | "plenty", 493 | "pro", 494 | "probably", 495 | "provide", 496 | "provided", 497 | "provides", 498 | "providing", 499 | "q", 500 | "qua", 501 | "que", 502 | "quite", 503 | "r", 504 | "rath", 505 | "rathe", 506 | "rather", 507 | "rathest", 508 | "re", 509 | "really", 510 | "regarding", 511 | "relate", 512 | "related", 513 | "relatively", 514 | "res", 515 | "respecting", 516 | "respectively", 517 | "s", 518 | "said", 519 | "saider", 520 | "saidest", 521 | "same", 522 | "samer", 523 | "sames", 524 | "samest", 525 | "sans", 526 | "sanserif", 527 | "sanserifs", 528 | "sanses", 529 | "saved", 530 | "sayid", 531 | "sayyid", 532 | "seem", 533 | "seemed", 534 | "seeminger", 535 | "seemingest", 536 | "seemings", 537 | "seems", 538 | "send", 539 | "sent", 540 | "senza", 541 | "serious", 542 | "seriouser", 543 | "seriousest", 544 | "seven", 545 | "several", 546 | "severaler", 547 | "severalest", 548 | "shall", 549 | "shalled", 550 | "shalling", 551 | "shalls", 552 | "she", 553 | "should", 554 | "shoulded", 555 | "shoulding", 556 | "shoulds", 557 | "since", 558 | "sine", 559 | "sines", 560 | "sith", 561 | "six", 562 | "so", 563 | "sobeit", 564 | "soer", 565 | "soest", 566 | "some", 567 | "somebody", 568 | "somehow", 569 | "someone", 570 | "something", 571 | "sometime", 572 | "sometimer", 573 | "sometimes", 574 | "sometimest", 575 | "somewhat", 576 | "somewhere", 577 | "stop", 578 | "stopped", 579 | "such", 580 | "summat", 581 | "sup", 582 | "supped", 583 | "supping", 584 | "sups", 585 | "syn", 586 | "syne", 587 | "t", 588 | "ten", 589 | "than", 590 | "that", 591 | "the", 592 | "thee", 593 | "their", 594 | "theirs", 595 | "them", 596 | "themselves", 597 | "then", 598 | "thence", 599 | "thener", 600 | "thenest", 601 | "there", 602 | "thereafter", 603 | "thereby", 604 | "therefore", 605 | "therein", 606 | "therer", 607 | "therest", 608 | 
"thereupon", 609 | "these", 610 | "they", 611 | "thine", 612 | "thing", 613 | "things", 614 | "this", 615 | "thises", 616 | "thorough", 617 | "thorougher", 618 | "thoroughest", 619 | "thoroughly", 620 | "those", 621 | "thou", 622 | "though", 623 | "thous", 624 | "thouses", 625 | "three", 626 | "thro", 627 | "through", 628 | "througher", 629 | "throughest", 630 | "throughout", 631 | "thru", 632 | "thruer", 633 | "thruest", 634 | "thus", 635 | "thy", 636 | "thyself", 637 | "till", 638 | "tilled", 639 | "tilling", 640 | "tills", 641 | "to", 642 | "together", 643 | "too", 644 | "toward", 645 | "towarder", 646 | "towardest", 647 | "towards", 648 | "two", 649 | "u", 650 | "umpteen", 651 | "under", 652 | "underneath", 653 | "unless", 654 | "unlike", 655 | "unliker", 656 | "unlikest", 657 | "until", 658 | "unto", 659 | "up", 660 | "upon", 661 | "uponed", 662 | "uponing", 663 | "upons", 664 | "upped", 665 | "upping", 666 | "ups", 667 | "us", 668 | "use", 669 | "used", 670 | "usedest", 671 | "username", 672 | "usually", 673 | "v", 674 | "various", 675 | "variouser", 676 | "variousest", 677 | "verier", 678 | "veriest", 679 | "versus", 680 | "very", 681 | "via", 682 | "vis-a-vis", 683 | "vis-a-viser", 684 | "vis-a-visest", 685 | "viz", 686 | "vs", 687 | "w", 688 | "was", 689 | "wast", 690 | "we", 691 | "were", 692 | "wert", 693 | "what", 694 | "whatever", 695 | "whateverer", 696 | "whateverest", 697 | "whatsoever", 698 | "whatsoeverer", 699 | "whatsoeverest", 700 | "wheen", 701 | "when", 702 | "whenas", 703 | "whence", 704 | "whencesoever", 705 | "whenever", 706 | "whensoever", 707 | "where", 708 | "whereafter", 709 | "whereas", 710 | "whereby", 711 | "wherefrom", 712 | "wherein", 713 | "whereinto", 714 | "whereof", 715 | "whereon", 716 | "wheresoever", 717 | "whereto", 718 | "whereupon", 719 | "wherever", 720 | "wherewith", 721 | "wherewithal", 722 | "whether", 723 | "which", 724 | "whichever", 725 | "whichsoever", 726 | "while", 727 | "whiles", 728 | "whilst", 729 | "whither", 730 | "whithersoever", 731 | "whoever", 732 | "whomever", 733 | "whose", 734 | "whoso", 735 | "whosoever", 736 | "why", 737 | "with", 738 | "withal", 739 | "within", 740 | "without", 741 | "would", 742 | "woulded", 743 | "woulding", 744 | "woulds", 745 | "x", 746 | "y", 747 | "ye", 748 | "yet", 749 | "yon", 750 | "yond", 751 | "yonder", 752 | "you", 753 | "your", 754 | "yours", 755 | "yourself", 756 | "yourselves", 757 | "z", 758 | "zillion", 759 | }; 760 | 761 | /** Default maximum allowed token length */ 762 | public static final int DEFAULT_MAX_TOKEN_LENGTH = 255; 763 | 764 | private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH; 765 | 766 | public static final CharArraySet STOP_WORDS_SET = new CharArraySet(Version.LUCENE_43, 767 | Lists.newArrayList(STOPWORDS), true); 768 | 769 | public PorterAnalyzer() { 770 | super(Version.LUCENE_43, STOP_WORDS_SET); 771 | } 772 | 773 | /** 774 | * Set maximum allowed token length. If a token is seen that exceeds this length then it is 775 | * discarded. This setting only takes effect the next time tokenStream or tokenStream is called. 
776 | */ 777 | public void setMaxTokenLength(int length) { 778 | maxTokenLength = length; 779 | } 780 | 781 | public int getMaxTokenLength() { 782 | return maxTokenLength; 783 | } 784 | 785 | @Override 786 | protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) { 787 | final StandardTokenizer src = new StandardTokenizer(matchVersion, reader); 788 | src.setMaxTokenLength(maxTokenLength); 789 | TokenStream tok = new StandardFilter(matchVersion, src); 790 | tok = new LowerCaseFilter(matchVersion, tok); 791 | tok = new StopFilter(matchVersion, tok, stopwords); 792 | tok = new PorterStemFilter(tok); 793 | return new TokenStreamComponents(src, tok) { 794 | @Override 795 | protected void setReader(final Reader reader) throws IOException { 796 | src.setMaxTokenLength(PorterAnalyzer.this.maxTokenLength); 797 | super.setReader(reader); 798 | } 799 | }; 800 | } 801 | } 802 | -------------------------------------------------------------------------------- /src/main/java/org/clueweb/util/AnalyzerFactory.java: -------------------------------------------------------------------------------- 1 | package org.clueweb.util; 2 | 3 | import org.apache.lucene.analysis.Analyzer; 4 | import org.apache.lucene.util.Version; 5 | import org.clueweb.dictionary.PorterAnalyzer; 6 | 7 | public class AnalyzerFactory { 8 | 9 | public static Analyzer getAnalyzer(String analyzerType) { 10 | if (analyzerType.equals("standard")) { 11 | return new org.apache.lucene.analysis.standard.StandardAnalyzer(Version.LUCENE_43); 12 | } 13 | 14 | if (analyzerType.equals("porter")) { 15 | return new PorterAnalyzer(); 16 | } 17 | 18 | return null; 19 | } 20 | 21 | public static String getOptions() { 22 | return "standard|porter"; 23 | } 24 | } -------------------------------------------------------------------------------- /src/main/java/org/clueweb/util/QuickSort.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ClueWeb Tools: Hadoop tools for manipulating ClueWeb collections 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you 5 | * may not use this file except in compliance with the License. You may 6 | * obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 13 | * implied. See the License for the specific language governing 14 | * permissions and limitations under the License. 
15 | */ 16 | 17 | package org.clueweb.util; 18 | 19 | public class QuickSort { 20 | 21 | // quicksort a[left] to a[right] 22 | public static void quicksort(short[] keys, int[] counts, int left, int right) { 23 | if (right <= left) return; 24 | int i = partition(keys, counts, left, right); 25 | quicksort(keys, counts, left, i-1); 26 | quicksort(keys, counts, i+1, right); 27 | } 28 | 29 | // quicksort a[left] to a[right] 30 | public static void quicksort(int[] keys, int[] counts, short[] counts2, int left, int right) { 31 | if (right <= left) return; 32 | int i = partition(keys, counts, counts2, left, right); 33 | quicksort(keys, counts, counts2, left, i-1); 34 | quicksort(keys, counts, counts2, i+1, right); 35 | } 36 | 37 | 38 | public static void quicksort(short[] keys, int[] counts, Object[] counts2, int left, int right) { 39 | if (right <= left) return; 40 | int i = partition(keys, counts, counts2, left, right); 41 | quicksort(keys, counts, counts2, left, i-1); 42 | quicksort(keys, counts, counts2, i+1, right); 43 | } 44 | 45 | public static void quicksortWithSecondary(int[] keys, int[] counts, short[] counts2, int left, int right) { 46 | if (right <= left) return; 47 | int i = partitionWithSecondary(keys, counts, counts2, left, right); 48 | quicksortWithSecondary(keys, counts, counts2, left, i-1); 49 | quicksortWithSecondary(keys, counts, counts2, i+1, right); 50 | } 51 | 52 | public static void quicksortWithSecondary(int[] keys, int[] counts, long[] counts2, int left, int right) { 53 | if (right <= left) return; 54 | int i = partitionWithSecondary(keys, counts, counts2, left, right); 55 | quicksortWithSecondary(keys, counts, counts2, left, i-1); 56 | quicksortWithSecondary(keys, counts, counts2, i+1, right); 57 | } 58 | 59 | public static void quicksort(int[] keys, int[] counts, int left, int right) { 60 | if (right <= left) return; 61 | int i = partition(keys, counts, left, right); 62 | quicksort(keys, counts, left, i-1); 63 | quicksort(keys, counts, i+1, right); 64 | } 65 | 66 | public static void quicksort(Object[] keys, int[] counts, int left, int right) { 67 | if (right <= left) return; 68 | int i = partition(keys, counts, left, right); 69 | quicksort(keys, counts, left, i-1); 70 | quicksort(keys, counts, i+1, right); 71 | } 72 | 73 | // partition a[left] to a[right], assumes left < right 74 | private static int partition(short[] keys, int[] counts, int left, int right) { 75 | int i = left - 1; 76 | int j = right; 77 | while (true) { 78 | while (counts[++i] < counts[right]) // find item on left to swap 79 | ; // a[right] acts as sentinel 80 | while (counts[right] < counts[--j]) // find item on right to swap 81 | if (j == left) break; // don't go out-of-bounds 82 | if (i >= j) break; // check if pointers cross 83 | int swap = counts[i]; 84 | counts[i] = counts[j]; 85 | counts[j] = swap; // swap two elements into place 86 | 87 | short tmp = keys[i]; 88 | keys[i] = keys[j]; 89 | keys[j] = tmp; 90 | } 91 | int swap = counts[i]; 92 | counts[i] = counts[right]; 93 | counts[right] = swap; 94 | short tmp = keys[i]; 95 | keys[i] = keys[right]; 96 | keys[right] = tmp; 97 | return i; 98 | } 99 | 100 | private static int partition(Object[] keys, int[] counts, int left, int right) { 101 | int i = left - 1; 102 | int j = right; 103 | while (true) { 104 | while (counts[++i] < counts[right]) // find item on left to swap 105 | ; // a[right] acts as sentinel 106 | while (counts[right] < counts[--j]) // find item on right to swap 107 | if (j == left) break; // don't go out-of-bounds 108 | if (i >= 
j) break; // check if pointers cross 109 | int swap = counts[i]; 110 | counts[i] = counts[j]; 111 | counts[j] = swap; // swap two elements into place 112 | 113 | Object tmp = keys[i]; 114 | keys[i] = keys[j]; 115 | keys[j] = tmp; 116 | } 117 | int swap = counts[i]; 118 | counts[i] = counts[right]; 119 | counts[right] = swap; 120 | Object tmp = keys[i]; 121 | keys[i] = keys[right]; 122 | keys[right] = tmp; 123 | return i; 124 | } 125 | 126 | private static int partition(int[] keys, int[] counts, short[] counts2, int left, int right) { 127 | int i = left - 1; 128 | int j = right; 129 | while (true) { 130 | while (counts[++i] < counts[right]) // find item on left to swap 131 | ; // a[right] acts as sentinel 132 | while (counts[right] < counts[--j]) // find item on right to swap 133 | if (j == left) break; // don't go out-of-bounds 134 | if (i >= j) break; // check if pointers cross 135 | int swap = counts[i]; 136 | counts[i] = counts[j]; 137 | counts[j] = swap; // swap two elements into place 138 | 139 | int tmp = keys[i]; 140 | keys[i] = keys[j]; 141 | keys[j] = tmp; 142 | 143 | short tmp2 = counts2[i]; 144 | counts2[i] = counts2[j]; 145 | counts2[j] = tmp2; 146 | 147 | } 148 | int swap = counts[i]; 149 | counts[i] = counts[right]; 150 | counts[right] = swap; 151 | 152 | int tmp = keys[i]; 153 | keys[i] = keys[right]; 154 | keys[right] = tmp; 155 | 156 | short tmp2 = counts2[i]; 157 | counts2[i] = counts2[right]; 158 | counts2[right] = tmp2; 159 | return i; 160 | } 161 | 162 | private static int partition(short[] keys, int[] counts, Object[] counts2, int left, int right) { 163 | int i = left - 1; 164 | int j = right; 165 | while (true) { 166 | while (counts[++i] < counts[right]) // find item on left to swap 167 | ; // a[right] acts as sentinel 168 | while (counts[right] < counts[--j]) // find item on right to swap 169 | if (j == left) break; // don't go out-of-bounds 170 | if (i >= j) break; // check if pointers cross 171 | int swap = counts[i]; 172 | counts[i] = counts[j]; 173 | counts[j] = swap; // swap two elements into place 174 | 175 | short tmp = keys[i]; 176 | keys[i] = keys[j]; 177 | keys[j] = tmp; 178 | 179 | Object tmp2 = counts2[i]; 180 | counts2[i] = counts2[j]; 181 | counts2[j] = tmp2; 182 | 183 | } 184 | int swap = counts[i]; 185 | counts[i] = counts[right]; 186 | counts[right] = swap; 187 | 188 | short tmp = keys[i]; 189 | keys[i] = keys[right]; 190 | keys[right] = tmp; 191 | 192 | Object tmp2 = counts2[i]; 193 | counts2[i] = counts2[right]; 194 | counts2[right] = tmp2; 195 | return i; 196 | } 197 | 198 | private static int partitionWithSecondary(int[] keys, int[] counts, short[] counts2, int left, int right) { 199 | int i = left - 1; 200 | int j = right; 201 | while (true) { 202 | do{ 203 | i++; 204 | }while (counts[i] < counts[right] || (counts[i] == counts[right] && keys[i] < keys[right])) // find item on left to swap 205 | ; // a[right] acts as sentinel 206 | 207 | //while (counts[++i] < counts[right]) // find item on left to swap 208 | // ; // a[right] acts as sentinel 209 | do{ 210 | j--; 211 | }while (j!=left && (counts[right] < counts[j] || (counts[right] == counts[j] && keys[right] < keys[j]))) // find item on right to swap 212 | ; 213 | //while (counts[right] < counts[--j]) // find item on right to swap 214 | //  if (j == left) break; // don't go out-of-bounds 215 | if (i >= j) break; // check if pointers cross 216 | int swap = counts[i]; 217 | counts[i] = counts[j]; 218 | counts[j] = swap; // swap two elements into place 219 | 220 | int tmp = keys[i]; 221 | keys[i] = keys[j]; 222 | keys[j] = tmp; 223 | 224 | short tmp2 = counts2[i]; 225 | counts2[i] = counts2[j]; 226 | counts2[j] = tmp2; 227 | 228 | } 229 | int swap = counts[i]; 230 | counts[i] =
counts[right]; 231 | counts[right] = swap; 232 | 233 | int tmp = keys[i]; 234 | keys[i] = keys[right]; 235 | keys[right] = tmp; 236 | 237 | short tmp2 = counts2[i]; 238 | counts2[i] = counts2[right]; 239 | counts2[right] = tmp2; 240 | return i; 241 | } 242 | 243 | private static int partitionWithSecondary(int[] keys, int[] counts, long[] counts2, int left, int right) { 244 | int i = left - 1; 245 | int j = right; 246 | while (true) { 247 | do{ 248 | i++; 249 | }while (counts[i] < counts[right] || (counts[i] == counts[right] && keys[i] < keys[right])) // find item on left to swap 250 | ; // a[right] acts as sentinel 251 | 252 | //while (counts[++i] < counts[right]) // find item on left to swap 253 | // ; // a[right] acts as sentinel 254 | do{ 255 | j--; 256 | }while (j!=left && (counts[right] < counts[j] || (counts[right] == counts[j] && keys[right] < keys[j]))) // find item on right to swap 257 | ; 258 | //while (counts[right] < counts[--j]) // find item on right to swap 259 | //  if (j == left) break; // don't go out-of-bounds 260 | if (i >= j) break; // check if pointers cross 261 | int swap = counts[i]; 262 | counts[i] = counts[j]; 263 | counts[j] = swap; // swap two elements into place 264 | 265 | int tmp = keys[i]; 266 | keys[i] = keys[j]; 267 | keys[j] = tmp; 268 | 269 | long tmp2 = counts2[i]; 270 | counts2[i] = counts2[j]; 271 | counts2[j] = tmp2; 272 | 273 | } 274 | int swap = counts[i]; 275 | counts[i] = counts[right]; 276 | counts[right] = swap; 277 | 278 | int tmp = keys[i]; 279 | keys[i] = keys[right]; 280 | keys[right] = tmp; 281 | 282 | long tmp2 = counts2[i]; 283 | counts2[i] = counts2[right]; 284 | counts2[right] = tmp2; 285 | return i; 286 | } 287 | 288 | private static int partition(int[] keys, int[] counts, int left, int right) { 289 | int i = left - 1; 290 | int j = right; 291 | while (true) { 292 | while (counts[++i] < counts[right]) // find item on left to swap 293 | ; // a[right] acts as sentinel 294 | while (counts[right] < counts[--j]) // find item on right to swap 295 | if (j == left) break; // don't go out-of-bounds 296 | if (i >= j) break; // check if pointers cross 297 | int swap = counts[i]; 298 | counts[i] = counts[j]; 299 | counts[j] = swap; // swap two elements into place 300 | 301 | int tmp = keys[i]; 302 | keys[i] = keys[j]; 303 | keys[j] = tmp; 304 | } 305 | int swap = counts[i]; 306 | counts[i] = counts[right]; 307 | counts[right] = swap; 308 | 309 | int tmp = keys[i]; 310 | keys[i] = keys[right]; 311 | keys[right] = tmp; 312 | 313 | return i; 314 | } 315 | } 316 | -------------------------------------------------------------------------------- /src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=INFO, A1 2 | log4j.appender.A1=org.apache.log4j.ConsoleAppender 3 | log4j.appender.A1.layout=org.apache.log4j.PatternLayout 4 | 5 | # Print the date in ISO 8601 format 6 | log4j.appender.A1.layout.ConversionPattern=%d [%t] %-5p %c{1} - %m%n 7 | log4j.logger.com.ning.http.client=WARN 8 | -------------------------------------------------------------------------------- /src/test/java/org/clueweb/data/PForDocVectorTest.java: -------------------------------------------------------------------------------- 1 | package org.clueweb.data; 2 | 3 | import static org.junit.Assert.assertEquals; 4 | import static org.junit.Assert.assertTrue; 5 | 6 | import java.util.Random; 7 | 8 | import junit.framework.JUnit4TestAdapter; 9 | import me.lemire.integercompression.FastPFOR; 10 | import me.lemire.integercompression.IntWrapper; 11 | 12 | import org.junit.Test; 13 | 14 | import tl.lin.data.array.IntArrayWritable; 15 | 16 | public class PForDocVectorTest { 17 | private static final
Random RANDOM = new Random(); 18 | 19 | @Test 20 | public void testPFor1() throws Exception { 21 | int len = 256; 22 | FastPFOR p4 = new FastPFOR(); 23 | int[] doc = new int[len]; 24 | for (int i = 0; i tokens = AnalyzerUtils.parse(analyzer, 23 | "The U.S. Dept. of Justice has announced that Panasonic and its subsidiary Sanyo have been fined $56.5 million for their roles in price fixing conspiracies involving battery cells and car parts."); 24 | 25 | System.out.println(Joiner.on(",").join(tokens)); 26 | assertEquals(19, tokens.size()); 27 | } 28 | 29 | @Test 30 | public void test2() throws Exception { 31 | Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43); 32 | List tokens = AnalyzerUtils.parse(analyzer, 33 | "The U.S. Dept. of Justice has announced that Panasonic and its subsidiary Sanyo have been fined $56.5 million for their roles in price fixing conspiracies involving battery cells and car parts."); 34 | 35 | System.out.println(Joiner.on(",").join(tokens)); 36 | assertEquals(23, tokens.size()); 37 | } 38 | 39 | public static junit.framework.Test suite() { 40 | return new JUnit4TestAdapter(PorterAnalyzerTest.class); 41 | } 42 | } --------------------------------------------------------------------------------
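Note: the tests above call AnalyzerUtils.parse(analyzer, text), a helper that is not included in this listing. The snippet below is a minimal, self-contained sketch of the equivalent Lucene 4.3 token loop around the PorterAnalyzer defined earlier; the class name, field name, sample sentence, and the expected output shown in the comment are illustrative assumptions, not part of the repository.

import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.clueweb.dictionary.PorterAnalyzer;

public class PorterAnalyzerDemo {
  public static void main(String[] args) throws Exception {
    Analyzer analyzer = new PorterAnalyzer();
    // The analyzer chain applies StandardTokenizer, lowercasing, stopword
    // removal (Terrier stopword list), and Porter stemming.
    TokenStream stream = analyzer.tokenStream("body",
        new StringReader("Hadoop tools for manipulating ClueWeb collections"));
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    List<String> tokens = new ArrayList<String>();
    stream.reset();
    while (stream.incrementToken()) {
      tokens.add(term.toString());
    }
    stream.end();
    stream.close();
    // Expected (approximately): [hadoop, tool, manipul, clueweb, collect]
    System.out.println(tokens);
  }
}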