├── .gitattributes ├── .gitignore ├── README.md ├── clojure-examples ├── .gitignore ├── README.md ├── project.clj ├── src │ └── clojure_examples │ │ └── core.clj └── test │ └── clojure_examples │ └── core_test.clj ├── pom.xml └── src └── main └── java ├── edu └── cmu │ └── lemurproject │ ├── WarcFileInputFormat.java │ ├── WarcFileRecordReader.java │ ├── WarcHTMLResponseRecord.java │ ├── WarcRecord.java │ └── WritableWarcRecord.java └── org └── commoncrawl └── examples └── java_warc ├── IProcessWarcRecord.java ├── ReadS3Bucket.java ├── ReadWARC.java └── SampleProcessWarcRecord.java /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | *.class 3 | target 4 | example-warc* 5 | *~ 6 | clojure-examples/target 7 | clojure-examples/.lein* 8 | clojure-examples/clojure-examples-for-common-crawl.i* 9 | 10 | # Eclipse ignores 11 | .classpath 12 | .project 13 | .settings/ 14 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Java and Clojure examples for processing Common Crawl WARC files 2 | 3 | Mark Watson 2014/1/26 4 | 5 | There are two Java examples and one Clojure example for now (more to come): 6 | 7 | - ReadWARC - reads a local WARC file that was manually copied from S3 storage to your laptop 8 | - ReadS3Bucket - this should be run on an EC2 instance for fast access to S3 9 | - clojure-examples/src/clojure-examples/core.clj - reads a local WARC file that was manually copied from S3 storage to your laptop 10 | 11 | A JDK 1.7 or later is required (JDK 1.6 will not work). 
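The three examples listed above all follow the same pattern: wrap the gzipped WARC file in a stream, decompress it with `GZIPInputStream`, and pull records off the resulting `DataInputStream`. The sketch below shows that stream stack using only JDK classes, with an in-memory payload standing in for a real `.warc.gz` segment and a toy header scan standing in for the bundled CMU reader (class and method names here are illustrative, not part of this repository):

```java
import java.io.*;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

public class WarcStreamSketch {

    // Gzip a byte array in memory so the sketch runs without a real segment file.
    static byte[] gzip(byte[] data) throws IOException {
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        try (GZIPOutputStream gz = new GZIPOutputStream(bos)) {
            gz.write(data);
        }
        return bos.toByteArray();
    }

    // Toy stand-in for the CMU reader: scan header lines until the blank line
    // that ends a WARC header block and return one header field's value.
    static String headerValue(BufferedReader in, String field) throws IOException {
        String line;
        while ((line = in.readLine()) != null && !line.isEmpty()) {
            int colon = line.indexOf(':');
            if (colon > 0 && line.substring(0, colon).trim().equals(field)) {
                return line.substring(colon + 1).trim();
            }
        }
        return null;
    }

    public static void main(String[] args) throws IOException {
        String record = "WARC/0.18\n"
                + "WARC-Type: response\n"
                + "WARC-Target-URI: http://example.com/\n"
                + "Content-Length: 0\n"
                + "\n";
        byte[] gzipped = gzip(record.getBytes("UTF-8"));

        // Same stack the examples use: source -> GZIPInputStream -> reader.
        // (ReadWARC and core.clj open a FileInputStream on the .warc.gz here.)
        BufferedReader in = new BufferedReader(new InputStreamReader(
                new GZIPInputStream(new ByteArrayInputStream(gzipped)), "UTF-8"));
        String versionLine = in.readLine();
        String uri = headerValue(in, "WARC-Target-URI");
        System.out.println(versionLine + " -> " + uri);
    }
}
```

With the real classes on the classpath, the loop body becomes `WarcRecord.readNextWarcRecord(in)` on a `DataInputStream`, exactly as core.clj does.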
12 | 13 | Special thanks to the developers of the edu.cmu.lemurproject package from Carnegie Mellon University. This code 14 | reads WARC files, and its source code is included in the src subdirectory. 15 | 16 | I have just started experimenting with Common Crawl data. I plan on adding a Hadoop/Elastic MapReduce example 17 | and also more examples using other JVM languages like Clojure and JRuby. 18 | 19 | ## ReadWARC 20 | 21 | Assuming that you have the aws command line tools installed, you can list the contents of a crawl using: 22 | 23 | ```````` 24 | aws s3 ls s3://commoncrawl/crawl-data/CC-MAIN-2014-10/ --recursive | head -6 25 | ```````` 26 | 27 | You can copy one segment file to your laptop (segment files are less than 1 gigabyte each) using: 28 | 29 | ```````` 30 | aws s3 cp s3://commoncrawl/crawl-data/CC-MAIN-2014-10/segments/1394023864559/warc/CC-MAIN-20140305125104-00002-ip-10-183-142-35.ec2.internal.warc.gz . 31 | ```````` 32 | 33 | Then run this example using: 34 | 35 | ```````` 36 | mvn install 37 | mvn exec:java -Dexec.mainClass=org.commoncrawl.examples.java_warc.ReadWARC 38 | ```````` 39 | 40 | 41 | ## ReadS3Bucket 42 | 43 | You can set the maximum number of segment files to process using the **max** argument: 44 | 45 | ```````` 46 | public class ReadS3Bucket { 47 | static public void process(AmazonS3 s3, String bucketName, String prefix, int max) { 48 | ```````` 49 | 50 | As you can see in the example code, I pass the bucket and prefix as: 51 | 52 | ```````` 53 | process(s3, "commoncrawl", "crawl-data/CC-MAIN-2014-10", 2); 54 | ```````` 55 | 56 | Note: using the Common Crawl AMI (I run it on a medium EC2 instance), I installed JDK 1.7 (required for 57 | the edu.cmu.lemurproject package): 58 | 59 | ```````` 60 | sudo yum install java-1.7.0-openjdk-devel.x86_64 61 | ```````` 62 | 63 | TODO: In addition to installing Java 7, you also need to configure it 64 | using 65 | 66 | `sudo alternatives --config javac` 67 | `sudo alternatives --config java` 68 | 69 |
TODO: Maven needs to be installed, and it's not available through yum 70 | without some gymnastics. 71 | 72 | After cloning the GitHub repository to get these examples on an EC2 instance: 73 | 74 | 75 | ```````` 76 | git clone https://github.com/commoncrawl/example-warc-java.git 77 | cd example-warc-java 78 | ```````` 79 | 80 | build and run using: 81 | 82 | ```````` 83 | mvn install 84 | mvn exec:java -Dexec.mainClass=org.commoncrawl.examples.java_warc.ReadS3Bucket 85 | ```````` 86 | 87 | Note: I also tested this using a micro EC2 instance. The time to process two gzipped segment files 88 | (each a little less than 1 gigabyte) is about 45 seconds. 89 | 90 | ## Clojure Examples 91 | 92 | You need to install the commoncrawl JAR file in your local Maven repository: 93 | 94 | ```````` 95 | mvn install:install-file -Durl=file:repo -DpomFile=pom.xml -DgroupId=local -DartifactId=commoncrawl -Dversion=0.0.1 -Dpackaging=jar -Dfile=target/commoncrawl-0.0.1.jar 96 | ```````` 97 | 98 | Then you can run: 99 | 100 | ```````` 101 | cd clojure-examples 102 | lein deps 103 | lein test 104 | ```````` 105 | 106 | ## License 107 | 108 | This code is licensed under the Apache 2 license. Please give back to 109 | Common Crawl if you found it useful.
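For readers digging into the bundled sources further down, the framing performed by `WarcRecord.readNextWarcRecord` — scan for the `WARC/` version line, collect header lines until a blank line, then read exactly `Content-Length` bytes of content — can be sketched as a toy, in-memory parser. This is a simplification, not the bundled implementation (the real code works incrementally off a `DataInputStream` and also hand-decodes UTF-8 and tracks CR/LF line endings):

```java
import java.nio.charset.StandardCharsets;
import java.util.Arrays;

public class WarcFramingSketch {

    // Toy version of the framing in WarcRecord.readNextWarcRecord: locate the
    // WARC/ version line, read header lines up to the blank line, then slice
    // out Content-Length bytes as the record content.
    static byte[] nextRecordContent(byte[] warc) {
        // ISO-8859-1 maps bytes 1:1 to chars, so string offsets match byte offsets.
        String text = new String(warc, StandardCharsets.ISO_8859_1);
        int start = text.indexOf("WARC/");
        if (start < 0) return null;
        int headerEnd = text.indexOf("\n\n", start);
        if (headerEnd < 0) return null;

        int contentLength = -1;
        for (String line : text.substring(start, headerEnd).split("\n")) {
            String[] parts = line.split(":", 2);   // same split the CMU code uses
            if (parts.length == 2 && parts[0].equals("Content-Length")) {
                contentLength = Integer.parseInt(parts[1].trim());
            }
        }
        if (contentLength < 0) return null;
        int contentStart = headerEnd + 2;          // skip the blank line
        return Arrays.copyOfRange(warc, contentStart, contentStart + contentLength);
    }

    public static void main(String[] args) {
        byte[] record = ("WARC/0.18\nWARC-Type: response\nContent-Length: 5\n\nhello")
                .getBytes(StandardCharsets.ISO_8859_1);
        System.out.println(new String(nextRecordContent(record), StandardCharsets.ISO_8859_1));
    }
}
```

Unlike this sketch, the real reader never loads a whole file into memory, which is what lets it stream multi-gigabyte gzipped segments.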
110 | 111 | 112 | -------------------------------------------------------------------------------- /clojure-examples/.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | /lib 3 | /classes 4 | /checkouts 5 | pom.xml 6 | pom.xml.asc 7 | *.jar 8 | *.class 9 | .lein-deps-sum 10 | .lein-failures 11 | .lein-plugins 12 | .lein-repl-history 13 | -------------------------------------------------------------------------------- /clojure-examples/README.md: -------------------------------------------------------------------------------- 1 | # clojure-examples for reading Common Crawl WARC files 2 | 3 | Note: this is still a work in progress, but if you use Clojure, please try this simple example and provide feedback. 4 | 5 | First, build the parent Java project and make sure to install the generated maven project JAR file into 6 | your local maven repository using: 7 | 8 | ```````` 9 | mvn install:install-file -Durl=file:repo -DpomFile=pom.xml -DgroupId=local -DartifactId=commoncrawl -Dversion=0.0.1 -Dpackaging=jar -Dfile=target/commoncrawl-0.0.1.jar 10 | ```````` 11 | 12 | in the parent directory. Copy a single test WARC file from S3: 13 | 14 | ```````` 15 | aws s3 cp s3://commoncrawl/crawl-data/CC-MAIN-2013-48/segments/1386163035819/warc/CC-MAIN-20131204131715-00002-ip-10-33-133-15.ec2.internal.warc.gz . 
16 | ```````` 17 | 18 | Then you can run 19 | 20 | ```````` 21 | lein test 22 | ```````` 23 | 24 | 25 | -------------------------------------------------------------------------------- /clojure-examples/project.clj: -------------------------------------------------------------------------------- 1 | (defproject clojure-examples "0.1.0-SNAPSHOT" 2 | :description "FIXME: write description" 3 | :url "http://example.com/FIXME" 4 | :license {:name "Eclipse Public License" 5 | :url "http://www.eclipse.org/legal/epl-v10.html"} 6 | :dependencies [[org.clojure/clojure "1.5.1"] 7 | [local/commoncrawl "0.0.1"]] 8 | ) 9 | -------------------------------------------------------------------------------- /clojure-examples/src/clojure_examples/core.clj: -------------------------------------------------------------------------------- 1 | (ns clojure-examples.core 2 | (:import [java.io DataInputStream FileInputStream]) 3 | (:import [java.util.zip GZIPInputStream]) 4 | (:import [edu.cmu.lemurproject WarcRecord WarcHTMLResponseRecord]) 5 | ) 6 | 7 | 8 | (defn single-warc-file [] 9 | (let [input-warc-file "CC-MAIN-20131204131715-00002-ip-10-33-133-15.ec2.internal.warc.gz" 10 | gz-input-stream (GZIPInputStream. (FileInputStream. input-warc-file)) 11 | in-stream (DataInputStream. gz-input-stream)] 12 | (defn read-warc-record [] 13 | (let [r (WarcRecord/readNextWarcRecord in-stream)] 14 | (if (= (.getHeaderRecordType r) "response") 15 | (let [html-record (WarcHTMLResponseRecord. 
r) 16 | uri (.getTargetURI html-record) 17 | content (.getContentUTF8 (.getRawRecord html-record))] 18 | (println uri) 19 | ;;(println content) 20 | )))) 21 | (dotimes [n 50] (read-warc-record)))) 22 | 23 | ;; (single-warc-file) -------------------------------------------------------------------------------- /clojure-examples/test/clojure_examples/core_test.clj: -------------------------------------------------------------------------------- 1 | (ns clojure-examples.core-test 2 | (:use clojure.test 3 | clojure-examples.core)) 4 | 5 | (single-warc-file) 6 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | org.commoncrawl 6 | commoncrawl 7 | 0.0.1 8 | jar 9 | 10 | 11 | 12 | Java examples for using Common Crawl WARC files from S3 13 | 14 | 15 | 16 | 17 | com.amazonaws 18 | aws-java-sdk 19 | 1.0.002 20 | 21 | 22 | commons-codec 23 | commons-codec 24 | 1.4 25 | 26 | 27 | commons-io 28 | commons-io 29 | 2.4 30 | 31 | 32 | commons-logging 33 | commons-logging 34 | 1.1.3 35 | 36 | 37 | com.facebook.hadoop 38 | hadoop-core 39 | 0.20.0 40 | 41 | 42 | commons-httpclient 43 | commons-httpclient 44 | 3.1 45 | 46 | 47 | org.apache.httpcomponents 48 | httpcore 49 | 4.2.5 50 | 51 | 52 | org.codehaus.jackson 53 | jackson-core-asl 54 | 1.9.10 55 | 56 | 57 | org.codehaus.jackson 58 | jackson-mapper-asl 59 | 1.9.10 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | org.apache.maven.plugins 68 | maven-compiler-plugin 69 | 3.1 70 | 71 | 1.7 72 | 1.7 73 | 74 | 75 | 76 | 77 | org.codehaus.mojo 78 | exec-maven-plugin 79 | 1.1.1 80 | 81 | 82 | exec 83 | 84 | java 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | -------------------------------------------------------------------------------- /src/main/java/edu/cmu/lemurproject/WarcFileInputFormat.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Hadoop 
FileInputFormat for reading WARC files 3 | * 4 | * (C) 2009 - Carnegie Mellon University 5 | * 6 | * 1. Redistributions of this source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * 2. The names "Lemur", "Indri", "University of Massachusetts", 9 | * "Carnegie Mellon", and "lemurproject" must not be used to 10 | * endorse or promote products derived from this software without 11 | * prior written permission. To obtain permission, contact 12 | * license@lemurproject.org. 13 | * 14 | * 4. Products derived from this software may not be called "Lemur" or "Indri" 15 | * nor may "Lemur" or "Indri" appear in their names without prior written 16 | * permission of The Lemur Project. To obtain permission, 17 | * contact license@lemurproject.org. 18 | * 19 | * THIS SOFTWARE IS PROVIDED BY THE LEMUR PROJECT AS PART OF THE CLUEWEB09 20 | * PROJECT AND OTHER CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED 21 | * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 22 | * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN 23 | * NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY 24 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 28 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING 29 | * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 30 | * POSSIBILITY OF SUCH DAMAGE. 31 | * 32 | * @author mhoy@cs.cmu.edu (Mark J. 
Hoy) 33 | */ 34 | 35 | package edu.cmu.lemurproject; 36 | 37 | import java.io.IOException; 38 | import org.apache.hadoop.fs.FileSystem; 39 | import org.apache.hadoop.fs.Path; 40 | import org.apache.hadoop.io.LongWritable; 41 | import org.apache.hadoop.mapred.FileInputFormat; 42 | import org.apache.hadoop.mapred.InputSplit; 43 | import org.apache.hadoop.mapred.JobConf; 44 | import org.apache.hadoop.mapred.RecordReader; 45 | import org.apache.hadoop.mapred.Reporter; 46 | 47 | public class WarcFileInputFormat extends FileInputFormat { 48 | 49 | /** 50 | * Don't allow the files to be split! 51 | */ 52 | @Override 53 | protected boolean isSplitable(FileSystem fs, Path filename) { 54 | // ensure the input files are not splittable! 55 | return false; 56 | } 57 | 58 | /** 59 | * Just return the record reader 60 | */ 61 | public RecordReader getRecordReader(InputSplit split, JobConf conf, Reporter reporter) throws IOException { 62 | return new WarcFileRecordReader(conf, split); 63 | } 64 | } 65 | 66 | -------------------------------------------------------------------------------- /src/main/java/edu/cmu/lemurproject/WarcFileRecordReader.java: -------------------------------------------------------------------------------- 1 | /** 2 | * A Hadoop record reader for reading Warc Records 3 | * 4 | * (C) 2009 - Carnegie Mellon University 5 | * 6 | * 1. Redistributions of this source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * 2. The names "Lemur", "Indri", "University of Massachusetts", 9 | * "Carnegie Mellon", and "lemurproject" must not be used to 10 | * endorse or promote products derived from this software without 11 | * prior written permission. To obtain permission, contact 12 | * license@lemurproject.org. 13 | * 14 | * 4. 
Products derived from this software may not be called "Lemur" or "Indri" 15 | * nor may "Lemur" or "Indri" appear in their names without prior written 16 | * permission of The Lemur Project. To obtain permission, 17 | * contact license@lemurproject.org. 18 | * 19 | * THIS SOFTWARE IS PROVIDED BY THE LEMUR PROJECT AS PART OF THE CLUEWEB09 20 | * PROJECT AND OTHER CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED 21 | * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 22 | * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN 23 | * NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY 24 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 28 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING 29 | * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 30 | * POSSIBILITY OF SUCH DAMAGE. 31 | * 32 | * @author mhoy@cs.cmu.edu (Mark J. 
Hoy) 33 | */ 34 | 35 | package edu.cmu.lemurproject; 36 | 37 | import edu.cmu.lemurproject.WarcRecord; 38 | import java.io.DataInputStream; 39 | import java.io.IOException; 40 | import org.apache.commons.logging.Log; 41 | import org.apache.commons.logging.LogFactory; 42 | import org.apache.hadoop.conf.Configuration; 43 | import org.apache.hadoop.fs.FSDataInputStream; 44 | import org.apache.hadoop.fs.FileSystem; 45 | import org.apache.hadoop.fs.Path; 46 | import org.apache.hadoop.io.LongWritable; 47 | import org.apache.hadoop.io.Writable; 48 | import org.apache.hadoop.io.WritableComparable; 49 | import org.apache.hadoop.io.compress.CompressionCodec; 50 | import org.apache.hadoop.mapred.FileSplit; 51 | import org.apache.hadoop.mapred.InputSplit; 52 | import org.apache.hadoop.mapred.MultiFileSplit; 53 | import org.apache.hadoop.mapred.RecordReader; 54 | import org.apache.hadoop.util.ReflectionUtils; 55 | 56 | public class WarcFileRecordReader implements RecordReader { 57 | public static final Log LOG = LogFactory.getLog(WarcFileRecordReader.class); 58 | 59 | private long recordNumber=1; 60 | 61 | private Path[] filePathList=null; 62 | private int currentFilePath=-1; 63 | 64 | private FSDataInputStream currentFile=null; 65 | private CompressionCodec compressionCodec=null; 66 | private DataInputStream compressionInput=null; 67 | private FileSystem fs=null; 68 | private long totalFileSize=0; 69 | private long totalNumBytesRead=0; 70 | 71 | public WarcFileRecordReader(Configuration conf, InputSplit split) throws IOException { 72 | if (split instanceof FileSplit) { 73 | this.filePathList=new Path[1]; 74 | this.filePathList[0]=((FileSplit)split).getPath(); 75 | } else if (split instanceof MultiFileSplit) { 76 | this.filePathList=((MultiFileSplit)split).getPaths(); 77 | } else { 78 | throw new IOException("InputSplit is not a file split or a multi-file split - aborting"); 79 | } 80 | 81 | fs = this.filePathList[0].getFileSystem(conf); 82 | 83 | // get the total file sizes 84 
| for (int i=0; i < filePathList.length; i++) { 85 | totalFileSize += fs.getFileStatus(filePathList[i]).getLen(); 86 | } 87 | 88 | Class codecClass=null; 89 | 90 | try { 91 | codecClass=conf.getClassByName("org.apache.hadoop.io.compress.GzipCodec").asSubclass(CompressionCodec.class); 92 | compressionCodec=(CompressionCodec)ReflectionUtils.newInstance(codecClass, conf); 93 | } catch (ClassNotFoundException cnfEx) { 94 | compressionCodec=null; 95 | LOG.info("!!! ClassNotFoun Exception thrown setting Gzip codec"); 96 | } 97 | 98 | openNextFile(); 99 | } 100 | 101 | private boolean openNextFile() { 102 | try { 103 | if (compressionInput!=null) { 104 | compressionInput.close(); 105 | } else if (currentFile!=null) { 106 | currentFile.close(); 107 | } 108 | currentFile=null; 109 | compressionInput=null; 110 | 111 | currentFilePath++; 112 | if (currentFilePath >= filePathList.length) { return false; } 113 | 114 | currentFile=this.fs.open(filePathList[currentFilePath]); 115 | 116 | // is the file gzipped? 
117 | if ((compressionCodec!=null) && (filePathList[currentFilePath].getName().endsWith("gz"))) { 118 | compressionInput=new DataInputStream(compressionCodec.createInputStream(currentFile)); 119 | LOG.info("Compression enabled"); 120 | } 121 | 122 | } catch (IOException ex) { 123 | LOG.info("IOError opening " + filePathList[currentFilePath].toString() + " - message: " + ex.getMessage()); 124 | return false; 125 | } 126 | return true; 127 | } 128 | 129 | public boolean next(LongWritable key, WritableWarcRecord value) throws IOException { 130 | DataInputStream whichStream=null; 131 | if (compressionInput!=null) { 132 | whichStream=compressionInput; 133 | } else if (currentFile!=null) { 134 | whichStream=currentFile; 135 | } 136 | 137 | if (whichStream==null) { return false; } 138 | 139 | WarcRecord newRecord=WarcRecord.readNextWarcRecord(whichStream); 140 | if (newRecord==null) { 141 | // try advancing the file 142 | if (openNextFile()) { 143 | newRecord=WarcRecord.readNextWarcRecord(whichStream); 144 | } 145 | 146 | if (newRecord==null) { return false; } 147 | } 148 | 149 | totalNumBytesRead += (long)newRecord.getTotalRecordLength(); 150 | newRecord.setWarcFilePath(filePathList[currentFilePath].toString()); 151 | 152 | // now, set our output variables 153 | value.setRecord(newRecord); 154 | key.set(recordNumber); 155 | 156 | recordNumber++; 157 | return true; 158 | } 159 | 160 | public LongWritable createKey() { 161 | return new LongWritable(); 162 | } 163 | 164 | public WritableWarcRecord createValue() { 165 | return new WritableWarcRecord(); 166 | } 167 | 168 | public long getPos() throws IOException { 169 | return totalNumBytesRead; 170 | } 171 | 172 | public void close() throws IOException { 173 | totalNumBytesRead=totalFileSize; 174 | if (compressionInput!=null) { 175 | compressionInput.close(); 176 | } else if (currentFile!=null) { 177 | currentFile.close(); 178 | } 179 | } 180 | 181 | public float getProgress() throws IOException { 182 | if 
(compressionInput!=null) { 183 | if (filePathList.length==0) { return 1.0f; } 184 | // return which file - can't do extact byte matching 185 | return (float)currentFilePath / (float)(filePathList.length); 186 | } 187 | if (totalFileSize==0) { return 0.0f; } 188 | return (float)totalNumBytesRead/(float)totalFileSize; 189 | } 190 | 191 | } 192 | -------------------------------------------------------------------------------- /src/main/java/edu/cmu/lemurproject/WarcHTMLResponseRecord.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Container for a Warc Record of type "response" 3 | * 4 | * (C) 2009 - Carnegie Mellon University 5 | * 6 | * 1. Redistributions of this source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * 2. The names "Lemur", "Indri", "University of Massachusetts", 9 | * "Carnegie Mellon", and "lemurproject" must not be used to 10 | * endorse or promote products derived from this software without 11 | * prior written permission. To obtain permission, contact 12 | * license@lemurproject.org. 13 | * 14 | * 4. Products derived from this software may not be called "Lemur" or "Indri" 15 | * nor may "Lemur" or "Indri" appear in their names without prior written 16 | * permission of The Lemur Project. To obtain permission, 17 | * contact license@lemurproject.org. 18 | * 19 | * THIS SOFTWARE IS PROVIDED BY THE LEMUR PROJECT AS PART OF THE CLUEWEB09 20 | * PROJECT AND OTHER CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED 21 | * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 22 | * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN 23 | * NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY 24 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 28 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING 29 | * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 30 | * POSSIBILITY OF SUCH DAMAGE. 31 | * 32 | * @author mhoy@cs.cmu.edu (Mark J. Hoy) 33 | */ 34 | 35 | package edu.cmu.lemurproject; 36 | 37 | import java.io.BufferedReader; 38 | import java.io.ByteArrayInputStream; 39 | import java.io.IOException; 40 | import java.io.InputStreamReader; 41 | import java.net.URISyntaxException; 42 | import java.util.HashSet; 43 | import java.util.Iterator; 44 | import java.util.Vector; 45 | import java.util.regex.Matcher; 46 | import java.util.regex.Pattern; 47 | 48 | public class WarcHTMLResponseRecord { 49 | 50 | private WarcRecord warcRecord=new WarcRecord(); 51 | 52 | private static String SINGLE_SPACE=" "; 53 | 54 | private static Pattern ALL_HTML_TAGS=Pattern.compile("<(.*?)>"); 55 | private static Pattern A_HREF_PATTERN=Pattern.compile("[aA].+?[hH][rR][eE][fF]=['\"](.+?)['\"].*?"); 56 | private static Pattern AREA_HREF_PATTERN=Pattern.compile("[aA][rR][eE][aA].+?[hH][rR][eE][fF]=['\"](.*?)['\"].*?"); 57 | private static Pattern FRAME_SRC_PATTERN=Pattern.compile("[fF][rR][aA][mM][eE].+?[sS][rR][cC]=['\"](.*?)['\"].*?"); 58 | private static Pattern IFRAME_SRC_PATTERN=Pattern.compile("[iI][fF][rR][aA][mM][eE].+?[sS][rR][cC]=['\"](.*?)['\"].*?"); 59 | private static Pattern HTTP_START_PATTERN=Pattern.compile("^[hH][tT][tT][pP][sS]?://.*"); 60 | 61 | // create our pattern set 62 | private Vector patternSet=new Vector(); 63 | 64 | /** 65 | * Default constructor 66 | */ 67 | public 
WarcHTMLResponseRecord() { 68 | createPatternSet(); 69 | } 70 | 71 | /** 72 | * Copy constructor 73 | * @param o 74 | */ 75 | public WarcHTMLResponseRecord(WarcHTMLResponseRecord o) { 76 | this.warcRecord.set(o.warcRecord); 77 | createPatternSet(); 78 | } 79 | 80 | /** 81 | * Constructor creation from a generic WARC record 82 | * @param o 83 | */ 84 | public WarcHTMLResponseRecord(WarcRecord o) { 85 | if (o.getHeaderRecordType().compareToIgnoreCase("response")==0) { 86 | this.warcRecord.set(o); 87 | } 88 | createPatternSet(); 89 | } 90 | 91 | private void createPatternSet() { 92 | patternSet.add(A_HREF_PATTERN); 93 | patternSet.add(AREA_HREF_PATTERN); 94 | patternSet.add(FRAME_SRC_PATTERN); 95 | patternSet.add(IFRAME_SRC_PATTERN); 96 | } 97 | 98 | public void setRecord(WarcRecord o) { 99 | if (o.getHeaderRecordType().compareToIgnoreCase("response")==0) { 100 | this.warcRecord.set(o); 101 | } 102 | } 103 | 104 | public WarcRecord getRawRecord() { 105 | return warcRecord; 106 | } 107 | 108 | public String getTargetURI() { 109 | return warcRecord.getHeaderMetadataItem("WARC-Target-URI"); 110 | } 111 | 112 | public String getTargetTrecID() { 113 | return warcRecord.getHeaderMetadataItem("WARC-TREC-ID"); 114 | } 115 | 116 | private String getNormalizedContentURL(String pageURL, String contentURL) { 117 | String fixedContentURL = contentURL; 118 | try { 119 | // resolve any potentially relative paths to the full URL based on the page 120 | java.net.URI baseURI = new java.net.URI(pageURL); 121 | // ensure that the content doesn't have query parameters - if so, strip them 122 | int contentParamIndex = contentURL.indexOf("?"); 123 | if (contentParamIndex > 0) { 124 | fixedContentURL = contentURL.substring(0, contentParamIndex); 125 | } 126 | java.net.URI resolvedURI = baseURI.resolve(fixedContentURL); 127 | return resolvedURI.toString(); 128 | } catch (URISyntaxException ex) { 129 | } catch (java.lang.IllegalArgumentException iaEx) { 130 | return fixedContentURL; 131 | } 
catch (Exception gEx) { 132 | } 133 | return ""; 134 | } 135 | 136 | private HashSet getMatchesOutputSet(Vector tagSet, String baseURL) { 137 | HashSet retSet=new HashSet(); 138 | 139 | Iterator vIter=tagSet.iterator(); 140 | while (vIter.hasNext()) { 141 | String thisCheckPiece=vIter.next(); 142 | Iterator pIter=patternSet.iterator(); 143 | boolean hasAdded=false; 144 | while (!hasAdded && pIter.hasNext()) { 145 | Pattern thisPattern=pIter.next(); 146 | Matcher matcher=thisPattern.matcher(thisCheckPiece); 147 | if (matcher.find() && (matcher.groupCount() > 0)) { 148 | String thisMatch=getNormalizedContentURL(baseURL, matcher.group(1)); 149 | if (HTTP_START_PATTERN.matcher(thisMatch).matches()) { 150 | if (!retSet.contains(thisMatch) && !baseURL.equals(thisMatch)) { 151 | retSet.add(thisMatch); 152 | hasAdded=true; 153 | } // end if (!retSet.contains(thisMatch)) 154 | } // end if (HTTP_START_PATTERN.matcher(thisMatch).matches()) 155 | } // end if (matcher.find() && (matcher.groupCount() > 0)) 156 | matcher.reset(); 157 | } // end while (!hasAdded && pIter.hasNext()) 158 | } // end while (vIter.hasNext()) 159 | 160 | return retSet; 161 | } 162 | 163 | /** 164 | * Gets a vector of normalized URLs (normalized to this target URI) 165 | * of the outlinks of the page 166 | * @return 167 | */ 168 | public Vector getURLOutlinks() { 169 | Vector retVec = new Vector(); 170 | 171 | String baseURL = getTargetURI(); 172 | if ((baseURL == null) || (baseURL.length() == 0)) { 173 | return retVec; 174 | } 175 | 176 | byte[] contentBytes=warcRecord.getContent(); 177 | 178 | ByteArrayInputStream contentStream=new ByteArrayInputStream(contentBytes); 179 | BufferedReader inReader=new BufferedReader(new InputStreamReader(contentStream)); 180 | 181 | // forward to the first \n\n 182 | try { 183 | boolean inHeader=true; 184 | String line=null; 185 | while (inHeader && ((line=inReader.readLine())!=null)) { 186 | if (line.trim().length()==0) { 187 | inHeader=false; 188 | } 189 | } 190 | 191 
| // now we have the rest of the lines 192 | // read them all into a string buffer 193 | // to remove all new lines 194 | Vector htmlTags=new Vector(); 195 | while ((line=inReader.readLine())!=null) { 196 | // get all HTML tags from the line... 197 | Matcher HTMLMatcher=ALL_HTML_TAGS.matcher(line); 198 | while (HTMLMatcher.find()) { 199 | htmlTags.add(HTMLMatcher.group(1)); 200 | } 201 | } 202 | 203 | HashSet retSet=getMatchesOutputSet(htmlTags, baseURL); 204 | 205 | Iterator oIter=retSet.iterator(); 206 | while (oIter.hasNext()) { 207 | String thisValue=oIter.next(); 208 | if (!thisValue.equals(baseURL)) { 209 | retVec.add(thisValue); 210 | } 211 | } 212 | 213 | } catch (IOException ioEx) { 214 | retVec.clear(); 215 | } 216 | 217 | return retVec; 218 | } 219 | 220 | } 221 | -------------------------------------------------------------------------------- /src/main/java/edu/cmu/lemurproject/WarcRecord.java: -------------------------------------------------------------------------------- 1 | /* 2 | Lemur License Agreement 3 | 4 | Copyright (c) 2000-2011 The Lemur Project. All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions 8 | are met: 9 | 10 | 1. Redistributions of source code must retain the above copyright 11 | notice, this list of conditions and the following disclaimer. 12 | 13 | 2. Redistributions in binary form must reproduce the above copyright 14 | notice, this list of conditions and the following disclaimer in 15 | the documentation and/or other materials provided with the 16 | distribution. 17 | 18 | 3. The names "Lemur", "Indri", "University of Massachusetts" and 19 | "Carnegie Mellon" must not be used to endorse or promote products 20 | derived from this software without prior written permission. To 21 | obtain permission, contact license@lemurproject.org 22 | 23 | 4. 
Products derived from this software may not be called "Lemur" or "Indri" 24 | nor may "Lemur" or "Indri" appear in their names without prior written 25 | permission of The Lemur Project. To obtain permission, 26 | contact license@lemurproject.org. 27 | 28 | THIS SOFTWARE IS PROVIDED BY THE LEMUR PROJECT AND OTHER 29 | CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, 30 | BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 31 | FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 32 | COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 33 | INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 34 | BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS 35 | OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 36 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR 37 | TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE 38 | USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH 39 | DAMAGE. 40 | 41 | */ 42 | /* 43 | * To change this template, choose Tools | Templates 44 | * and open the template in the editor. 
45 | */ 46 | package edu.cmu.lemurproject; 47 | 48 | import java.io.DataInput; 49 | import java.io.DataInputStream; 50 | import java.io.DataOutput; 51 | import java.io.EOFException; 52 | import java.io.IOException; 53 | import java.io.UnsupportedEncodingException; 54 | import java.util.HashMap; 55 | import java.util.Iterator; 56 | import java.util.Map.Entry; 57 | import java.util.Set; 58 | // import org.apache.commons.logging.Log; 59 | // import org.apache.commons.logging.LogFactory; 60 | 61 | /** 62 | * 63 | * @author mhoy 64 | */ 65 | public class WarcRecord { 66 | 67 | // public static final Log LOG = LogFactory.getLog(WarcRecord.class); 68 | 69 | public static String WARC_VERSION = "WARC/"; 70 | public static String WARC_VERSION_LINE = "WARC/0.18\n"; 71 | 72 | ////public static String WARC_VERSION = "WARC/1.0"; 73 | //public static String WARC_VERSION = "WARC/0.18"; 74 | ////public static String WARC_VERSION_LINE = "WARC/1.0\n"; 75 | //public static String WARC_VERSION_LINE = "WARC/0.18\n"; 76 | private static String NEWLINE="\n"; 77 | private static String CR_NEWLINE="\r\n"; 78 | 79 | private static byte MASK_THREE_BYTE_CHAR=(byte)(0xE0); 80 | private static byte MASK_TWO_BYTE_CHAR=(byte)(0xC0); 81 | private static byte MASK_TOPMOST_BIT=(byte)(0x80); 82 | private static byte MASK_BOTTOM_SIX_BITS=(byte)(0x1F); 83 | private static byte MASK_BOTTOM_FIVE_BITS=(byte)(0x3F); 84 | private static byte MASK_BOTTOM_FOUR_BITS=(byte)(0x0F); 85 | 86 | private static String LINE_ENDING="\n"; 87 | 88 | private static String readLineFromInputStream(DataInputStream in) throws IOException { 89 | StringBuilder retString=new StringBuilder(); 90 | boolean found_cr = false; 91 | boolean keepReading=true; 92 | try { 93 | do { 94 | char thisChar=0; 95 | byte readByte=in.readByte(); 96 | // check to see if it's a multibyte character 97 | if ((readByte & MASK_THREE_BYTE_CHAR) == MASK_THREE_BYTE_CHAR) { 98 | found_cr = false; 99 | // need to read the next 2 bytes 100 | if 
(in.available() < 2) { 101 | // treat these all as individual characters 102 | retString.append((char)readByte); 103 | int numAvailable=in.available(); 104 | for (int i=0; i < numAvailable; i++) { 105 | retString.append((char)(in.readByte())); 106 | } 107 | continue; 108 | } 109 | byte secondByte=in.readByte(); 110 | byte thirdByte=in.readByte(); 111 | // ensure the topmost bit is set 112 | if (((secondByte & MASK_TOPMOST_BIT)!=MASK_TOPMOST_BIT) || ((thirdByte & MASK_TOPMOST_BIT)!=MASK_TOPMOST_BIT)) { 113 | //treat these as individual characters 114 | retString.append((char)readByte); 115 | retString.append((char)secondByte); 116 | retString.append((char)thirdByte); 117 | continue; 118 | } 119 | int finalVal=(thirdByte & MASK_BOTTOM_FIVE_BITS) + 64*(secondByte & MASK_BOTTOM_FIVE_BITS) + 4096*(readByte & MASK_BOTTOM_FOUR_BITS); 120 | thisChar=(char)finalVal; 121 | } else if ((readByte & MASK_TWO_BYTE_CHAR) == MASK_TWO_BYTE_CHAR) { 122 | found_cr = false; 123 | 124 | // need to read next byte 125 | if (in.available() < 1) { 126 | // treat this as individual characters 127 | retString.append((char)readByte); 128 | continue; 129 | } 130 | byte secondByte=in.readByte(); 131 | if ((secondByte & MASK_TOPMOST_BIT)!=MASK_TOPMOST_BIT) { 132 | retString.append((char)readByte); 133 | retString.append((char)secondByte); 134 | continue; 135 | } 136 | int finalVal=(secondByte & MASK_BOTTOM_FIVE_BITS) + 64*(readByte & MASK_BOTTOM_SIX_BITS); 137 | thisChar=(char)finalVal; 138 | } else { 139 | // interpret it as a single byte 140 | thisChar=(char)readByte; 141 | } 142 | // Look for carriage return; if found set a flag 143 | if (thisChar=='\r') { 144 | found_cr = true; 145 | } 146 | if (thisChar=='\n') { 147 | // if the linefeed is the next character after the carriage return 148 | if (found_cr) { 149 | LINE_ENDING = CR_NEWLINE; 150 | } else { 151 | LINE_ENDING = NEWLINE; 152 | } 153 | keepReading=false; 154 | } else { 155 | retString.append(thisChar); 156 | } 157 | } while 
(keepReading); 158 | } catch (EOFException eofEx) { 159 | return null; 160 | } 161 | 162 | if (retString.length()==0) { 163 | return ""; 164 | } 165 | 166 | return retString.toString(); 167 | } 168 | 169 | private static byte[] readNextRecord(DataInputStream in, StringBuffer headerBuffer) throws IOException { 170 | if (in==null) { return null; } 171 | if (headerBuffer==null) { return null; } 172 | 173 | String line=null; 174 | boolean foundMark=false; 175 | byte[] retContent=null; 176 | 177 | // cannot be using a buffered reader here!!!! 178 | // just read the header 179 | // first - find our WARC header 180 | while ((!foundMark) && ((line=readLineFromInputStream(in))!=null)) { 181 | if (line.startsWith(WARC_VERSION)) { 182 | WARC_VERSION_LINE = line; 183 | foundMark=true; 184 | } 185 | } 186 | 187 | // no WARC mark? 188 | if (!foundMark) { return null; } 189 | 190 | // LOG.info("Found WARC_VERSION"); 191 | 192 | int contentLength = -1; 193 | // read until we see contentLength then an empty line 194 | // (to handle malformed ClueWeb09 headers that have blank lines) 195 | // get the content length and set our retContent 196 | for (line = readLineFromInputStream(in).trim(); 197 | line.length() > 0 || contentLength < 0; 198 | line = readLineFromInputStream(in).trim()) { 199 | 200 | if (line.length() > 0 ) { 201 | headerBuffer.append(line); 202 | headerBuffer.append(LINE_ENDING); 203 | 204 | // find the content length designated by Content-Length: 205 | String[] parts = line.split(":", 2); 206 | if (parts.length == 2 && parts[0].equals("Content-Length")) { 207 | try { 208 | contentLength=Integer.parseInt(parts[1].trim()); 209 | // LOG.info("WARC record content length: " + contentLength); 210 | } catch (NumberFormatException nfEx) { 211 | contentLength=-1; 212 | } 213 | } 214 | } 215 | } 216 | 217 | // now read the bytes of the content 218 | retContent=new byte[contentLength]; 219 | int totalWant=contentLength; 220 | int totalRead=0; 221 | // 222 | // LOOP TO REMOVE 
LEADING CR + LF 223 | // To prevent the first characters of the content 224 | // from being misread 225 | // 226 | while ((totalRead == 0) && (contentLength >= 2)) { 227 | byte CR = in.readByte(); 228 | byte LF = in.readByte(); 229 | if ((CR != 13) || (LF != 10)) { 230 | retContent[0] = CR; 231 | retContent[1] = LF; 232 | totalRead = 2; 233 | totalWant = contentLength - totalRead; 234 | } 235 | } 236 | // 237 | // 238 | // 239 | while (totalRead < contentLength) { 240 | try { 241 | int numRead=in.read(retContent, totalRead, totalWant); 242 | if (numRead < 0) { 243 | return null; 244 | } else { 245 | totalRead += numRead; 246 | totalWant = contentLength-totalRead; 247 | } // end if (numRead < 0) / else 248 | } catch (EOFException eofEx) { 249 | // resize to what we have 250 | if (totalRead > 0) { 251 | byte[] newReturn=new byte[totalRead]; 252 | System.arraycopy(retContent, 0, newReturn, 0, totalRead); 253 | return newReturn; 254 | } else { 255 | return null; 256 | } 257 | } // end try/catch (EOFException) 258 | } // end while (totalRead < contentLength) 259 | 260 | return retContent; 261 | } 262 | 263 | public static WarcRecord readNextWarcRecord(DataInputStream in) throws IOException { 264 | // LOG.info("Starting read of WARC record"); 265 | StringBuffer recordHeader=new StringBuffer(); 266 | byte[] recordContent=readNextRecord(in, recordHeader); 267 | if (recordContent==null) { 268 | // LOG.info("WARC content is null - file is complete"); 269 | return null; 270 | } 271 | 272 | // extract out our header information 273 | String thisHeaderString=recordHeader.toString(); 274 | 275 | 276 | String[] headerLines=thisHeaderString.split(LINE_ENDING); 277 | 278 | WarcRecord retRecord=new WarcRecord(); 279 | for (int i=0; i < headerLines.length; i++) { 280 | String[] pieces=headerLines[i].split(":", 2); 281 | if (pieces.length!=2) { 282 | retRecord.addHeaderMetadata(pieces[0], ""); 283 | continue; 284 | } 285 | String thisKey=pieces[0].trim(); 286 | 
String thisValue=pieces[1].trim(); 287 | 288 | // check for known keys 289 | if (thisKey.equals("WARC-Type")) { 290 | // LOG.info("Setting WARC record type: " + thisValue); 291 | retRecord.setWarcRecordType(thisValue); 292 | } else if (thisKey.equals("WARC-Date")) { 293 | retRecord.setWarcDate(thisValue); 294 | } else if (thisKey.equals("WARC-Record-ID")) { 295 | // LOG.info("Setting WARC record ID: " + thisValue); 296 | retRecord.setWarcUUID(thisValue); 297 | } else if (thisKey.equals("Content-Type")) { 298 | retRecord.setWarcContentType(thisValue); 299 | } else { 300 | retRecord.addHeaderMetadata(thisKey, thisValue); 301 | } 302 | } 303 | 304 | // set the content 305 | retRecord.setContent(recordContent); 306 | 307 | return retRecord; 308 | } 309 | 310 | public class WarcHeader { 311 | public String contentType=""; 312 | public String UUID=""; 313 | public String dateString=""; 314 | public String recordType=""; 315 | public HashMap<String, String> metadata=new HashMap<String, String>(); 316 | public int contentLength=0; 317 | 318 | public WarcHeader() { 319 | } 320 | 321 | public WarcHeader(WarcHeader o) { 322 | this.contentType=o.contentType; 323 | this.UUID=o.UUID; 324 | this.dateString=o.dateString; 325 | this.recordType=o.recordType; 326 | this.metadata.putAll(o.metadata); 327 | this.contentLength=o.contentLength; 328 | } 329 | 330 | public void write(DataOutput out) throws IOException { 331 | out.writeUTF(contentType); 332 | out.writeUTF(UUID); 333 | out.writeUTF(dateString); 334 | out.writeUTF(recordType); 335 | out.writeInt(metadata.size()); 336 | Iterator<Entry<String, String>> metadataIterator=metadata.entrySet().iterator(); 337 | while (metadataIterator.hasNext()) { 338 | Entry<String, String> thisEntry=metadataIterator.next(); 339 | out.writeUTF(thisEntry.getKey()); 340 | out.writeUTF(thisEntry.getValue()); 341 | } 342 | out.writeInt(contentLength); 343 | } 344 | 345 | public void readFields(DataInput in) throws IOException { 346 | contentType=in.readUTF(); 347 | UUID=in.readUTF(); 348 | dateString=in.readUTF(); 349 | 
recordType=in.readUTF(); 350 | metadata.clear(); 351 | int numMetaItems=in.readInt(); 352 | for (int i=0; i < numMetaItems; i++) { 353 | String thisKey=in.readUTF(); 354 | String thisValue=in.readUTF(); 355 | metadata.put(thisKey, thisValue); 356 | } 357 | contentLength=in.readInt(); 358 | } 359 | 360 | @Override 361 | public String toString() { 362 | StringBuffer retBuffer=new StringBuffer(); 363 | 364 | retBuffer.append(WARC_VERSION_LINE); 365 | retBuffer.append(LINE_ENDING); 366 | 367 | retBuffer.append("WARC-Type: " + recordType + LINE_ENDING); 368 | retBuffer.append("WARC-Date: " + dateString + LINE_ENDING); 369 | 370 | Iterator<Entry<String, String>> metadataIterator=metadata.entrySet().iterator(); 371 | while (metadataIterator.hasNext()) { 372 | Entry<String, String> thisEntry=metadataIterator.next(); 373 | retBuffer.append(thisEntry.getKey()); 374 | retBuffer.append(": "); 375 | retBuffer.append(thisEntry.getValue()); 376 | retBuffer.append(LINE_ENDING); 377 | } 378 | // Keep this as the last WARC-... 379 | retBuffer.append("WARC-Record-ID: " + UUID + LINE_ENDING); 380 | 381 | retBuffer.append("Content-Type: " + contentType + LINE_ENDING); 382 | retBuffer.append("Content-Length: " + contentLength + LINE_ENDING); 383 | 384 | return retBuffer.toString(); 385 | } 386 | } 387 | 388 | private WarcHeader warcHeader=new WarcHeader(); 389 | private byte[] warcContent=null; 390 | private String warcFilePath=""; 391 | 392 | public WarcRecord() { 393 | 394 | } 395 | 396 | public WarcRecord(WarcRecord o) { 397 | this.warcHeader=new WarcHeader(o.warcHeader); 398 | this.warcContent=o.warcContent; 399 | } 400 | 401 | public int getTotalRecordLength() { 402 | int headerLength=warcHeader.toString().length(); 403 | return (headerLength + warcContent.length); 404 | } 405 | 406 | public void set(WarcRecord o) { 407 | this.warcHeader=new WarcHeader(o.warcHeader); 408 | this.warcContent=o.warcContent; 409 | } 410 | 411 | public String getWarcFilePath() { 412 | return warcFilePath; 413 | } 414 | 415 | public void 
setWarcFilePath(String path) { 416 | warcFilePath=path; 417 | } 418 | 419 | public void setWarcRecordType(String recordType) { 420 | warcHeader.recordType=recordType; 421 | } 422 | 423 | public void setWarcContentType(String contentType) { 424 | warcHeader.contentType=contentType; 425 | } 426 | 427 | public void setWarcDate(String dateString) { 428 | warcHeader.dateString=dateString; 429 | } 430 | 431 | public void setWarcUUID(String UUID) { 432 | warcHeader.UUID=UUID; 433 | } 434 | 435 | public void addHeaderMetadata(String key, String value) { 436 | //System.out.println("+-- WarRecord.addHeaderMetadata key=" + key + " value=" + value); 437 | // don't allow addition of known keys 438 | if (key.equals("WARC-Type")) { return; } 439 | if (key.equals("WARC-Date")) { return; } 440 | if (key.equals("WARC-Record-ID")) { return; } 441 | if (key.equals("Content-Type")) { return; } 442 | if (key.equals("Content-Length")) { return; } 443 | 444 | warcHeader.metadata.put(key, value); 445 | } 446 | 447 | 448 | public void clearHeaderMetadata() { 449 | warcHeader.metadata.clear(); 450 | } 451 | 452 | public Set<Entry<String, String>> getHeaderMetadata() { 453 | return warcHeader.metadata.entrySet(); 454 | } 455 | 456 | public String getHeaderMetadataItem(String key) { 457 | //System.out.println("+++ WarRecord.getHeaderMetadataItem key=" + key); // WARC-Target-URI 458 | if (key.equals("WARC-Type")) { return warcHeader.recordType; } 459 | if (key.equals("WARC-Date")) { return warcHeader.dateString; } 460 | if (key.equals("WARC-Record-ID")) { return warcHeader.UUID; } 461 | if (key.equals("Content-Type")) { return warcHeader.contentType; } 462 | if (key.equals("Content-Length")) { return Integer.toString(warcHeader.contentLength); } 463 | 464 | return warcHeader.metadata.get(key); 465 | } 466 | 467 | public void setContent(byte[] content) { 468 | warcContent=content; 469 | warcHeader.contentLength=content.length; 470 | } 471 | 472 | public void setContent(String content) { 473 | 
setContent(content.getBytes()); 474 | } 475 | public void setContentLength(int len) { 476 | warcHeader.contentLength=len; 477 | } 478 | 479 | public byte[] getContent() { 480 | return warcContent; 481 | } 482 | public byte[] getByteContent() { 483 | return warcContent; 484 | } 485 | 486 | public String getContentUTF8() { 487 | String retString=null; 488 | try { 489 | retString = new String(warcContent, "UTF-8"); 490 | } catch (UnsupportedEncodingException ex) { 491 | retString=new String(warcContent); 492 | } 493 | return retString; 494 | } 495 | 496 | public String getHeaderRecordType() { 497 | return warcHeader.recordType; 498 | } 499 | 500 | @Override 501 | public String toString() { 502 | StringBuffer retBuffer=new StringBuffer(); 503 | retBuffer.append(warcHeader.toString()); 504 | retBuffer.append(LINE_ENDING); 505 | retBuffer.append(new String(warcContent)); 506 | return retBuffer.toString(); 507 | } 508 | 509 | public String getHeaderString() { 510 | return warcHeader.toString(); 511 | } 512 | 513 | public void write(DataOutput out) throws IOException { 514 | warcHeader.write(out); 515 | out.write(warcContent); 516 | } 517 | 518 | public void readFields(DataInput in) throws IOException { 519 | warcHeader.readFields(in); 520 | int contentLengthBytes=warcHeader.contentLength; 521 | warcContent=new byte[contentLengthBytes]; 522 | in.readFully(warcContent); 523 | } 524 | 525 | } 526 | 527 | -------------------------------------------------------------------------------- /src/main/java/edu/cmu/lemurproject/WritableWarcRecord.java: -------------------------------------------------------------------------------- 1 | /** 2 | * An extension of the Writable object for Hadoop for a Warc Record 3 | * 4 | * (C) 2009 - Carnegie Mellon University 5 | * 6 | * 1. Redistributions of this source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * 2. 
The names "Lemur", "Indri", "University of Massachusetts", 9 | * "Carnegie Mellon", and "lemurproject" must not be used to 10 | * endorse or promote products derived from this software without 11 | * prior written permission. To obtain permission, contact 12 | * license@lemurproject.org. 13 | * 14 | * 4. Products derived from this software may not be called "Lemur" or "Indri" 15 | * nor may "Lemur" or "Indri" appear in their names without prior written 16 | * permission of The Lemur Project. To obtain permission, 17 | * contact license@lemurproject.org. 18 | * 19 | * THIS SOFTWARE IS PROVIDED BY THE LEMUR PROJECT AS PART OF THE CLUEWEB09 20 | * PROJECT AND OTHER CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED 21 | * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 22 | * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN 23 | * NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY 24 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 28 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING 29 | * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 30 | * POSSIBILITY OF SUCH DAMAGE. 31 | * 32 | * @author mhoy@cs.cmu.edu (Mark J. 
Hoy) 33 | */ 34 | 35 | package edu.cmu.lemurproject; 36 | 37 | import edu.cmu.lemurproject.WarcRecord; 38 | import java.io.DataInput; 39 | import java.io.DataOutput; 40 | import java.io.IOException; 41 | import org.apache.hadoop.io.Writable; 42 | 43 | public class WritableWarcRecord implements Writable { 44 | 45 | WarcRecord record=null; 46 | 47 | public WritableWarcRecord() { 48 | record=new WarcRecord(); 49 | } 50 | 51 | public WritableWarcRecord(WarcRecord o) { 52 | record=new WarcRecord(o); 53 | } 54 | 55 | public WarcRecord getRecord() { 56 | return record; 57 | } 58 | 59 | public void setRecord(WarcRecord rec) { 60 | record=new WarcRecord(rec); 61 | } 62 | 63 | public void write(DataOutput out) throws IOException { 64 | if (record!=null) { 65 | record.write(out); 66 | } 67 | } 68 | 69 | public void readFields(DataInput in) throws IOException { 70 | if (record!=null) { 71 | record.readFields(in); 72 | } 73 | } 74 | 75 | } 76 | -------------------------------------------------------------------------------- /src/main/java/org/commoncrawl/examples/java_warc/IProcessWarcRecord.java: -------------------------------------------------------------------------------- 1 | package org.commoncrawl.examples.java_warc; 2 | 3 | /** 4 | * author: Mark Watson 5 | */ 6 | 7 | /** 8 | * callback interface for handling WARC record data 9 | */ 10 | public interface IProcessWarcRecord { 11 | public void process(String url, String content); 12 | public void done(); // called once when there is no more data to be processed 13 | } 14 | -------------------------------------------------------------------------------- /src/main/java/org/commoncrawl/examples/java_warc/ReadS3Bucket.java: -------------------------------------------------------------------------------- 1 | package org.commoncrawl.examples.java_warc; 2 | 3 | import com.amazonaws.services.s3.AmazonS3; 4 | import com.amazonaws.services.s3.AmazonS3Client; 5 | import com.amazonaws.services.s3.model.GetObjectRequest; 6 | import 
com.amazonaws.services.s3.model.ObjectListing; 7 | import com.amazonaws.services.s3.model.S3Object; 8 | import com.amazonaws.services.s3.model.S3ObjectSummary; 9 | import edu.cmu.lemurproject.WarcHTMLResponseRecord; 10 | import edu.cmu.lemurproject.WarcRecord; 11 | 12 | import java.io.DataInputStream; 13 | import java.io.InputStream; 14 | import java.util.List; 15 | import java.util.zip.GZIPInputStream; 16 | 17 | /** 18 | * author: Mark Watson 19 | */ 20 | public class ReadS3Bucket { 21 | static public void process(AmazonS3 s3, String bucketName, String prefix, int max) { 22 | int count = 0; 23 | 24 | // use a callback class for handling WARC record data: 25 | IProcessWarcRecord processor = new SampleProcessWarcRecord(); 26 | 27 | ObjectListing list = s3.listObjects(bucketName, prefix); 28 | 29 | do { // reading summaries code derived from stackoverflow example posted by Alberto A. Medina: 30 | 31 | List<S3ObjectSummary> summaries = list.getObjectSummaries(); 32 | for (S3ObjectSummary summary : summaries) { 33 | try { 34 | String key = summary.getKey(); 35 | System.out.println("+ key: " + key); 36 | S3Object object = s3.getObject(new GetObjectRequest(bucketName, key)); 37 | InputStream objectData = object.getObjectContent(); 38 | GZIPInputStream gzInputStream=new GZIPInputStream(objectData); 39 | DataInputStream inStream = new DataInputStream(gzInputStream); 40 | 41 | WarcRecord thisWarcRecord; 42 | while ((thisWarcRecord = WarcRecord.readNextWarcRecord(inStream)) != null) { 43 | //System.out.println("-- thisWarcRecord.getHeaderRecordType() = " + thisWarcRecord.getHeaderRecordType()); 44 | if (thisWarcRecord.getHeaderRecordType().equals("response")) { 45 | WarcHTMLResponseRecord htmlRecord = new WarcHTMLResponseRecord(thisWarcRecord); 46 | String thisTargetURI = htmlRecord.getTargetURI(); 47 | String thisContentUtf8 = htmlRecord.getRawRecord().getContentUTF8(); 48 | // handle WARC record content: 49 | processor.process(thisTargetURI, thisContentUtf8); 50 | } 51 | } 52 | 
inStream.close(); 53 | } catch (Exception ex) { 54 | ex.printStackTrace(); 55 | } 56 | if (++count >= max) return; 57 | } 58 | list = s3.listNextBatchOfObjects(list); 59 | } while (list.isTruncated()); 60 | // done processing all WARC records: 61 | processor.done(); 62 | 63 | } 64 | 65 | static public void main(String[] args) { 66 | AmazonS3Client s3 = new AmazonS3Client(); 67 | process(s3, "commoncrawl", "crawl-data/CC-MAIN-2013-48", 20); 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /src/main/java/org/commoncrawl/examples/java_warc/ReadWARC.java: -------------------------------------------------------------------------------- 1 | // based on an example from http://boston.lti.cs.cmu.edu/clueweb09/wiki/tiki-index.php?page=Working+with+WARC+Files 2 | 3 | package org.commoncrawl.examples.java_warc; 4 | 5 | import java.io.DataInputStream; 6 | import java.io.FileInputStream; 7 | import java.io.IOException; 8 | import java.util.zip.GZIPInputStream; 9 | import edu.cmu.lemurproject.WarcRecord; 10 | import edu.cmu.lemurproject.WarcHTMLResponseRecord; 11 | 12 | public class ReadWARC { 13 | 14 | public static void main(String[] args) throws IOException { 15 | 16 | // use a callback class for handling WARC record data: 17 | IProcessWarcRecord processor = new SampleProcessWarcRecord(); 18 | 19 | String inputWarcFile="CC-MAIN-20140305125104-00002-ip-10-183-142-35.ec2.internal.warc.gz"; 20 | GZIPInputStream gzInputStream=new GZIPInputStream(new FileInputStream(inputWarcFile)); 21 | DataInputStream inStream=new DataInputStream(gzInputStream); 22 | 23 | WarcRecord thisWarcRecord; 24 | while ((thisWarcRecord=WarcRecord.readNextWarcRecord(inStream))!=null) { 25 | System.out.println("%% thisWarcRecord.getHeaderRecordType() = " + thisWarcRecord.getHeaderRecordType()); 26 | if (thisWarcRecord.getHeaderRecordType().equals("response")) { 27 | WarcHTMLResponseRecord htmlRecord=new WarcHTMLResponseRecord(thisWarcRecord); 28 | String 
thisTargetURI=htmlRecord.getTargetURI(); 29 | String thisContentUtf8 = htmlRecord.getRawRecord().getContentUTF8(); 30 | 31 | // handle WARC record content: 32 | processor.process(thisTargetURI, thisContentUtf8); 33 | } 34 | } 35 | inStream.close(); 36 | // done processing all WARC records: 37 | processor.done(); 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/main/java/org/commoncrawl/examples/java_warc/SampleProcessWarcRecord.java: -------------------------------------------------------------------------------- 1 | package org.commoncrawl.examples.java_warc; 2 | 3 | /** 4 | * author: Mark Watson 5 | */ 6 | 7 | /** 8 | * a sample callback class for handling WARC record data by implementing the IProcessWarcRecord interface 9 | */ 10 | public class SampleProcessWarcRecord implements IProcessWarcRecord { 11 | @Override 12 | public void process(String url, String content) { 13 | System.out.println("url: " + url); 14 | System.out.println("content:\n\n" + content + "\n"); 15 | } 16 | 17 | @Override 18 | public void done() { 19 | // place any code here to save data, etc. 20 | } 21 | } 22 | --------------------------------------------------------------------------------
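The record reader above (`WarcRecord.readNextRecord`) finds the record length by splitting each WARC header line on its first colon and parsing the `Content-Length` value, falling back to a `-1` sentinel on malformed input. A minimal standalone sketch of that parsing step, with a hypothetical class and method name (not part of the lemurproject API):

```java
// Illustrative sketch of the Content-Length header parsing used in
// WarcRecord.readNextRecord; class/method names are hypothetical.
public class HeaderParseSketch {

    // Returns the parsed Content-Length, or -1 if the line is not a
    // well-formed Content-Length header (same sentinel as readNextRecord).
    static int parseContentLength(String line) {
        // limit 2 keeps any further colons inside the value part
        String[] parts = line.split(":", 2);
        if (parts.length == 2 && parts[0].equals("Content-Length")) {
            try {
                return Integer.parseInt(parts[1].trim());
            } catch (NumberFormatException nfEx) {
                return -1;
            }
        }
        return -1;
    }

    public static void main(String[] args) {
        System.out.println(parseContentLength("Content-Length: 2709")); // 2709
        System.out.println(parseContentLength("WARC-Type: response"));  // -1
    }
}
```

Note the `split(":", 2)` limit: header values such as `WARC-Target-URI: http://...` contain colons of their own, so only the first colon may act as the key/value separator.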