├── .gitattributes
├── .gitignore
├── README.md
├── clojure-examples
│   ├── .gitignore
│   ├── README.md
│   ├── project.clj
│   ├── src
│   │   └── clojure_examples
│   │       └── core.clj
│   └── test
│       └── clojure_examples
│           └── core_test.clj
├── pom.xml
└── src
    └── main
        └── java
            ├── edu
            │   └── cmu
            │       └── lemurproject
            │           ├── WarcFileInputFormat.java
            │           ├── WarcFileRecordReader.java
            │           ├── WarcHTMLResponseRecord.java
            │           ├── WarcRecord.java
            │           └── WritableWarcRecord.java
            └── org
                └── commoncrawl
                    └── examples
                        └── java_warc
                            ├── IProcessWarcRecord.java
                            ├── ReadS3Bucket.java
                            ├── ReadWARC.java
                            └── SampleProcessWarcRecord.java
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 |
4 |
5 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | *.class
3 | target
4 | example-warc*
5 | *~
6 | clojure-examples/target
7 | clojure-examples/.lein*
8 | clojure-examples/clojure-examples-for-common-crawl.i*
9 |
10 | # Eclipse ignores
11 | .classpath
12 | .project
13 | .settings/
14 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Java and Clojure examples for processing Common Crawl WARC files
2 |
3 | Mark Watson 2014/1/26
4 |
5 | There are two Java examples and one Clojure example for now (more to come):
6 |
7 | - ReadWARC - reads a local WARC file that was manually copied from S3 storage to your laptop
8 | - ReadS3Bucket - this should be run on an EC2 instance for fast access to S3
9 | - clojure-examples/src/clojure_examples/core.clj - reads a local WARC file that was manually copied from S3 storage to your laptop
10 |
11 | A JDK 1.7 or later is required (JDK 1.6 will not work).
12 |
13 | Special thanks to the developers of the edu.cmu.lemurproject package from Carnegie Mellon University. This package
14 | reads WARC files, and its source code is included in the src subdirectory.
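
The core read loop is small. The sketch below is a stripped-down variant of the ReadWARC example in this repository (the class name `WarcSketch` is made up for illustration) showing how the edu.cmu.lemurproject classes are used:

````````
import java.io.DataInputStream;
import java.io.FileInputStream;
import java.util.zip.GZIPInputStream;
import edu.cmu.lemurproject.WarcHTMLResponseRecord;
import edu.cmu.lemurproject.WarcRecord;

// Iterate over one gzipped WARC segment and print the target URI of each "response" record.
public class WarcSketch {
    public static void main(String[] args) throws Exception {
        String warcFile = args[0]; // path to a *.warc.gz segment copied from S3
        DataInputStream in = new DataInputStream(
                new GZIPInputStream(new FileInputStream(warcFile)));
        WarcRecord record;
        while ((record = WarcRecord.readNextWarcRecord(in)) != null) {
            if (record.getHeaderRecordType().equals("response")) {
                WarcHTMLResponseRecord html = new WarcHTMLResponseRecord(record);
                System.out.println(html.getTargetURI());
            }
        }
        in.close();
    }
}
````````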
15 |
16 | I have just started experimenting with Common Crawl data. I plan on adding a Hadoop/Elastic MapReduce example
17 | and also more examples using other JVM languages like Clojure and JRuby.
18 |
19 | ## ReadWARC
20 |
21 | Assuming that you have the aws command line tools installed, you can list the contents of a crawl using:
22 |
23 | ````````
24 | aws s3 ls s3://commoncrawl/crawl-data/CC-MAIN-2014-10/ --recursive | head -6
25 | ````````
26 |
27 | You can copy one segment to your laptop (segment files are less than 1 gigabyte each) using:
28 |
29 | ````````
30 | aws s3 cp s3://commoncrawl/crawl-data/CC-MAIN-2014-10/segments/1394023864559/warc/CC-MAIN-20140305125104-00002-ip-10-183-142-35.ec2.internal.warc.gz .
31 | ````````
32 |
33 | Then run this example using:
34 |
35 | ````````
36 | mvn install
37 | mvn exec:java -Dexec.mainClass=org.commoncrawl.examples.java_warc.ReadWARC
38 | ````````
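
Note that ReadWARC.java reads a hard-coded file name from the current directory:

````````
// from src/main/java/org/commoncrawl/examples/java_warc/ReadWARC.java
String inputWarcFile="CC-MAIN-20140305125104-00002-ip-10-183-142-35.ec2.internal.warc.gz";
````````

If you download a different segment, either rename it to match or edit that string before running the example.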
39 |
40 |
41 | ## ReadS3Bucket
42 |
43 | You can set the maximum number of segment files to process using the **max** argument:
44 |
45 | ````````
46 | public class ReadS3Bucket {
47 | static public void process(AmazonS3 s3, String bucketName, String prefix, int max) {
48 | ````````
49 |
50 | As you can see in the example code, I pass the bucket and prefix as:
51 |
52 | ````````
53 | process(s3, "commoncrawl", "crawl-data/CC-MAIN-2014-10", 2);
54 | ````````
55 |
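To process a different crawl or a different number of segment files, a tiny driver along these lines is enough (a sketch only; the class name `ProcessDriver` is made up, and the S3 client is constructed the same way as in ReadS3Bucket.main):

````````
import com.amazonaws.services.s3.AmazonS3;
import com.amazonaws.services.s3.AmazonS3Client;
import org.commoncrawl.examples.java_warc.ReadS3Bucket;

// Hypothetical driver: call ReadS3Bucket.process with your own prefix and segment limit.
public class ProcessDriver {
    public static void main(String[] args) {
        AmazonS3 s3 = new AmazonS3Client();
        ReadS3Bucket.process(s3, "commoncrawl", "crawl-data/CC-MAIN-2014-10", 2);
    }
}
````````
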
56 | Note: using the Common Crawl AMI (I run it on a medium EC2 instance), I installed JDK 1.7 (required by
57 | the edu.cmu.lemurproject package):
58 |
59 | ````````
60 | sudo yum install java-1.7.0-openjdk-devel.x86_64
61 | ````````
62 |
63 | TODO: In addition to installing Java 7, you also need to select it as the system default using:
64 |
65 | ````````
66 | sudo alternatives --config javac
67 | sudo alternatives --config java
68 | ````````
69 |
70 | TODO: Maven also needs to be installed; it is not available through yum without some gymnastics.
71 |
72 | After cloning the GitHub repository to get these examples onto an EC2 instance:
73 |
74 |
75 | ```
76 | git clone https://github.com/commoncrawl/example-warc-java.git
77 | cd example-warc-java
78 | ```
79 |
80 | build and run using:
81 |
82 | ````````
83 | mvn install
84 | mvn exec:java -Dexec.mainClass=org.commoncrawl.examples.java_warc.ReadS3Bucket
85 | ````````
86 |
87 | Note: I also tested this on a micro EC2 instance; processing two gzipped segment files
88 | (each a little less than 1 gigabyte) took about 45 seconds.
89 |
90 | ## Clojure Examples
91 |
92 | You need to install the commoncrawl JAR file in your local maven repository:
93 |
94 | ````````
95 | mvn install:install-file -Durl=file:repo -DpomFile=pom.xml -DgroupId=local -DartifactId=commoncrawl -Dversion=0.0.1 -Dpackaging=jar -Dfile=target/commoncrawl-0.0.1.jar
96 | ````````
97 |
98 | Then you can:
99 |
100 | ````````
101 | cd clojure-examples
102 | lein deps
103 | lein test
104 | ````````
105 |
106 | ## License
107 |
108 | This code is licensed under the Apache 2 license. Please give back to
109 | Common Crawl if you found it useful.
110 |
111 |
112 |
--------------------------------------------------------------------------------
/clojure-examples/.gitignore:
--------------------------------------------------------------------------------
1 | /target
2 | /lib
3 | /classes
4 | /checkouts
5 | pom.xml
6 | pom.xml.asc
7 | *.jar
8 | *.class
9 | .lein-deps-sum
10 | .lein-failures
11 | .lein-plugins
12 | .lein-repl-history
13 |
--------------------------------------------------------------------------------
/clojure-examples/README.md:
--------------------------------------------------------------------------------
1 | # clojure-examples for reading Common Crawl WARC files
2 |
3 | Note: this is still a work in progress, but if you use Clojure, please try this simple example and provide feedback.
4 |
5 | First, build the parent Java project and make sure to install the generated JAR file into
6 | your local Maven repository using:
7 |
8 | ````````
9 | mvn install:install-file -Durl=file:repo -DpomFile=pom.xml -DgroupId=local -DartifactId=commoncrawl -Dversion=0.0.1 -Dpackaging=jar -Dfile=target/commoncrawl-0.0.1.jar
10 | ````````
11 |
12 | in the parent directory. Copy a single test WARC file from S3:
13 |
14 | ````````
15 | aws s3 cp s3://commoncrawl/crawl-data/CC-MAIN-2013-48/segments/1386163035819/warc/CC-MAIN-20131204131715-00002-ip-10-33-133-15.ec2.internal.warc.gz .
16 | ````````
17 |
18 | Then you can run:
19 |
20 | ````````
21 | lein test
22 | ````````
23 |
24 |
25 |
--------------------------------------------------------------------------------
/clojure-examples/project.clj:
--------------------------------------------------------------------------------
1 | (defproject clojure-examples "0.1.0-SNAPSHOT"
2 | :description "FIXME: write description"
3 | :url "http://example.com/FIXME"
4 | :license {:name "Eclipse Public License"
5 | :url "http://www.eclipse.org/legal/epl-v10.html"}
6 | :dependencies [[org.clojure/clojure "1.5.1"]
7 | [local/commoncrawl "0.0.1"]]
8 | )
9 |
--------------------------------------------------------------------------------
/clojure-examples/src/clojure_examples/core.clj:
--------------------------------------------------------------------------------
1 | (ns clojure-examples.core
2 | (:import [java.io DataInputStream FileInputStream])
3 | (:import [java.util.zip GZIPInputStream])
4 | (:import [edu.cmu.lemurproject WarcRecord WarcHTMLResponseRecord])
5 | )
6 |
7 |
8 | (defn single-warc-file []
9 |   (let [input-warc-file "CC-MAIN-20131204131715-00002-ip-10-33-133-15.ec2.internal.warc.gz"
10 |         gz-input-stream (GZIPInputStream. (FileInputStream. input-warc-file))
11 |         in-stream (DataInputStream. gz-input-stream)]
12 |     (letfn [(read-warc-record []
13 |               ;; readNextWarcRecord returns nil at end of stream
14 |               (when-let [r (WarcRecord/readNextWarcRecord in-stream)]
15 |                 (when (= (.getHeaderRecordType r) "response")
16 |                   (let [html-record (WarcHTMLResponseRecord. r)
17 |                         uri (.getTargetURI html-record)
18 |                         content (.getContentUTF8 (.getRawRecord html-record))]
19 |                     (println uri)
20 |                     ;;(println content)
21 |                     ))))]
22 |       (dotimes [_ 50] (read-warc-record)))))
23 |
24 | ;; (single-warc-file)
--------------------------------------------------------------------------------
/clojure-examples/test/clojure_examples/core_test.clj:
--------------------------------------------------------------------------------
1 | (ns clojure-examples.core-test
2 | (:use clojure.test
3 | clojure-examples.core))
4 |
5 | (single-warc-file)
6 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
3 |   <modelVersion>4.0.0</modelVersion>
4 |
5 |   <groupId>org.commoncrawl</groupId>
6 |   <artifactId>commoncrawl</artifactId>
7 |   <version>0.0.1</version>
8 |   <packaging>jar</packaging>
9 |
10 |   <description>
11 |     Java examples for using Common Crawl WARC files from S3
12 |   </description>
13 |
14 |   <dependencies>
15 |     <dependency>
16 |       <groupId>com.amazonaws</groupId>
17 |       <artifactId>aws-java-sdk</artifactId>
18 |       <version>1.0.002</version>
19 |     </dependency>
20 |     <dependency>
21 |       <groupId>commons-codec</groupId>
22 |       <artifactId>commons-codec</artifactId>
23 |       <version>1.4</version>
24 |     </dependency>
25 |     <dependency>
26 |       <groupId>commons-io</groupId>
27 |       <artifactId>commons-io</artifactId>
28 |       <version>2.4</version>
29 |     </dependency>
30 |     <dependency>
31 |       <groupId>commons-logging</groupId>
32 |       <artifactId>commons-logging</artifactId>
33 |       <version>1.1.3</version>
34 |     </dependency>
35 |     <dependency>
36 |       <groupId>com.facebook.hadoop</groupId>
37 |       <artifactId>hadoop-core</artifactId>
38 |       <version>0.20.0</version>
39 |     </dependency>
40 |     <dependency>
41 |       <groupId>commons-httpclient</groupId>
42 |       <artifactId>commons-httpclient</artifactId>
43 |       <version>3.1</version>
44 |     </dependency>
45 |     <dependency>
46 |       <groupId>org.apache.httpcomponents</groupId>
47 |       <artifactId>httpcore</artifactId>
48 |       <version>4.2.5</version>
49 |     </dependency>
50 |     <dependency>
51 |       <groupId>org.codehaus.jackson</groupId>
52 |       <artifactId>jackson-core-asl</artifactId>
53 |       <version>1.9.10</version>
54 |     </dependency>
55 |     <dependency>
56 |       <groupId>org.codehaus.jackson</groupId>
57 |       <artifactId>jackson-mapper-asl</artifactId>
58 |       <version>1.9.10</version>
59 |     </dependency>
60 |   </dependencies>
61 |
62 |   <build>
63 |     <plugins>
64 |       <plugin>
65 |         <groupId>org.apache.maven.plugins</groupId>
66 |         <artifactId>maven-compiler-plugin</artifactId>
67 |         <version>3.1</version>
68 |         <configuration>
69 |           <source>1.7</source>
70 |           <target>1.7</target>
71 |         </configuration>
72 |       </plugin>
73 |       <plugin>
74 |         <groupId>org.codehaus.mojo</groupId>
75 |         <artifactId>exec-maven-plugin</artifactId>
76 |         <version>1.1.1</version>
77 |         <executions>
78 |           <execution>
79 |             <id>exec</id>
80 |             <goals>
81 |               <goal>java</goal>
82 |             </goals>
83 |           </execution>
84 |         </executions>
85 |       </plugin>
86 |     </plugins>
87 |   </build>
88 | </project>
--------------------------------------------------------------------------------
/src/main/java/edu/cmu/lemurproject/WarcFileInputFormat.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Hadoop FileInputFormat for reading WARC files
3 | *
4 | * (C) 2009 - Carnegie Mellon University
5 | *
6 | * 1. Redistributions of this source code must retain the above copyright
7 | * notice, this list of conditions and the following disclaimer.
8 | * 2. The names "Lemur", "Indri", "University of Massachusetts",
9 | * "Carnegie Mellon", and "lemurproject" must not be used to
10 | * endorse or promote products derived from this software without
11 | * prior written permission. To obtain permission, contact
12 | * license@lemurproject.org.
13 | *
14 | * 4. Products derived from this software may not be called "Lemur" or "Indri"
15 | * nor may "Lemur" or "Indri" appear in their names without prior written
16 | * permission of The Lemur Project. To obtain permission,
17 | * contact license@lemurproject.org.
18 | *
19 | * THIS SOFTWARE IS PROVIDED BY THE LEMUR PROJECT AS PART OF THE CLUEWEB09
20 | * PROJECT AND OTHER CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
21 | * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
22 | * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN
23 | * NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY
24 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
28 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
29 | * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 | * POSSIBILITY OF SUCH DAMAGE.
31 | *
32 | * @author mhoy@cs.cmu.edu (Mark J. Hoy)
33 | */
34 |
35 | package edu.cmu.lemurproject;
36 |
37 | import java.io.IOException;
38 | import org.apache.hadoop.fs.FileSystem;
39 | import org.apache.hadoop.fs.Path;
40 | import org.apache.hadoop.io.LongWritable;
41 | import org.apache.hadoop.mapred.FileInputFormat;
42 | import org.apache.hadoop.mapred.InputSplit;
43 | import org.apache.hadoop.mapred.JobConf;
44 | import org.apache.hadoop.mapred.RecordReader;
45 | import org.apache.hadoop.mapred.Reporter;
46 |
47 | public class WarcFileInputFormat extends FileInputFormat<LongWritable, WritableWarcRecord> {
48 |
49 | /**
50 | * Don't allow the files to be split!
51 | */
52 | @Override
53 | protected boolean isSplitable(FileSystem fs, Path filename) {
54 | // ensure the input files are not splittable!
55 | return false;
56 | }
57 |
58 | /**
59 | * Just return the record reader
60 | */
61 | public RecordReader<LongWritable, WritableWarcRecord> getRecordReader(InputSplit split, JobConf conf, Reporter reporter) throws IOException {
62 | return new WarcFileRecordReader(conf, split);
63 | }
64 | }
65 |
66 |
--------------------------------------------------------------------------------
/src/main/java/edu/cmu/lemurproject/WarcFileRecordReader.java:
--------------------------------------------------------------------------------
1 | /**
2 | * A Hadoop record reader for reading Warc Records
3 | *
4 | * (C) 2009 - Carnegie Mellon University
5 | *
6 | * 1. Redistributions of this source code must retain the above copyright
7 | * notice, this list of conditions and the following disclaimer.
8 | * 2. The names "Lemur", "Indri", "University of Massachusetts",
9 | * "Carnegie Mellon", and "lemurproject" must not be used to
10 | * endorse or promote products derived from this software without
11 | * prior written permission. To obtain permission, contact
12 | * license@lemurproject.org.
13 | *
14 | * 4. Products derived from this software may not be called "Lemur" or "Indri"
15 | * nor may "Lemur" or "Indri" appear in their names without prior written
16 | * permission of The Lemur Project. To obtain permission,
17 | * contact license@lemurproject.org.
18 | *
19 | * THIS SOFTWARE IS PROVIDED BY THE LEMUR PROJECT AS PART OF THE CLUEWEB09
20 | * PROJECT AND OTHER CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
21 | * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
22 | * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN
23 | * NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY
24 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
28 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
29 | * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 | * POSSIBILITY OF SUCH DAMAGE.
31 | *
32 | * @author mhoy@cs.cmu.edu (Mark J. Hoy)
33 | */
34 |
35 | package edu.cmu.lemurproject;
36 |
37 | import edu.cmu.lemurproject.WarcRecord;
38 | import java.io.DataInputStream;
39 | import java.io.IOException;
40 | import org.apache.commons.logging.Log;
41 | import org.apache.commons.logging.LogFactory;
42 | import org.apache.hadoop.conf.Configuration;
43 | import org.apache.hadoop.fs.FSDataInputStream;
44 | import org.apache.hadoop.fs.FileSystem;
45 | import org.apache.hadoop.fs.Path;
46 | import org.apache.hadoop.io.LongWritable;
47 | import org.apache.hadoop.io.Writable;
48 | import org.apache.hadoop.io.WritableComparable;
49 | import org.apache.hadoop.io.compress.CompressionCodec;
50 | import org.apache.hadoop.mapred.FileSplit;
51 | import org.apache.hadoop.mapred.InputSplit;
52 | import org.apache.hadoop.mapred.MultiFileSplit;
53 | import org.apache.hadoop.mapred.RecordReader;
54 | import org.apache.hadoop.util.ReflectionUtils;
55 |
56 | public class WarcFileRecordReader implements RecordReader<LongWritable, WritableWarcRecord> {
57 | public static final Log LOG = LogFactory.getLog(WarcFileRecordReader.class);
58 |
59 | private long recordNumber=1;
60 |
61 | private Path[] filePathList=null;
62 | private int currentFilePath=-1;
63 |
64 | private FSDataInputStream currentFile=null;
65 | private CompressionCodec compressionCodec=null;
66 | private DataInputStream compressionInput=null;
67 | private FileSystem fs=null;
68 | private long totalFileSize=0;
69 | private long totalNumBytesRead=0;
70 |
71 | public WarcFileRecordReader(Configuration conf, InputSplit split) throws IOException {
72 | if (split instanceof FileSplit) {
73 | this.filePathList=new Path[1];
74 | this.filePathList[0]=((FileSplit)split).getPath();
75 | } else if (split instanceof MultiFileSplit) {
76 | this.filePathList=((MultiFileSplit)split).getPaths();
77 | } else {
78 | throw new IOException("InputSplit is not a file split or a multi-file split - aborting");
79 | }
80 |
81 | fs = this.filePathList[0].getFileSystem(conf);
82 |
83 | // get the total file sizes
84 | for (int i=0; i < filePathList.length; i++) {
85 | totalFileSize += fs.getFileStatus(filePathList[i]).getLen();
86 | }
87 |
88 | Class<? extends CompressionCodec> codecClass=null;
89 |
90 | try {
91 | codecClass=conf.getClassByName("org.apache.hadoop.io.compress.GzipCodec").asSubclass(CompressionCodec.class);
92 | compressionCodec=(CompressionCodec)ReflectionUtils.newInstance(codecClass, conf);
93 | } catch (ClassNotFoundException cnfEx) {
94 | compressionCodec=null;
95 | LOG.info("!!! ClassNotFoundException thrown setting Gzip codec");
96 | }
97 |
98 | openNextFile();
99 | }
100 |
101 | private boolean openNextFile() {
102 | try {
103 | if (compressionInput!=null) {
104 | compressionInput.close();
105 | } else if (currentFile!=null) {
106 | currentFile.close();
107 | }
108 | currentFile=null;
109 | compressionInput=null;
110 |
111 | currentFilePath++;
112 | if (currentFilePath >= filePathList.length) { return false; }
113 |
114 | currentFile=this.fs.open(filePathList[currentFilePath]);
115 |
116 | // is the file gzipped?
117 | if ((compressionCodec!=null) && (filePathList[currentFilePath].getName().endsWith("gz"))) {
118 | compressionInput=new DataInputStream(compressionCodec.createInputStream(currentFile));
119 | LOG.info("Compression enabled");
120 | }
121 |
122 | } catch (IOException ex) {
123 | LOG.info("IOError opening " + filePathList[currentFilePath].toString() + " - message: " + ex.getMessage());
124 | return false;
125 | }
126 | return true;
127 | }
128 |
129 | public boolean next(LongWritable key, WritableWarcRecord value) throws IOException {
130 | DataInputStream whichStream=null;
131 | if (compressionInput!=null) {
132 | whichStream=compressionInput;
133 | } else if (currentFile!=null) {
134 | whichStream=currentFile;
135 | }
136 |
137 | if (whichStream==null) { return false; }
138 |
139 | WarcRecord newRecord=WarcRecord.readNextWarcRecord(whichStream);
140 | if (newRecord==null) {
141 | // try advancing the file; re-resolve the stream since openNextFile() replaced it
142 | if (openNextFile()) {
143 | whichStream = (compressionInput!=null) ? compressionInput : currentFile;
144 | newRecord=WarcRecord.readNextWarcRecord(whichStream);
145 | }
146 | if (newRecord==null) { return false; }
147 | }
148 |
149 | totalNumBytesRead += (long)newRecord.getTotalRecordLength();
150 | newRecord.setWarcFilePath(filePathList[currentFilePath].toString());
151 |
152 | // now, set our output variables
153 | value.setRecord(newRecord);
154 | key.set(recordNumber);
155 |
156 | recordNumber++;
157 | return true;
158 | }
159 |
160 | public LongWritable createKey() {
161 | return new LongWritable();
162 | }
163 |
164 | public WritableWarcRecord createValue() {
165 | return new WritableWarcRecord();
166 | }
167 |
168 | public long getPos() throws IOException {
169 | return totalNumBytesRead;
170 | }
171 |
172 | public void close() throws IOException {
173 | totalNumBytesRead=totalFileSize;
174 | if (compressionInput!=null) {
175 | compressionInput.close();
176 | } else if (currentFile!=null) {
177 | currentFile.close();
178 | }
179 | }
180 |
181 | public float getProgress() throws IOException {
182 | if (compressionInput!=null) {
183 | if (filePathList.length==0) { return 1.0f; }
184 | // return which file - can't do exact byte matching
185 | return (float)currentFilePath / (float)(filePathList.length);
186 | }
187 | if (totalFileSize==0) { return 0.0f; }
188 | return (float)totalNumBytesRead/(float)totalFileSize;
189 | }
190 |
191 | }
192 |
--------------------------------------------------------------------------------
/src/main/java/edu/cmu/lemurproject/WarcHTMLResponseRecord.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Container for a Warc Record of type "response"
3 | *
4 | * (C) 2009 - Carnegie Mellon University
5 | *
6 | * 1. Redistributions of this source code must retain the above copyright
7 | * notice, this list of conditions and the following disclaimer.
8 | * 2. The names "Lemur", "Indri", "University of Massachusetts",
9 | * "Carnegie Mellon", and "lemurproject" must not be used to
10 | * endorse or promote products derived from this software without
11 | * prior written permission. To obtain permission, contact
12 | * license@lemurproject.org.
13 | *
14 | * 4. Products derived from this software may not be called "Lemur" or "Indri"
15 | * nor may "Lemur" or "Indri" appear in their names without prior written
16 | * permission of The Lemur Project. To obtain permission,
17 | * contact license@lemurproject.org.
18 | *
19 | * THIS SOFTWARE IS PROVIDED BY THE LEMUR PROJECT AS PART OF THE CLUEWEB09
20 | * PROJECT AND OTHER CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
21 | * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
22 | * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN
23 | * NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY
24 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
28 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
29 | * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 | * POSSIBILITY OF SUCH DAMAGE.
31 | *
32 | * @author mhoy@cs.cmu.edu (Mark J. Hoy)
33 | */
34 |
35 | package edu.cmu.lemurproject;
36 |
37 | import java.io.BufferedReader;
38 | import java.io.ByteArrayInputStream;
39 | import java.io.IOException;
40 | import java.io.InputStreamReader;
41 | import java.net.URISyntaxException;
42 | import java.util.HashSet;
43 | import java.util.Iterator;
44 | import java.util.Vector;
45 | import java.util.regex.Matcher;
46 | import java.util.regex.Pattern;
47 |
48 | public class WarcHTMLResponseRecord {
49 |
50 | private WarcRecord warcRecord=new WarcRecord();
51 |
52 | private static String SINGLE_SPACE=" ";
53 |
54 | private static Pattern ALL_HTML_TAGS=Pattern.compile("<(.*?)>");
55 | private static Pattern A_HREF_PATTERN=Pattern.compile("[aA].+?[hH][rR][eE][fF]=['\"](.+?)['\"].*?");
56 | private static Pattern AREA_HREF_PATTERN=Pattern.compile("[aA][rR][eE][aA].+?[hH][rR][eE][fF]=['\"](.*?)['\"].*?");
57 | private static Pattern FRAME_SRC_PATTERN=Pattern.compile("[fF][rR][aA][mM][eE].+?[sS][rR][cC]=['\"](.*?)['\"].*?");
58 | private static Pattern IFRAME_SRC_PATTERN=Pattern.compile("[iI][fF][rR][aA][mM][eE].+?[sS][rR][cC]=['\"](.*?)['\"].*?");
59 | private static Pattern HTTP_START_PATTERN=Pattern.compile("^[hH][tT][tT][pP][sS]?://.*");
60 |
61 | // create our pattern set
62 | private Vector<Pattern> patternSet=new Vector<Pattern>();
63 |
64 | /**
65 | * Default constructor
66 | */
67 | public WarcHTMLResponseRecord() {
68 | createPatternSet();
69 | }
70 |
71 | /**
72 | * Copy constructor
73 | * @param o
74 | */
75 | public WarcHTMLResponseRecord(WarcHTMLResponseRecord o) {
76 | this.warcRecord.set(o.warcRecord);
77 | createPatternSet();
78 | }
79 |
80 | /**
81 | * Constructor creation from a generic WARC record
82 | * @param o
83 | */
84 | public WarcHTMLResponseRecord(WarcRecord o) {
85 | if (o.getHeaderRecordType().compareToIgnoreCase("response")==0) {
86 | this.warcRecord.set(o);
87 | }
88 | createPatternSet();
89 | }
90 |
91 | private void createPatternSet() {
92 | patternSet.add(A_HREF_PATTERN);
93 | patternSet.add(AREA_HREF_PATTERN);
94 | patternSet.add(FRAME_SRC_PATTERN);
95 | patternSet.add(IFRAME_SRC_PATTERN);
96 | }
97 |
98 | public void setRecord(WarcRecord o) {
99 | if (o.getHeaderRecordType().compareToIgnoreCase("response")==0) {
100 | this.warcRecord.set(o);
101 | }
102 | }
103 |
104 | public WarcRecord getRawRecord() {
105 | return warcRecord;
106 | }
107 |
108 | public String getTargetURI() {
109 | return warcRecord.getHeaderMetadataItem("WARC-Target-URI");
110 | }
111 |
112 | public String getTargetTrecID() {
113 | return warcRecord.getHeaderMetadataItem("WARC-TREC-ID");
114 | }
115 |
116 | private String getNormalizedContentURL(String pageURL, String contentURL) {
117 | String fixedContentURL = contentURL;
118 | try {
119 | // resolve any potentially relative paths to the full URL based on the page
120 | java.net.URI baseURI = new java.net.URI(pageURL);
121 | // ensure that the content doesn't have query parameters - if so, strip them
122 | int contentParamIndex = contentURL.indexOf("?");
123 | if (contentParamIndex > 0) {
124 | fixedContentURL = contentURL.substring(0, contentParamIndex);
125 | }
126 | java.net.URI resolvedURI = baseURI.resolve(fixedContentURL);
127 | return resolvedURI.toString();
128 | } catch (URISyntaxException ex) {
129 | } catch (java.lang.IllegalArgumentException iaEx) {
130 | return fixedContentURL;
131 | } catch (Exception gEx) {
132 | }
133 | return "";
134 | }
135 |
136 | private HashSet<String> getMatchesOutputSet(Vector<String> tagSet, String baseURL) {
137 | HashSet<String> retSet=new HashSet<String>();
138 |
139 | Iterator<String> vIter=tagSet.iterator();
140 | while (vIter.hasNext()) {
141 | String thisCheckPiece=vIter.next();
142 | Iterator<Pattern> pIter=patternSet.iterator();
143 | boolean hasAdded=false;
144 | while (!hasAdded && pIter.hasNext()) {
145 | Pattern thisPattern=pIter.next();
146 | Matcher matcher=thisPattern.matcher(thisCheckPiece);
147 | if (matcher.find() && (matcher.groupCount() > 0)) {
148 | String thisMatch=getNormalizedContentURL(baseURL, matcher.group(1));
149 | if (HTTP_START_PATTERN.matcher(thisMatch).matches()) {
150 | if (!retSet.contains(thisMatch) && !baseURL.equals(thisMatch)) {
151 | retSet.add(thisMatch);
152 | hasAdded=true;
153 | } // end if (!retSet.contains(thisMatch))
154 | } // end if (HTTP_START_PATTERN.matcher(thisMatch).matches())
155 | } // end if (matcher.find() && (matcher.groupCount() > 0))
156 | matcher.reset();
157 | } // end while (!hasAdded && pIter.hasNext())
158 | } // end while (vIter.hasNext())
159 |
160 | return retSet;
161 | }
162 |
163 | /**
164 | * Gets a vector of normalized URLs (normalized to this target URI)
165 | * of the outlinks of the page
166 | * @return
167 | */
168 | public Vector<String> getURLOutlinks() {
169 | Vector<String> retVec = new Vector<String>();
170 |
171 | String baseURL = getTargetURI();
172 | if ((baseURL == null) || (baseURL.length() == 0)) {
173 | return retVec;
174 | }
175 |
176 | byte[] contentBytes=warcRecord.getContent();
177 |
178 | ByteArrayInputStream contentStream=new ByteArrayInputStream(contentBytes);
179 | BufferedReader inReader=new BufferedReader(new InputStreamReader(contentStream));
180 |
181 | // forward to the first \n\n
182 | try {
183 | boolean inHeader=true;
184 | String line=null;
185 | while (inHeader && ((line=inReader.readLine())!=null)) {
186 | if (line.trim().length()==0) {
187 | inHeader=false;
188 | }
189 | }
190 |
191 | // now we have the rest of the lines
192 | // read them all into a string buffer
193 | // to remove all new lines
194 | Vector<String> htmlTags=new Vector<String>();
195 | while ((line=inReader.readLine())!=null) {
196 | // get all HTML tags from the line...
197 | Matcher HTMLMatcher=ALL_HTML_TAGS.matcher(line);
198 | while (HTMLMatcher.find()) {
199 | htmlTags.add(HTMLMatcher.group(1));
200 | }
201 | }
202 |
203 | HashSet<String> retSet=getMatchesOutputSet(htmlTags, baseURL);
204 |
205 | Iterator<String> oIter=retSet.iterator();
206 | while (oIter.hasNext()) {
207 | String thisValue=oIter.next();
208 | if (!thisValue.equals(baseURL)) {
209 | retVec.add(thisValue);
210 | }
211 | }
212 |
213 | } catch (IOException ioEx) {
214 | retVec.clear();
215 | }
216 |
217 | return retVec;
218 | }
219 |
220 | }
221 |
--------------------------------------------------------------------------------
/src/main/java/edu/cmu/lemurproject/WarcRecord.java:
--------------------------------------------------------------------------------
1 | /*
2 | Lemur License Agreement
3 |
4 | Copyright (c) 2000-2011 The Lemur Project. All rights reserved.
5 |
6 | Redistribution and use in source and binary forms, with or without
7 | modification, are permitted provided that the following conditions
8 | are met:
9 |
10 | 1. Redistributions of source code must retain the above copyright
11 | notice, this list of conditions and the following disclaimer.
12 |
13 | 2. Redistributions in binary form must reproduce the above copyright
14 | notice, this list of conditions and the following disclaimer in
15 | the documentation and/or other materials provided with the
16 | distribution.
17 |
18 | 3. The names "Lemur", "Indri", "University of Massachusetts" and
19 | "Carnegie Mellon" must not be used to endorse or promote products
20 | derived from this software without prior written permission. To
21 | obtain permission, contact license@lemurproject.org
22 |
23 | 4. Products derived from this software may not be called "Lemur" or "Indri"
24 | nor may "Lemur" or "Indri" appear in their names without prior written
25 | permission of The Lemur Project. To obtain permission,
26 | contact license@lemurproject.org.
27 |
28 | THIS SOFTWARE IS PROVIDED BY THE LEMUR PROJECT AND OTHER
29 | CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
30 | BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
31 | FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
32 | COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
33 | INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
34 | BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
35 | OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
36 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
37 | TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
38 | USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
39 | DAMAGE.
40 |
41 | */
42 | /*
43 | * To change this template, choose Tools | Templates
44 | * and open the template in the editor.
45 | */
46 | package edu.cmu.lemurproject;
47 |
48 | import java.io.DataInput;
49 | import java.io.DataInputStream;
50 | import java.io.DataOutput;
51 | import java.io.EOFException;
52 | import java.io.IOException;
53 | import java.io.UnsupportedEncodingException;
54 | import java.util.HashMap;
55 | import java.util.Iterator;
56 | import java.util.Map.Entry;
57 | import java.util.Set;
58 | // import org.apache.commons.logging.Log;
59 | // import org.apache.commons.logging.LogFactory;
60 |
61 | /**
62 | *
63 | * @author mhoy
64 | */
65 | public class WarcRecord {
66 |
67 | // public static final Log LOG = LogFactory.getLog(WarcRecord.class);
68 |
69 | public static String WARC_VERSION = "WARC/";
70 | public static String WARC_VERSION_LINE = "WARC/0.18\n";
71 |
72 | ////public static String WARC_VERSION = "WARC/1.0";
73 | //public static String WARC_VERSION = "WARC/0.18";
74 | ////public static String WARC_VERSION_LINE = "WARC/1.0\n";
75 | //public static String WARC_VERSION_LINE = "WARC/0.18\n";
76 | private static String NEWLINE="\n";
77 | private static String CR_NEWLINE="\r\n";
78 |
79 | private static byte MASK_THREE_BYTE_CHAR=(byte)(0xE0);
80 | private static byte MASK_TWO_BYTE_CHAR=(byte)(0xC0);
81 | private static byte MASK_TOPMOST_BIT=(byte)(0x80);
82 | private static byte MASK_BOTTOM_SIX_BITS=(byte)(0x1F);
83 | private static byte MASK_BOTTOM_FIVE_BITS=(byte)(0x3F);
84 | private static byte MASK_BOTTOM_FOUR_BITS=(byte)(0x0F);
85 |
86 | private static String LINE_ENDING="\n";
87 |
88 | private static String readLineFromInputStream(DataInputStream in) throws IOException {
89 | StringBuilder retString=new StringBuilder();
90 | boolean found_cr = false;
91 | boolean keepReading=true;
92 | try {
93 | do {
94 | char thisChar=0;
95 | byte readByte=in.readByte();
96 | // check to see if it's a multibyte character
97 | if ((readByte & MASK_THREE_BYTE_CHAR) == MASK_THREE_BYTE_CHAR) {
98 | found_cr = false;
99 | // need to read the next 2 bytes
100 | if (in.available() < 2) {
101 | // treat these all as individual characters
102 | retString.append((char)readByte);
103 | int numAvailable=in.available();
104 | for (int i=0; i < numAvailable; i++) {
105 | retString.append((char)(in.readByte()));
106 | }
107 | continue;
108 | }
109 | byte secondByte=in.readByte();
110 | byte thirdByte=in.readByte();
111 | // ensure the topmost bit is set
112 | if (((secondByte & MASK_TOPMOST_BIT)!=MASK_TOPMOST_BIT) || ((thirdByte & MASK_TOPMOST_BIT)!=MASK_TOPMOST_BIT)) {
113 | //treat these as individual characters
114 | retString.append((char)readByte);
115 | retString.append((char)secondByte);
116 | retString.append((char)thirdByte);
117 | continue;
118 | }
119 | int finalVal=(thirdByte & MASK_BOTTOM_FIVE_BITS) + 64*(secondByte & MASK_BOTTOM_FIVE_BITS) + 4096*(readByte & MASK_BOTTOM_FOUR_BITS);
120 | thisChar=(char)finalVal;
121 | } else if ((readByte & MASK_TWO_BYTE_CHAR) == MASK_TWO_BYTE_CHAR) {
122 | found_cr = false;
123 |
124 | // need to read next byte
125 | if (in.available() < 1) {
126 | // treat this as individual characters
127 | retString.append((char)readByte);
128 | continue;
129 | }
130 | byte secondByte=in.readByte();
131 | if ((secondByte & MASK_TOPMOST_BIT)!=MASK_TOPMOST_BIT) {
132 | retString.append((char)readByte);
133 | retString.append((char)secondByte);
134 | continue;
135 | }
136 | int finalVal=(secondByte & MASK_BOTTOM_FIVE_BITS) + 64*(readByte & MASK_BOTTOM_SIX_BITS);
137 | thisChar=(char)finalVal;
138 | } else {
139 | // interpret it as a single byte
140 | thisChar=(char)readByte;
141 | }
142 | // Look for carriage return; if found set a flag
143 | if (thisChar=='\r') {
144 | found_cr = true;
145 | }
146 | if (thisChar=='\n') {
147 | // if the linefeed is the next character after the carriage return
148 | if (found_cr) {
149 | LINE_ENDING = CR_NEWLINE;
150 | } else {
151 | LINE_ENDING = NEWLINE;
152 | }
153 | keepReading=false;
154 | } else {
155 | retString.append(thisChar);
156 | }
157 | } while (keepReading);
158 | } catch (EOFException eofEx) {
159 | return null;
160 | }
161 |
162 | if (retString.length()==0) {
163 | return "";
164 | }
165 |
166 | return retString.toString();
167 | }
168 |
169 | private static byte[] readNextRecord(DataInputStream in, StringBuffer headerBuffer) throws IOException {
170 | if (in==null) { return null; }
171 | if (headerBuffer==null) { return null; }
172 |
173 | String line=null;
174 | boolean foundMark=false;
175 | byte[] retContent=null;
176 |
177 | // cannot be using a buffered reader here!!!!
178 | // just read the header
179 | // first - find our WARC header
180 | while ((!foundMark) && ((line=readLineFromInputStream(in))!=null)) {
181 | if (line.startsWith(WARC_VERSION)) {
182 | WARC_VERSION_LINE = line;
183 | foundMark=true;
184 | }
185 | }
186 |
187 | // no WARC mark?
188 | if (!foundMark) { return null; }
189 |
190 | // LOG.info("Found WARC_VERSION");
191 |
192 | int contentLength = -1;
193 | // read until we see contentLength then an empty line
194 | // (to handle malformed ClueWeb09 headers that have blank lines)
195 | // get the content length and set our retContent
196 | for (line = readLineFromInputStream(in).trim();
197 | line.length() > 0 || contentLength < 0;
198 | line = readLineFromInputStream(in).trim()) {
199 |
200 | if (line.length() > 0 ) {
201 | headerBuffer.append(line);
202 | headerBuffer.append(LINE_ENDING);
203 |
204 | // find the content length designated by Content-Length:
205 | String[] parts = line.split(":", 2);
206 | if (parts.length == 2 && parts[0].equals("Content-Length")) {
207 | try {
208 | contentLength=Integer.parseInt(parts[1].trim());
209 | // LOG.info("WARC record content length: " + contentLength);
210 | } catch (NumberFormatException nfEx) {
211 | contentLength=-1;
212 | }
213 | }
214 | }
215 | }
216 |
217 | // now read the bytes of the content
218 | retContent=new byte[contentLength];
219 | int totalWant=contentLength;
220 | int totalRead=0;
221 | //
222 | // LOOP TO REMOVE LEADING CR * LF
223 | // To prevent last few characters from being cut off of the content
224 | // when reading
225 | //
226 | while ((totalRead == 0) && (totalRead < contentLength)) {
227 | byte CR = in.readByte();
228 | byte LF = in.readByte();
229 | if ((CR != 13) && (LF != 10)) {
230 | retContent[0] = CR;
231 | retContent[1] = LF;
232 | totalRead = 2;
233 | totalWant = contentLength - totalRead;
234 | }
235 | }
236 | //
237 | //
238 | //
239 | while (totalRead < contentLength) {
240 | try {
241 | int numRead=in.read(retContent, totalRead, totalWant);
242 | if (numRead < 0) {
243 | return null;
244 | } else {
245 | totalRead += numRead;
246 | totalWant = contentLength-totalRead;
247 | } // end if (numRead < 0) / else
248 | } catch (EOFException eofEx) {
249 | // resize to what we have
250 | if (totalRead > 0) {
251 | byte[] newReturn=new byte[totalRead];
252 | System.arraycopy(retContent, 0, newReturn, 0, totalRead);
253 | return newReturn;
254 | } else {
255 | return null;
256 | }
257 | } // end try/catch (EOFException)
258 | } // end while (totalRead < contentLength)
259 |
260 | return retContent;
261 | }
262 |
263 | public static WarcRecord readNextWarcRecord(DataInputStream in) throws IOException {
264 | // LOG.info("Starting read of WARC record");
265 | StringBuffer recordHeader=new StringBuffer();
266 | byte[] recordContent=readNextRecord(in, recordHeader);
267 | if (recordContent==null) {
268 | // LOG.info("WARC content is null - file is complete");
269 | return null;
270 | }
271 |
272 | // extract out our header information
273 | String thisHeaderString=recordHeader.toString();
274 |
275 |
276 | String[] headerLines=thisHeaderString.split(LINE_ENDING);
277 |
278 | WarcRecord retRecord=new WarcRecord();
279 | for (int i=0; i < headerLines.length; i++) {
280 | String[] pieces=headerLines[i].split(":", 2);
281 | if (pieces.length!=2) {
282 | retRecord.addHeaderMetadata(pieces[0], "");
283 | continue;
284 | }
285 | String thisKey=pieces[0].trim();
286 | String thisValue=pieces[1].trim();
287 |
288 | // check for known keys
289 | if (thisKey.equals("WARC-Type")) {
290 | // LOG.info("Setting WARC record type: " + thisValue);
291 | retRecord.setWarcRecordType(thisValue);
292 | } else if (thisKey.equals("WARC-Date")) {
293 | retRecord.setWarcDate(thisValue);
294 | } else if (thisKey.equals("WARC-Record-ID")) {
295 | // LOG.info("Setting WARC record ID: " + thisValue);
296 | retRecord.setWarcUUID(thisValue);
297 | } else if (thisKey.equals("Content-Type")) {
298 | retRecord.setWarcContentType(thisValue);
299 | } else {
300 | retRecord.addHeaderMetadata(thisKey, thisValue);
301 | }
302 | }
303 |
304 | // set the content
305 | retRecord.setContent(recordContent);
306 |
307 | return retRecord;
308 | }
309 |
310 | public class WarcHeader {
311 | public String contentType="";
312 | public String UUID="";
313 | public String dateString="";
314 | public String recordType="";
315 | public HashMap<String, String> metadata=new HashMap<String, String>();
316 | public int contentLength=0;
317 |
318 | public WarcHeader() {
319 | }
320 |
321 | public WarcHeader(WarcHeader o) {
322 | this.contentType=o.contentType;
323 | this.UUID=o.UUID;
324 | this.dateString=o.dateString;
325 | this.recordType=o.recordType;
326 | this.metadata.putAll(o.metadata);
327 | this.contentLength=o.contentLength;
328 | }
329 |
330 | public void write(DataOutput out) throws IOException {
331 | out.writeUTF(contentType);
332 | out.writeUTF(UUID);
333 | out.writeUTF(dateString);
334 | out.writeUTF(recordType);
335 | out.writeInt(metadata.size());
336 | Iterator<Entry<String, String>> metadataIterator=metadata.entrySet().iterator();
337 | while (metadataIterator.hasNext()) {
338 | Entry<String, String> thisEntry=metadataIterator.next();
339 | out.writeUTF(thisEntry.getKey());
340 | out.writeUTF(thisEntry.getValue());
341 | }
342 | out.writeInt(contentLength);
343 | }
344 |
345 | public void readFields(DataInput in) throws IOException {
346 | contentType=in.readUTF();
347 | UUID=in.readUTF();
348 | dateString=in.readUTF();
349 | recordType=in.readUTF();
350 | metadata.clear();
351 | int numMetaItems=in.readInt();
352 | for (int i=0; i < numMetaItems; i++) {
353 | String thisKey=in.readUTF();
354 | String thisValue=in.readUTF();
355 | metadata.put(thisKey, thisValue);
356 | }
357 | contentLength=in.readInt();
358 | }
359 |
360 | @Override
361 | public String toString() {
362 | StringBuffer retBuffer=new StringBuffer();
363 |
364 | retBuffer.append(WARC_VERSION_LINE);
365 | retBuffer.append(LINE_ENDING);
366 |
367 | retBuffer.append("WARC-Type: " + recordType + LINE_ENDING);
368 | retBuffer.append("WARC-Date: " + dateString + LINE_ENDING);
369 |
370 | Iterator<Entry<String, String>> metadataIterator=metadata.entrySet().iterator();
371 | while (metadataIterator.hasNext()) {
372 | Entry<String, String> thisEntry=metadataIterator.next();
373 | retBuffer.append(thisEntry.getKey());
374 | retBuffer.append(": ");
375 | retBuffer.append(thisEntry.getValue());
376 | retBuffer.append(LINE_ENDING);
377 | }
378 | // Keep this as the last WARC-...
379 | retBuffer.append("WARC-Record-ID: " + UUID + LINE_ENDING);
380 |
381 | retBuffer.append("Content-Type: " + contentType + LINE_ENDING);
382 | retBuffer.append("Content-Length: " + contentLength + LINE_ENDING);
383 |
384 | return retBuffer.toString();
385 | }
386 | }
387 |
388 | private WarcHeader warcHeader=new WarcHeader();
389 | private byte[] warcContent=null;
390 | private String warcFilePath="";
391 |
392 | public WarcRecord() {
393 |
394 | }
395 |
396 | public WarcRecord(WarcRecord o) {
397 | this.warcHeader=new WarcHeader(o.warcHeader);
398 | this.warcContent=o.warcContent;
399 | }
400 |
401 | public int getTotalRecordLength() {
402 | int headerLength=warcHeader.toString().length();
403 | return (headerLength + warcContent.length);
404 | }
405 |
406 | public void set(WarcRecord o) {
407 | this.warcHeader=new WarcHeader(o.warcHeader);
408 | this.warcContent=o.warcContent;
409 | }
410 |
411 | public String getWarcFilePath() {
412 | return warcFilePath;
413 | }
414 |
415 | public void setWarcFilePath(String path) {
416 | warcFilePath=path;
417 | }
418 |
419 | public void setWarcRecordType(String recordType) {
420 | warcHeader.recordType=recordType;
421 | }
422 |
423 | public void setWarcContentType(String contentType) {
424 | warcHeader.contentType=contentType;
425 | }
426 |
427 | public void setWarcDate(String dateString) {
428 | warcHeader.dateString=dateString;
429 | }
430 |
431 | public void setWarcUUID(String UUID) {
432 | warcHeader.UUID=UUID;
433 | }
434 |
435 | public void addHeaderMetadata(String key, String value) {
436 | //System.out.println("+-- WarRecord.addHeaderMetadata key=" + key + " value=" + value);
437 | // don't allow addition of known keys
438 | if (key.equals("WARC-Type")) { return; }
439 | if (key.equals("WARC-Date")) { return; }
440 | if (key.equals("WARC-Record-ID")) { return; }
441 | if (key.equals("Content-Type")) { return; }
442 | if (key.equals("Content-Length")) { return; }
443 |
444 | warcHeader.metadata.put(key, value);
445 | }
446 |
447 |
448 | public void clearHeaderMetadata() {
449 | warcHeader.metadata.clear();
450 | }
451 |
452 | public Set<Entry<String, String>> getHeaderMetadata() {
453 | return warcHeader.metadata.entrySet();
454 | }
455 |
456 | public String getHeaderMetadataItem(String key) {
457 | //System.out.println("+++ WarRecord.getHeaderMetadataItem key=" + key); // WARC-Target-URI
458 | if (key.equals("WARC-Type")) { return warcHeader.recordType; }
459 | if (key.equals("WARC-Date")) { return warcHeader.dateString; }
460 | if (key.equals("WARC-Record-ID")) { return warcHeader.UUID; }
461 | if (key.equals("Content-Type")) { return warcHeader.contentType; }
462 | if (key.equals("Content-Length")) { return Integer.toString(warcHeader.contentLength); }
463 |
464 | return warcHeader.metadata.get(key);
465 | }
466 |
467 | public void setContent(byte[] content) {
468 | warcContent=content;
469 | warcHeader.contentLength=content.length;
470 | }
471 |
472 | public void setContent(String content) {
473 | setContent(content.getBytes());
474 | }
475 | public void setContentLength(int len) {
476 | warcHeader.contentLength=len;
477 | }
478 |
479 | public byte[] getContent() {
480 | return warcContent;
481 | }
482 | public byte[] getByteContent() {
483 | return warcContent;
484 | }
485 |
486 | public String getContentUTF8() {
487 | String retString=null;
488 | try {
489 | retString = new String(warcContent, "UTF-8");
490 | } catch (UnsupportedEncodingException ex) {
491 | retString=new String(warcContent);
492 | }
493 | return retString;
494 | }
495 |
496 | public String getHeaderRecordType() {
497 | return warcHeader.recordType;
498 | }
499 |
500 | @Override
501 | public String toString() {
502 | StringBuffer retBuffer=new StringBuffer();
503 | retBuffer.append(warcHeader.toString());
504 | retBuffer.append(LINE_ENDING);
505 | retBuffer.append(new String(warcContent));
506 | return retBuffer.toString();
507 | }
508 |
509 | public String getHeaderString() {
510 | return warcHeader.toString();
511 | }
512 |
513 | public void write(DataOutput out) throws IOException {
514 | warcHeader.write(out);
515 | out.write(warcContent);
516 | }
517 |
518 | public void readFields(DataInput in) throws IOException {
519 | warcHeader.readFields(in);
520 | int contentLengthBytes=warcHeader.contentLength;
521 | warcContent=new byte[contentLengthBytes];
522 | in.readFully(warcContent);
523 | }
524 |
525 | }
526 |
527 |
--------------------------------------------------------------------------------
/src/main/java/edu/cmu/lemurproject/WritableWarcRecord.java:
--------------------------------------------------------------------------------
1 | /**
2 | * An extension of the Writable object for Hadoop for a Warc Record
3 | *
4 | * (C) 2009 - Carnegie Mellon University
5 | *
6 | * 1. Redistributions of this source code must retain the above copyright
7 | * notice, this list of conditions and the following disclaimer.
8 | * 2. The names "Lemur", "Indri", "University of Massachusetts",
9 | * "Carnegie Mellon", and "lemurproject" must not be used to
10 | * endorse or promote products derived from this software without
11 | * prior written permission. To obtain permission, contact
12 | * license@lemurproject.org.
13 | *
14 | * 4. Products derived from this software may not be called "Lemur" or "Indri"
15 | * nor may "Lemur" or "Indri" appear in their names without prior written
16 | * permission of The Lemur Project. To obtain permission,
17 | * contact license@lemurproject.org.
18 | *
19 | * THIS SOFTWARE IS PROVIDED BY THE LEMUR PROJECT AS PART OF THE CLUEWEB09
20 | * PROJECT AND OTHER CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
21 | * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
22 | * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN
23 | * NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY
24 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
28 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
29 | * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 | * POSSIBILITY OF SUCH DAMAGE.
31 | *
32 | * @author mhoy@cs.cmu.edu (Mark J. Hoy)
33 | */
34 |
35 | package edu.cmu.lemurproject;
36 |
37 | import edu.cmu.lemurproject.WarcRecord;
38 | import java.io.DataInput;
39 | import java.io.DataOutput;
40 | import java.io.IOException;
41 | import org.apache.hadoop.io.Writable;
42 |
43 | public class WritableWarcRecord implements Writable {
44 |
45 | WarcRecord record=null;
46 |
47 | public WritableWarcRecord() {
48 | record=new WarcRecord();
49 | }
50 |
51 | public WritableWarcRecord(WarcRecord o) {
52 | record=new WarcRecord(o);
53 | }
54 |
55 | public WarcRecord getRecord() {
56 | return record;
57 | }
58 |
59 | public void setRecord(WarcRecord rec) {
60 | record=new WarcRecord(rec);
61 | }
62 |
63 | public void write(DataOutput out) throws IOException {
64 | if (record!=null) {
65 | record.write(out);
66 | }
67 | }
68 |
69 | public void readFields(DataInput in) throws IOException {
70 | if (record!=null) {
71 | record.readFields(in);
72 | }
73 | }
74 |
75 | }
76 |
--------------------------------------------------------------------------------
/src/main/java/org/commoncrawl/examples/java_warc/IProcessWarcRecord.java:
--------------------------------------------------------------------------------
1 | package org.commoncrawl.examples.java_warc;
2 |
3 | /**
4 | * author: Mark Watson
5 | */
6 |
7 | /**
8 | * callback interface for handling WARC record data
9 | */
10 | public interface IProcessWarcRecord {
11 | public void process(String url, String content);
12 | public void done(); // called once when there is no more data to be processed
13 | }
14 |
--------------------------------------------------------------------------------
/src/main/java/org/commoncrawl/examples/java_warc/ReadS3Bucket.java:
--------------------------------------------------------------------------------
1 | package org.commoncrawl.examples.java_warc;
2 |
3 | import com.amazonaws.services.s3.AmazonS3;
4 | import com.amazonaws.services.s3.AmazonS3Client;
5 | import com.amazonaws.services.s3.model.GetObjectRequest;
6 | import com.amazonaws.services.s3.model.ObjectListing;
7 | import com.amazonaws.services.s3.model.S3Object;
8 | import com.amazonaws.services.s3.model.S3ObjectSummary;
9 | import edu.cmu.lemurproject.WarcHTMLResponseRecord;
10 | import edu.cmu.lemurproject.WarcRecord;
11 |
12 | import java.io.DataInputStream;
13 | import java.io.InputStream;
14 | import java.util.List;
15 | import java.util.zip.GZIPInputStream;
16 |
17 | /**
18 | * author: Mark Watson
19 | */
20 | public class ReadS3Bucket {
21 | static public void process(AmazonS3 s3, String bucketName, String prefix, int max) {
22 | int count = 0;
23 |
24 | // use a callback class for handling WARC record data:
25 | IProcessWarcRecord processor = new SampleProcessWarcRecord();
26 |
27 | ObjectListing list = s3.listObjects(bucketName, prefix);
28 |
29 | do { // reading summaries code derived from stackoverflow example posted by Alberto A. Medina:
30 |
31 | List<S3ObjectSummary> summaries = list.getObjectSummaries();
32 | for (S3ObjectSummary summary : summaries) {
33 | try {
34 | String key = summary.getKey();
35 | System.out.println("+ key: " + key);
36 | S3Object object = s3.getObject(new GetObjectRequest(bucketName, key));
37 | InputStream objectData = object.getObjectContent();
38 | GZIPInputStream gzInputStream=new GZIPInputStream(objectData);
39 | DataInputStream inStream = new DataInputStream(gzInputStream);
40 |
41 | WarcRecord thisWarcRecord;
42 | while ((thisWarcRecord = WarcRecord.readNextWarcRecord(inStream)) != null) {
43 | //System.out.println("-- thisWarcRecord.getHeaderRecordType() = " + thisWarcRecord.getHeaderRecordType());
44 | if (thisWarcRecord.getHeaderRecordType().equals("response")) {
45 | WarcHTMLResponseRecord htmlRecord = new WarcHTMLResponseRecord(thisWarcRecord);
46 | String thisTargetURI = htmlRecord.getTargetURI();
47 | String thisContentUtf8 = htmlRecord.getRawRecord().getContentUTF8();
48 | // handle WARC record content:
49 | processor.process(thisTargetURI, thisContentUtf8);
50 | }
51 | }
52 | inStream.close();
53 | } catch (Exception ex) {
54 | ex.printStackTrace();
55 | }
56 | if (++count >= max) return;
57 | }
58 | list = s3.listNextBatchOfObjects(list);
59 | } while (list.isTruncated());
60 | // done processing all WARC records:
61 | processor.done();
62 |
63 | }
64 |
65 | static public void main(String[] args) {
66 | AmazonS3Client s3 = new AmazonS3Client();
67 | process(s3, "commoncrawl", "crawl-data/CC-MAIN-2013-48", 20);
68 | }
69 | }
70 |
--------------------------------------------------------------------------------
/src/main/java/org/commoncrawl/examples/java_warc/ReadWARC.java:
--------------------------------------------------------------------------------
1 | // based on an example from http://boston.lti.cs.cmu.edu/clueweb09/wiki/tiki-index.php?page=Working+with+WARC+Files
2 |
3 | package org.commoncrawl.examples.java_warc;
4 |
5 | import java.io.DataInputStream;
6 | import java.io.FileInputStream;
7 | import java.io.IOException;
8 | import java.util.zip.GZIPInputStream;
9 | import edu.cmu.lemurproject.WarcRecord;
10 | import edu.cmu.lemurproject.WarcHTMLResponseRecord;
11 |
12 | public class ReadWARC {
13 |
14 | public static void main(String[] args) throws IOException {
15 |
16 | // use a callback class for handling WARC record data:
17 | IProcessWarcRecord processor = new SampleProcessWarcRecord();
18 |
19 | String inputWarcFile="CC-MAIN-20140305125104-00002-ip-10-183-142-35.ec2.internal.warc.gz";
20 | GZIPInputStream gzInputStream=new GZIPInputStream(new FileInputStream(inputWarcFile));
21 | DataInputStream inStream=new DataInputStream(gzInputStream);
22 |
23 | WarcRecord thisWarcRecord;
24 | while ((thisWarcRecord=WarcRecord.readNextWarcRecord(inStream))!=null) {
25 | System.out.println("%% thisWarcRecord.getHeaderRecordType() = " + thisWarcRecord.getHeaderRecordType());
26 | if (thisWarcRecord.getHeaderRecordType().equals("response")) {
27 | WarcHTMLResponseRecord htmlRecord=new WarcHTMLResponseRecord(thisWarcRecord);
28 | String thisTargetURI=htmlRecord.getTargetURI();
29 | String thisContentUtf8 = htmlRecord.getRawRecord().getContentUTF8();
30 |
31 | // handle WARC record content:
32 | processor.process(thisTargetURI, thisContentUtf8);
33 | }
34 | }
35 | inStream.close();
36 | // done processing all WARC records:
37 | processor.done();
38 | }
39 | }
40 |
--------------------------------------------------------------------------------
/src/main/java/org/commoncrawl/examples/java_warc/SampleProcessWarcRecord.java:
--------------------------------------------------------------------------------
1 | package org.commoncrawl.examples.java_warc;
2 |
3 | /**
4 | * author: Mark Watson
5 | */
6 |
7 | /**
8 | * a sample callback class for handling WARC record data by implementing IProcessWarcRecord interface
9 | */
10 | public class SampleProcessWarcRecord implements IProcessWarcRecord {
11 | @Override
12 | public void process(String url, String content) {
13 | System.out.println("url: " + url);
14 | System.out.println("content: " + url + "\n\n" + content + "\n");
15 | }
16 |
17 | @Override
18 | public void done() {
19 | // place any code here to save data, etc.
20 | }
21 | }
22 |
--------------------------------------------------------------------------------