├── .gitignore
├── README-Amazon-AMI
├── VERSION
├── bin
│   ├── ccCopyToHDFS
│   ├── ccListInvalidSegments
│   └── ccRunExample
├── build.properties
├── build.xml
├── conf
│   └── mapred.xml
├── lib
│   ├── gson-2.2.1.jar
│   ├── guava-12.0.jar
│   ├── httpcore-4.2.1.jar
│   └── jsoup-1.6.3.jar
├── src
│   ├── java
│   │   └── org
│   │       └── commoncrawl
│   │           ├── compressors
│   │           │   ├── CompressorInputStream.java
│   │           │   └── gzip
│   │           │       └── GzipCompressorInputStream.java
│   │           ├── examples
│   │           │   ├── ExampleArcMicroformat.java
│   │           │   ├── ExampleMetadataDomainPageCount.java
│   │           │   ├── ExampleMetadataStats.java
│   │           │   └── ExampleTextWordCount.java
│   │           ├── hadoop
│   │           │   └── mapred
│   │           │       ├── ArcInputFormat.java
│   │           │       ├── ArcRecord.java
│   │           │       └── ArcRecordReader.java
│   │           └── nutch
│   │               └── tools
│   │                   └── arc
│   │                       ├── ArcInputFormat.java
│   │                       └── ArcRecordReader.java
│   └── ruby
│       ├── ExampleArcParseMap.rb
│       ├── ExampleArcParseReduce.rb
│       └── README
└── test
    └── java
        └── org
            └── commoncrawl
                └── hadoop
                    └── mapred
                        └── TestArcRecordCC.java
/.gitignore:
--------------------------------------------------------------------------------
1 | build
2 | build-test
3 | dist
4 | output
5 |
--------------------------------------------------------------------------------
/README-Amazon-AMI:
--------------------------------------------------------------------------------
1 | Common Crawl Quick Start Amazon AMI
2 | -----------------------------------
3 |
4 | Welcome to the Common Crawl Quick Start Amazon AMI!
5 |
6 | The Common Crawl corpus is a copy of billions of web documents and their
7 | metadata, stored as an Amazon S3 Public Dataset and available for analysis.
8 |
9 | Here are the steps you need to follow to run your first job against the
10 | Common Crawl corpus:
11 |
12 | 1. Find your Amazon Access Credentials (Amazon Access ID & Amazon Secret Key)
13 | and save them as two lines in this file:
14 |
15 | /home/ec2-user/.awssecret
16 |
17 | For example:
18 |
19 | JLASKHJFLKDHJLFKSJDF
20 | DFHSDJHhhoiaGKHDFa6sd42rwuhfapgfuAGSDAjh
21 |
22 | Change the permissions of this file to read/write only by 'ec2-user':
23 |
24 | chmod 600 /home/ec2-user/.awssecret
25 |
26 | Now you can use Tim Kay's AWS Command Line tool. Try this:
27 |
28 | aws ls -1 aws-publicdatasets/common-crawl/parse-output/segment/1341690167474/metadata-
29 |
30 | If you are planning on using the local Hadoop cluster, you should also consider
31 | setting these properties in /etc/hadoop/hadoop-site.xml:
32 |
33 | fs.s3n.awsAccessKeyId
34 | fs.s3n.awsSecretAccessKey
35 |
36 | 2. Move to the 'commoncrawl-examples' directory. Make sure it is up-to-date:
37 |
38 | cd ~/commoncrawl-examples; git pull
39 |
40 | 3. Compile the latest example code:
41 |
42 | ant
43 |
44 | 4. Run an example! Decide whether you want to run an example on the small local
45 | Hadoop instance or on Amazon Elastic MapReduce.
46 |
47 | Run this command to see your options:
48 |
49 | bin/ccRunExample
50 |
51 | then go ahead and run an example:
52 |
53 | bin/ccRunExample LocalHadoop ExampleMetadataDomainPageCount
54 |
55 | then look at the code:
56 |
57 | nano src/java/org/commoncrawl/examples/ExampleMetadataDomainPageCount.java
58 |
59 | Note: You need to have your own Amazon S3 bucket to run Amazon Elastic
60 | MapReduce jobs.
61 |
62 | -----------------------------------
63 |
64 | You can read all of this again in $HOME/commoncrawl-examples/README-Amazon-AMI.
65 |
66 | Have fun!
67 |
68 |
--------------------------------------------------------------------------------
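The README's quick-start path goes through bin/ccRunExample, but the same wiring can be written as a plain Hadoop 0.20 (mapred API) driver. The sketch below is illustrative only and is not one of the bundled examples: the class and mapper names are invented, the credential values are placeholders for the two lines in ~/.awssecret, and it assumes the metadata files are SequenceFiles of Text keys (URLs) and Text values (JSON metadata), which is the convention the bundled metadata examples appear to follow.

    import java.io.IOException;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.conf.Configured;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapred.FileInputFormat;
    import org.apache.hadoop.mapred.FileOutputFormat;
    import org.apache.hadoop.mapred.JobClient;
    import org.apache.hadoop.mapred.JobConf;
    import org.apache.hadoop.mapred.MapReduceBase;
    import org.apache.hadoop.mapred.Mapper;
    import org.apache.hadoop.mapred.OutputCollector;
    import org.apache.hadoop.mapred.Reporter;
    import org.apache.hadoop.mapred.SequenceFileInputFormat;
    import org.apache.hadoop.mapred.TextOutputFormat;
    import org.apache.hadoop.mapred.lib.LongSumReducer;
    import org.apache.hadoop.util.Tool;
    import org.apache.hadoop.util.ToolRunner;

    public class QuickStartSketch extends Configured implements Tool {

      // Counts metadata records; key is the page URL, value is the JSON metadata.
      public static class RecordCountMapper extends MapReduceBase
          implements Mapper<Text, Text, Text, LongWritable> {
        public void map(Text key, Text value,
                        OutputCollector<Text, LongWritable> output, Reporter reporter)
            throws IOException {
          output.collect(new Text("pages"), new LongWritable(1));
        }
      }

      public int run(String[] args) throws Exception {
        JobConf job = new JobConf(getConf(), QuickStartSketch.class);

        // Step 1 of the README: the two lines of ~/.awssecret (placeholders here).
        job.set("fs.s3n.awsAccessKeyId",     "YOUR_ACCESS_ID");
        job.set("fs.s3n.awsSecretAccessKey", "YOUR_SECRET_KEY");

        // One segment's metadata files, as listed in step 1 of the README.
        FileInputFormat.addInputPath(job, new Path(
            "s3n://aws-publicdatasets/common-crawl/parse-output/segment/1341690167474/metadata-*"));
        FileOutputFormat.setOutputPath(job, new Path(args[0]));

        job.setInputFormat(SequenceFileInputFormat.class);
        job.setOutputFormat(TextOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        job.setMapperClass(RecordCountMapper.class);
        job.setReducerClass(LongSumReducer.class);

        JobClient.runJob(job);
        return 0;
      }

      public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new Configuration(), new QuickStartSketch(), args));
      }
    }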
/VERSION:
--------------------------------------------------------------------------------
1 | 1.0.1
2 |
--------------------------------------------------------------------------------
/bin/ccCopyToHDFS:
--------------------------------------------------------------------------------
1 | #!/bin/bash -aeu
2 |
3 | usage() {
4 | echo ""
5 | echo "$(basename $0) ( Save To Path [ # of Files to Download ] )"
6 | echo ""
7 | echo "i.e. $(basename $0) hdfs://localhost/common-crawl 25"
8 | echo ""
9 | exit 1
10 | }
11 |
12 | echo
13 | echo "-----------------------------------------------------------------"
14 | echo "* "
15 | echo "* Common Crawl Data Downloader"
16 | echo "* "
17 | echo "-----------------------------------------------------------------"
18 |
19 | if [ ! -r ~/.awssecret ]; then
20 | echo ""
21 | echo "ERROR: Please create a readable '.awssecret' file in your home directory."
22 | echo ""
23 | echo "The first line should be your AWS Access ID."
24 | echo ""
25 | echo "The second line should be your AWS Secret Key."
26 | echo ""
27 | exit 1
28 | fi
29 |
30 | AWS_ACCESS_ID=$(head -n 1 ~/.awssecret)
31 | AWS_SECRET_KEY=$(tail -n 1 ~/.awssecret)
32 |
33 | CC_PATH="s3n://aws-publicdatasets/common-crawl/parse-output"
34 |
35 | if [ $# -le 0 ]; then
36 | usage
37 | exit 0
38 | fi
39 |
40 | if [ $# -ge 1 ]; then
41 | OUTPUT_PATH="$1"
42 | fi
43 |
44 | if [ $# -ge 2 ]; then
45 | FILE_LIMIT="$2"
46 | FILE_LIMIT_PARAM="-filelimit $2"
47 | else
48 | FILE_LIMIT="-1"
49 | FILE_LIMIT_PARAM=""
50 | fi
51 |
52 | echo "INFO: Downloading list of valid segments"
53 | rm -f /tmp/cc-valid.txt
54 |
55 | hadoop fs -get ${CC_PATH}/valid_segments.txt /tmp/cc-valid.txt
56 |
57 | if [ ! -s /tmp/cc-valid.txt ]; then
58 | echo "ERROR: Unable to download valid segments list"
59 | exit 1
60 | fi
61 |
62 | while read SEGMENT_ID; do
63 | SOURCE_PATH="${CC_PATH}/segment/${SEGMENT_ID}"
64 | TARGET_PATH="${OUTPUT_PATH}/segment/${SEGMENT_ID}"
65 | echo "INFO: Running copy command for segment ${SEGMENT_ID}"
66 | echo "
67 | hadoop distcp \\
68 | -Dfs.s3n.awsAccessKeyId=\"**********\" -Dfs.s3n.awsSecretAccessKey=\"**********\" \\
69 | -i ${FILE_LIMIT_PARAM} \\
70 | ${SOURCE_PATH} \\
71 | ${TARGET_PATH}
72 | "
73 | hadoop distcp \
74 | -Dfs.s3n.awsAccessKeyId="${AWS_ACCESS_ID}" -Dfs.s3n.awsSecretAccessKey="${AWS_SECRET_KEY}" \
75 | -i ${FILE_LIMIT_PARAM} \
76 | ${SOURCE_PATH} \
77 | ${TARGET_PATH}
78 |
79 | if [ ${FILE_LIMIT} -gt 0 ]; then
80 | break
81 | fi
82 |
83 | done < /tmp/cc-valid.txt
84 |
85 |
--------------------------------------------------------------------------------
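ccCopyToHDFS reads valid_segments.txt and then runs one distcp per segment. For anyone scripting that step from Java instead of bash, the sketch below (not part of this repository; the class name is invented) reads the same list with the Hadoop FileSystem API and prints each source/target pair the script would hand to distcp. It assumes the s3n credentials are already present in the Hadoop configuration.

    import java.io.BufferedReader;
    import java.io.InputStreamReader;
    import java.net.URI;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    public class ListSegmentCopies {
      public static void main(String[] args) throws Exception {
        String ccPath = "s3n://aws-publicdatasets/common-crawl/parse-output";
        String target = args[0];   // e.g. hdfs://localhost/common-crawl

        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(URI.create(ccPath), conf);

        BufferedReader reader = new BufferedReader(new InputStreamReader(
            fs.open(new Path(ccPath + "/valid_segments.txt"))));

        String segmentId;
        while ((segmentId = reader.readLine()) != null) {
          segmentId = segmentId.trim();
          if (segmentId.length() == 0) continue;
          System.out.println(ccPath + "/segment/" + segmentId
              + "  ->  " + target + "/segment/" + segmentId);
        }
        reader.close();
      }
    }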
/bin/ccListInvalidSegments:
--------------------------------------------------------------------------------
1 | #!/bin/bash -aeu
2 |
3 | echo ""
4 | echo "> gathering valid segments"
5 | hadoop fs -ls s3n://aws-publicdatasets/common-crawl/parse-output/valid_segments | cut -d" " -f 17- | sort > /tmp/cc-valid.txt
6 | sed -i "s/valid_segments/segment/" /tmp/cc-valid.txt
7 |
8 | echo "> gathering all segments published"
9 | hadoop fs -ls s3n://aws-publicdatasets/common-crawl/parse-output/segment | cut -d" " -f 17- | sort > /tmp/cc-all.txt
10 | echo ""
11 |
12 | echo "* "
13 | echo "* List of Invalid Segments"
14 | echo "* "
15 | diff -b -w /tmp/cc-all.txt /tmp/cc-valid.txt | fgrep "segment" | sed "s/< /hadoop fs -rmr s3n:\/\/aws-publicdatasets/"
16 |
17 | #rm -f /tmp/cc-all.txt
18 | #rm -f /tmp/cc-valid.txt
19 |
20 |
--------------------------------------------------------------------------------
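ccListInvalidSegments does its comparison with 'hadoop fs -ls', sort, and diff. The equivalent in Java is a small set difference over FileSystem.listStatus results. The sketch below is illustrative only (the class name is invented) and assumes s3n credentials are already configured.

    import java.net.URI;
    import java.util.HashSet;
    import java.util.Set;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileStatus;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    public class ListInvalidSegments {
      public static void main(String[] args) throws Exception {
        String base = "s3n://aws-publicdatasets/common-crawl/parse-output";

        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(URI.create(base), conf);

        // Segment IDs that have been marked valid.
        Set<String> valid = new HashSet<String>();
        for (FileStatus f : fs.listStatus(new Path(base + "/valid_segments"))) {
          valid.add(f.getPath().getName());
        }

        // Anything under /segment that is not in the valid set is invalid.
        for (FileStatus f : fs.listStatus(new Path(base + "/segment"))) {
          if (!valid.contains(f.getPath().getName())) {
            System.out.println("invalid: " + f.getPath());
          }
        }
      }
    }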
/bin/ccRunExample:
--------------------------------------------------------------------------------
1 | #!/bin/bash -aeu
2 |
3 | BASE_PATH=`dirname $0`"/.."
4 | BASE_PATH=`cd ${BASE_PATH}; pwd`
5 |
6 | VERSION="$(cat ${BASE_PATH}/VERSION)"
7 |
8 | HDFS_LOCAL_HOSTNAME="localhost"
9 | MAIN_JAR="commoncrawl-examples-${VERSION}.jar"
10 | EXAMPLES_PATH="src/java/org/commoncrawl/examples"
11 | EXAMPLES_PKG="org.commoncrawl.examples"
12 |
13 | LOCAL_JAR_PATH="${BASE_PATH}/dist/lib"
14 |
15 | usage() {
16 | echo ""
17 | echo "$(basename $0) [ LocalHadoop | AmazonEMR ] [ ExampleName ] ( S3Bucket )"
18 | echo ""
19 | echo "Please pass in one of the following examples: "
20 | echo ""
21 | ls ${BASE_PATH}/${EXAMPLES_PATH} | sed 's/\.java$//'
22 | echo ""
23 | exit 1
24 | }
25 |
26 | echo
27 | echo "-----------------------------------------------------------------"
28 | echo "* "
29 | echo "* Common Crawl Example Library Runner"
30 | echo "* "
31 | echo "-----------------------------------------------------------------"
32 |
33 | if [ ! -r ~/.awssecret ]; then
34 | echo ""
35 | echo "ERROR: Please create a readable '.awssecret' file in your home directory."
36 | echo ""
37 | echo "The first line should be your AWS Access ID."
38 | echo ""
39 | echo "The second line should be your AWS Secret Key."
40 | echo ""
41 | exit 1
42 | fi
43 |
44 | AWS_ACCESS_ID=$(head -n 1 ~/.awssecret)
45 | AWS_SECRET_KEY=$(tail -n 1 ~/.awssecret)
46 |
47 | if [ ! -e ${LOCAL_JAR_PATH}/${MAIN_JAR} ]; then
48 | echo ""
49 | echo "ERROR: Please run the command 'ant' to build '${MAIN_JAR}' before attempting to run an example."
50 | echo ""
51 | exit 1
52 | fi
53 |
54 | # run the example provided on the command line
55 | if [ $# -lt 2 ]; then
56 | usage
57 | fi
58 |
59 | RUN_TYPE="$1"
60 | EXAMPLE="$2"
61 |
62 | # run the selected example
63 | if [ ! -f ${BASE_PATH}/${EXAMPLES_PATH}/${EXAMPLE}.java ]; then
64 | echo ""
65 | echo "ERROR: Cannot run example '${EXAMPLE}' - not found."
66 | echo ""
67 | echo "Please run one of the following:"
68 | echo ""
69 | ls ${BASE_PATH}/${EXAMPLES_PATH} | sed 's/\.java$//'
70 | echo ""
71 | exit 1
72 | fi
73 |
74 | if [ "${RUN_TYPE}" = "AmazonEMR" ]; then
75 |
76 | if [ $# -lt 3 ]; then
77 | echo ""
78 | echo "ERROR: To run an Amazon Elastic MapReduce job, you must supply an S3 bucket "
79 | echo " that you have permissions to write files to."
80 | echo ""
81 | usage
82 | fi
83 |
84 | S3_USER_BUCKET="$3"
85 |
86 | EMR_JAR_PATH="${S3_USER_BUCKET}/emr/jars"
87 | EMR_LOG_PATH="${S3_USER_BUCKET}/emr/logs"
88 | EMR_OUTPUT_PATH="${S3_USER_BUCKET}/emr/output/${EXAMPLE}"
89 |
90 | echo "* "
91 | echo "* Uploading JAR + Config to S3 '${EMR_JAR_PATH}'"
92 | echo "* "
93 | echo aws put ${EMR_JAR_PATH}/${MAIN_JAR} ${LOCAL_JAR_PATH}/${MAIN_JAR}
94 | aws put ${EMR_JAR_PATH}/${MAIN_JAR} ${LOCAL_JAR_PATH}/${MAIN_JAR}
95 | echo ""
96 |
97 | LOCAL_OUTPUT_PATH="${BASE_PATH}/output/${EXAMPLE}.tsv"
98 |
99 | # We've found that a single, high-memory instance works well for the master,
100 | # which runs the JobTracker
101 | MASTER_TYPE="m1.large" # consider using MASTER_TYPE="m2.4xlarge"
102 | CORE_TYPE="m1.large" # consider using CORE_TYPE="m2.2xlarge"
103 |
104 | # We've found the 'c1.xlarge' instance type to be most efficient for EMR
105 | # jobs - though we are open to suggestions!
106 | TASK_TYPE="c1.xlarge" # EMR = +$0.12 per instance hour
107 |
108 | INSTANCES=4
109 |
110 | BID="0.08"
111 |
112 | TIMESTAMP=$(date +%Y%m%d_%H%M%S)
113 | JOBNAME="Common_Crawl_${EXAMPLE}__${TIMESTAMP}"
114 |
115 | echo "-----------------------------------------------------------------"
116 | echo "* "
117 | echo "* Running Example '${EXAMPLE}'"
118 | echo "* "
119 | echo "* Starting Amazon Elastic MapReduce Job"
120 | echo "* "
121 | echo "-----------------------------------------------------------------"
122 |
123 | # Add in this option to specify a certain number of reducers:
124 | #
125 | # --arg "-Dmapred.reduce.tasks=${REDUCERS}" \
126 | #
127 |
128 | # if the line breaks don't work, join the following lines and remove all '\'
129 | echo \
130 | /opt/aws/emr/elastic-mapreduce --create --plain-output --name "${JOBNAME}" --ami-version="2.1.1" --hadoop-version="0.20.205" \
131 | --jar "s3n://${EMR_JAR_PATH}/${MAIN_JAR}" --step-name "Run_${EXAMPLE}" \
132 | --log-uri "s3n://${EMR_LOG_PATH}" \
133 | --main-class "${EXAMPLES_PKG}.${EXAMPLE}" \
134 | --access-id "********" --private-key "********" \
135 | --arg "-Dmapreduce.job.split.metainfo.maxsize=-1" \
136 | --arg "-Dmapred.max.map.failures.percent=50" \
137 | --arg "s3n://${EMR_OUTPUT_PATH}" \
138 | --instance-group master --instance-type "${MASTER_TYPE}" --instance-count 1 \
139 | --instance-group core --instance-type "${CORE_TYPE}" --instance-count 1 \
140 | --instance-group task --instance-type "${TASK_TYPE}" --instance-count ${INSTANCES} --bid-price ${BID}
141 | echo ""
142 |
143 | set +e
144 |
145 | THIS_PID=$$
146 |
147 | EMR_JOB_ID=$(/opt/aws/emr/elastic-mapreduce --create --plain-output --name "${JOBNAME}" --ami-version="2.1.1" --hadoop-version="0.20.205" \
148 | --jar "s3n://${EMR_JAR_PATH}/${MAIN_JAR}" --step-name "Run_${EXAMPLE}" \
149 | --log-uri "s3n://${EMR_LOG_PATH}" \
150 | --main-class "${EXAMPLES_PKG}.${EXAMPLE}" \
151 | --access-id "${AWS_ACCESS_ID}" --private-key "${AWS_SECRET_KEY}" \
152 | --arg "-Dmapreduce.job.split.metainfo.maxsize=-1" \
153 | --arg "-Dmapred.max.map.failures.percent=50" \
154 | --arg "s3n://${EMR_OUTPUT_PATH}" \
155 | --instance-group master --instance-type "${MASTER_TYPE}" --instance-count 1 \
156 | --instance-group core --instance-type "${CORE_TYPE}" --instance-count 1 \
157 | --instance-group task --instance-type "${TASK_TYPE}" --instance-count ${INSTANCES} --bid-price ${BID})
158 |
159 | RC=$?
160 |
161 | set -e
162 |
163 | if [ $RC -ne 0 ]; then
164 | echo "WARNING: Amazon EMR returned non-zero status code: $RC"
165 | fi
166 |
167 | if [ -z "${EMR_JOB_ID}" ]; then
168 | echo "WARNING: Unable to determine EMR Job ID"
169 | EMR_JOB_ID="[Amazon EMR Job ID]"
170 | fi
171 |
172 | echo ""
173 | echo "-----------------------------------------------------------------"
174 | echo "* "
175 | echo "* Your Amazon Elastic MapReduce job has been launched. "
176 | echo "* "
177 | echo "* Please look for '${JOBNAME}'"
178 | echo "* in your AWS Web Console."
179 | echo "* "
180 | echo "* Once the job has completed, run the following command to view "
181 | echo "* log files: "
182 | echo "* "
183 | echo "* hadoop dfs -get s3n://${EMR_LOG_PATH}/${EMR_JOB_ID} ${BASE_PATH}/logs"
184 | echo "* "
185 | echo "* and the following command to pull down the output files: "
186 | echo "* "
187 | echo "* hadoop fs -getmerge s3n://${EMR_OUTPUT_PATH} ${LOCAL_OUTPUT_PATH}"
188 | echo "* "
189 | echo "-----------------------------------------------------------------"
190 |
191 | mkdir -p ${BASE_PATH}/logs
192 |
193 | exit ${RC}
194 |
195 | fi
196 |
197 | if [ "${RUN_TYPE}" = "LocalHadoop" ]; then
198 |
199 | MAPRED_OUTPUT_PATH="hdfs://${HDFS_LOCAL_HOSTNAME}/user/${USER}/output/${EXAMPLE}"
200 | LOCAL_OUTPUT_PATH="${BASE_PATH}/output/${EXAMPLE}.tsv"
201 |
202 | echo "* "
203 | echo "* Running Example '${EXAMPLE}'"
204 | echo "* "
205 | echo "-----------------------------------------------------------------"
206 | echo hadoop jar ${LOCAL_JAR_PATH}/${MAIN_JAR} ${EXAMPLES_PKG}.${EXAMPLE} \
207 | ${MAPRED_OUTPUT_PATH} ${BASE_PATH}/conf/mapred.xml
208 | echo ""
209 |
210 | hadoop jar ${LOCAL_JAR_PATH}/${MAIN_JAR} ${EXAMPLES_PKG}.${EXAMPLE} \
211 | -Dfs.s3.awsAccessKeyId="${AWS_ACCESS_ID}" -Dfs.s3.awsSecretAccessKey="${AWS_SECRET_KEY}" \
212 | -Dfs.s3n.awsAccessKeyId="${AWS_ACCESS_ID}" -Dfs.s3n.awsSecretAccessKey="${AWS_SECRET_KEY}" \
213 | ${MAPRED_OUTPUT_PATH} ${BASE_PATH}/conf/mapred.xml
214 |
215 | RC=$?
216 |
217 | if [ $RC -ne 0 ]; then
218 | echo "-----------------------------------------------------------------"
219 | echo "* "
220 | echo "* There was a problem running '${EXAMPLE}'."
221 | echo "* "
222 | echo "* Please contact 'info@commoncrawl.org'."
223 | echo "* "
224 | echo "-----------------------------------------------------------------"
225 | exit $RC
226 | fi
227 |
228 | echo "-----------------------------------------------------------------"
229 | echo "* "
230 | echo "* Your MapReduce job '${EXAMPLE}' completed successfully!"
231 | echo "* "
232 | echo "* Copying output to the local file system:"
233 | echo "* "
234 | echo
235 | rm -f ${LOCAL_OUTPUT_PATH}
236 | echo hadoop fs -getmerge ${MAPRED_OUTPUT_PATH} ${LOCAL_OUTPUT_PATH}
237 | hadoop fs -getmerge ${MAPRED_OUTPUT_PATH} ${LOCAL_OUTPUT_PATH}
238 | echo
239 | echo "* "
240 | echo "* You can see the results of your job here:"
241 | echo "* "
242 | echo "* ${LOCAL_OUTPUT_PATH}"
243 | echo "* "
244 | echo "* Here are the first 15 lines of output:"
245 | echo "* "
246 | echo "-------------------------------------------------------------"
247 | echo
248 | head -n 15 ${LOCAL_OUTPUT_PATH}
249 | echo
250 |
251 | exit 0
252 |
253 | fi
254 |
255 |
--------------------------------------------------------------------------------
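For the LocalHadoop case above, ccRunExample ends up calling 'hadoop jar' with the example class name, -D properties for the S3 credentials, an output path, and the path to conf/mapred.xml. The sketch below shows how a ToolRunner-based driver generally picks those pieces up: GenericOptionsParser moves the -D options into the Configuration and leaves the positional arguments for run(). It is a generic illustration, not the code of the bundled examples, and DriverArgsSketch is an invented name.

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.conf.Configured;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.util.Tool;
    import org.apache.hadoop.util.ToolRunner;

    public class DriverArgsSketch extends Configured implements Tool {

      public int run(String[] args) throws Exception {
        Configuration conf = getConf();        // already holds the -D settings

        String outputPath = args[0];           // e.g. hdfs://localhost/user/<user>/output/<Example>
        if (args.length > 1) {
          conf.addResource(new Path(args[1])); // e.g. <base path>/conf/mapred.xml
        }

        System.out.println("Job output will go to " + outputPath);
        return 0;
      }

      public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new Configuration(), new DriverArgsSketch(), args));
      }
    }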
/build.properties:
--------------------------------------------------------------------------------
1 | #
2 | # Common Crawl Examples - Build Configuration Parameters
3 | #
4 |
5 | # Path to Hadoop libraries
6 | hadoop.path=/usr/share/hadoop
7 |
8 |
--------------------------------------------------------------------------------
/build.xml:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/java/org/commoncrawl/compressors/gzip/GzipCompressorInputStream.java:
--------------------------------------------------------------------------------
40 | * {@link java.util.zip.GZIPInputStream} doesn't decompress concatenated .gz
41 | * files: it stops after the first member and silently ignores the rest.
42 | * It doesn't leave the read position to point to the beginning of the next
43 | * member, which makes it difficult to work around the lack of concatenation
44 | * support.
45 | *
46 | * Instead of using GZIPInputStream, this class has its own .gz
47 | * container format decoder. The actual decompression is done with
48 | * {@link java.util.zip.Inflater}.
49 | */
50 | public class GzipCompressorInputStream extends CompressorInputStream {
51 | // Header flags
52 | // private static final int FTEXT = 0x01; // Uninteresting for us
53 | private static final int FHCRC = 0x02;
54 | private static final int FEXTRA = 0x04;
55 | private static final int FNAME = 0x08;
56 | private static final int FCOMMENT = 0x10;
57 | private static final int FRESERVED = 0xE0;
58 |
59 | // Compressed input stream, possibly wrapped in a BufferedInputStream
60 | private final InputStream in;
61 |
62 | // True if decompressing multimember streams.
63 | private final boolean decompressConcatenated;
64 |
65 | // Buffer to hold the input data
66 | private final byte[] buf = new byte[8192];
67 |
68 | // Amount of data in buf.
69 | private int bufUsed = 0;
70 |
71 | // Decompressor
72 | private Inflater inf = new Inflater(true);
73 |
74 | // CRC32 from uncompressed data
75 | private CRC32 crc = new CRC32();
76 |
77 | private int memberSize;
78 |
79 | // True once the end of a member has been reached and
80 | // 'decompressConcatenated' is false.
81 | private boolean stoppedForEndOfMember = false;
82 |
83 | // True once the end of stream has been reached.
84 | private boolean endOfStream = false;
85 |
86 | /**
87 | * Constructs a new input stream that decompresses gzip-compressed data
88 | * from the specified input stream.
89 | *
90 | * This is equivalent to
91 | * GzipCompressorInputStream(inputStream, false) and thus
92 | * will not decompress concatenated .gz files.
93 | *
94 | * @param inputStream the InputStream from which this object should
95 | * be created
96 | *
97 | * @throws IOException if the stream could not be created
98 | */
99 | public GzipCompressorInputStream(InputStream inputStream)
100 | throws IOException {
101 | this(inputStream, false);
102 | }
103 |
104 | /**
105 | * Constructs a new input stream that decompresses gzip-compressed data
106 | * from the specified input stream.
107 | *
108 | * If Always returns false to indicate that ARC files are not splittable. ARC files are stored in 100MB files, meaning they will be stored in at
37 | * most 3 blocks (2 blocks on Hadoop systems with 128MB block size). Creates an empty ARC record. Parses the ARC record header and payload (content) from a stream. Parses and sets the ARC record header fields. Currently, this method expects the ARC record header string to contain
156 | * the following fields, in order, separated by space:
157 | * decompressConcatenated
is {@code false}:
109 | * This decompressor might read more input than it will actually use.
110 | * If inputStream supports mark and
111 | * reset, then the input position will be adjusted
112 | * so that it is right after the last byte of the compressed stream.
113 | * If mark isn't supported, the input position will be
114 | * undefined.
115 | *
116 | * @param inputStream the InputStream from which this object should
117 | * be created
118 | * @param decompressConcatenated
119 | * if true, decompress until the end of the input;
120 | * if false, stop after the first .gz member
121 | *
122 | * @throws IOException if the stream could not be created
123 | */
124 | public GzipCompressorInputStream(InputStream inputStream,
125 | boolean decompressConcatenated)
126 | throws IOException {
127 | // Mark support is strictly needed for concatenated files only,
128 | // but it's simpler if it is always available.
129 | if (inputStream.markSupported()) {
130 | in = inputStream;
131 | } else {
132 | in = new BufferedInputStream(inputStream);
133 | }
134 |
135 | this.decompressConcatenated = decompressConcatenated;
136 | init(true);
137 | }
138 |
139 | private boolean init(boolean isFirstMember) throws IOException {
140 | assert isFirstMember || decompressConcatenated;
141 |
142 | // Check the magic bytes without a possibility of EOFException.
143 | int magic0 = in.read();
144 | int magic1 = in.read();
145 |
146 | // If end of input was reached after decompressing at least
147 | // one .gz member, we have reached the end of the file successfully.
148 | if (magic0 == -1 && !isFirstMember) {
149 | endOfStream = true;
150 | return false;
151 | }
152 |
153 | if (magic0 != 31 || magic1 != 139) {
154 | throw new IOException(isFirstMember
155 | ? "Input is not in the .gz format"
156 | : "Garbage after a valid .gz stream");
157 | }
158 |
159 | // Parsing the rest of the header may throw EOFException.
160 | DataInputStream inData = new DataInputStream(in);
161 | int method = inData.readUnsignedByte();
162 | if (method != 8) {
163 | throw new IOException("Unsupported compression method "
164 | + method + " in the .gz header");
165 | }
166 |
167 | int flg = inData.readUnsignedByte();
168 | if ((flg & FRESERVED) != 0) {
169 | throw new IOException(
170 | "Reserved flags are set in the .gz header");
171 | }
172 |
173 | inData.readInt(); // mtime, ignored
174 | inData.readUnsignedByte(); // extra flags, ignored
175 | inData.readUnsignedByte(); // operating system, ignored
176 |
177 | // Extra field, ignored
178 | if ((flg & FEXTRA) != 0) {
179 | int xlen = inData.readUnsignedByte();
180 | xlen |= inData.readUnsignedByte() << 8;
181 |
182 | // This isn't as efficient as calling in.skip would be,
183 | // but it's lazier to handle unexpected end of input this way.
184 | // Most files don't have an extra field anyway.
185 | while (xlen-- > 0) {
186 | inData.readUnsignedByte();
187 | }
188 | }
189 |
190 | // Original file name, ignored
191 | if ((flg & FNAME) != 0) {
192 | readToNull(inData);
193 | }
194 |
195 | // Comment, ignored
196 | if ((flg & FCOMMENT) != 0) {
197 | readToNull(inData);
198 | }
199 |
200 | // Header "CRC16" which is actually a truncated CRC32 (which isn't
201 | // as good as real CRC16). I don't know if any encoder implementation
202 | // sets this, so it's not worth trying to verify it. GNU gzip 1.4
203 | // doesn't support this field, but zlib seems to be able to at least
204 | // skip over it.
205 | if ((flg & FHCRC) != 0) {
206 | inData.readShort();
207 | }
208 |
209 | // Reset
210 | inf.reset();
211 | crc.reset();
212 | memberSize = 0;
213 |
214 | return true;
215 | }
216 |
217 | private void readToNull(DataInputStream inData) throws IOException {
218 | while (inData.readUnsignedByte() != 0x00) {}
219 | }
220 |
221 | /** {@inheritDoc} */
222 | @Override
223 | public int read() throws IOException {
224 | byte[] buf = new byte[1];
225 | return read(buf, 0, 1) == -1 ? -1 : (buf[0] & 0xFF);
226 | }
227 |
228 | /**
229 | * {@inheritDoc}
230 | *
231 | * @since 1.1
232 | */
233 | @Override
234 | public int read(byte[] b, int off, int len) throws IOException {
235 |
236 | if (stoppedForEndOfMember || endOfStream) {
237 | return -1;
238 | }
239 |
240 | int size = 0;
241 |
242 | while (len > 0) {
243 | if (inf.needsInput()) {
244 | // Remember the current position because we may need to
245 | // rewind after reading too much input.
246 | in.mark(buf.length);
247 |
248 | bufUsed = in.read(buf);
249 | if (bufUsed == -1) {
250 | throw new EOFException();
251 | }
252 |
253 | inf.setInput(buf, 0, bufUsed);
254 | }
255 |
256 | int ret;
257 | try {
258 | ret = inf.inflate(b, off, len);
259 | } catch (DataFormatException e) {
260 | throw new IOException("Gzip-compressed data is corrupt");
261 | }
262 |
263 | crc.update(b, off, ret);
264 | memberSize += ret;
265 | off += ret;
266 | len -= ret;
267 | size += ret;
268 | count(ret);
269 |
270 | if (inf.finished()) {
271 | // We may have read too many bytes. Rewind the read
272 | // position to match the actual amount used.
273 | //
274 | // NOTE: The "if" is there just in case. Since we used
275 | // in.mark earlier, it should always skip enough.
276 | in.reset();
277 |
278 | int skipAmount = bufUsed - inf.getRemaining();
279 | if (in.skip(skipAmount) != skipAmount) {
280 | throw new IOException();
281 | }
282 |
283 | bufUsed = 0;
284 |
285 | DataInputStream inData = new DataInputStream(in);
286 |
287 | // CRC32
288 | long crcStored = 0;
289 | for (int i = 0; i < 4; ++i) {
290 | crcStored |= (long)inData.readUnsignedByte() << (i * 8);
291 | }
292 |
293 | if (crcStored != crc.getValue()) {
294 | throw new IOException("Gzip-compressed data is corrupt "
295 | + "(CRC32 error)");
296 | }
297 |
298 | // Uncompressed size modulo 2^32 (ISIZE in the spec)
299 | int isize = 0;
300 | for (int i = 0; i < 4; ++i) {
301 | isize |= inData.readUnsignedByte() << (i * 8);
302 | }
303 |
304 | if (isize != memberSize) {
305 | throw new IOException("Gzip-compressed data is corrupt "
306 | + "(uncompressed size mismatch)");
307 | }
308 |
309 |
310 | if (!decompressConcatenated) {
311 | stoppedForEndOfMember = true;
312 | }
313 |
314 | // See if this is the end of the file.
315 | endOfStream = !init(false);
316 |
317 | if (stoppedForEndOfMember || endOfStream) {
318 | return size == 0 ? -1 : size;
319 | }
320 | }
321 | }
322 |
323 | return size;
324 | }
325 |
326 | /**
327 | * Checks if the signature matches what is expected for a .gz file.
328 | *
329 | * @param signature the bytes to check
330 | * @param length the number of bytes to check
331 | * @return true if this is a .gz stream, false otherwise
332 | *
333 | * @since 1.1
334 | */
335 | public static boolean matches(byte[] signature, int length) {
336 |
337 | if (length < 2) {
338 | return false;
339 | }
340 |
341 | if (signature[0] != 31) {
342 | return false;
343 | }
344 |
345 | if (signature[1] != -117) {
346 | return false;
347 | }
348 |
349 | return true;
350 | }
351 |
352 | /**
353 | * Closes the input stream (unless it is System.in).
354 | *
355 | * @since 1.2
356 | */
357 | @Override
358 | public void close() throws IOException {
359 | if (inf != null) {
360 | inf.end();
361 | inf = null;
362 | }
363 |
364 | if (this.in != System.in) {
365 | this.in.close();
366 | }
367 | }
368 |
369 | /**
370 | * Explicitly instructs the stream to allow an additional concatenated
371 | * member to be read.
372 | *
373 | * @since 1.x.x
374 | */
375 | public boolean nextMember() {
376 |
377 | if (endOfStream)
378 | return false;
379 |
380 | stoppedForEndOfMember = false;
381 |
382 | return true;
383 | }
384 | }
385 |
--------------------------------------------------------------------------------
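A short usage sketch for the class above (not part of the repository): reading a concatenated .gz file, such as a Common Crawl ARC file, one gzip member at a time by constructing the stream with decompressConcatenated set to false and calling nextMember() between members. The class name and input path are placeholders.

    import java.io.FileInputStream;
    import java.io.InputStream;

    import org.commoncrawl.compressors.gzip.GzipCompressorInputStream;

    public class ReadMembers {
      public static void main(String[] args) throws Exception {
        InputStream raw = new FileInputStream(args[0]);   // e.g. a local *.arc.gz file
        GzipCompressorInputStream gzip = new GzipCompressorInputStream(raw, false);

        byte[] buffer = new byte[8192];
        int member = 0;

        do {
          member++;
          long bytes = 0;
          int n;
          // A return of -1 here means "end of the current member", not end of file.
          while ((n = gzip.read(buffer, 0, buffer.length)) != -1) {
            bytes += n;
          }
          System.out.println("member " + member + ": " + bytes + " uncompressed bytes");
        } while (gzip.nextMember());                      // false once the file is exhausted

        gzip.close();
      }
    }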
/src/java/org/commoncrawl/examples/ExampleArcMicroformat.java:
--------------------------------------------------------------------------------
1 | package org.commoncrawl.examples;
2 |
3 | // Java classes
4 | import java.lang.IllegalArgumentException;
5 | import java.lang.Integer;
6 | import java.lang.Math;
7 | import java.lang.OutOfMemoryError;
8 | import java.io.BufferedReader;
9 | import java.io.ByteArrayInputStream;
10 | import java.io.DataOutputStream;
11 | import java.io.File;
12 | import java.io.FileReader;
13 | import java.io.IOException;
14 | import java.net.URI;
15 | import java.util.Arrays;
16 |
17 | // log4j classes
18 | import org.apache.log4j.Logger;
19 |
20 | // Hadoop classes
21 | import org.apache.hadoop.conf.Configured;
22 | import org.apache.hadoop.conf.Configuration;
23 | import org.apache.hadoop.fs.FSDataOutputStream;
24 | import org.apache.hadoop.fs.FileStatus;
25 | import org.apache.hadoop.fs.FileSystem;
26 | import org.apache.hadoop.fs.Path;
27 | import org.apache.hadoop.fs.PathFilter;
28 | import org.apache.hadoop.io.LongWritable;
29 | import org.apache.hadoop.io.Text;
30 | import org.apache.hadoop.mapred.FileInputFormat;
31 | import org.apache.hadoop.mapred.FileOutputFormat;
32 | import org.apache.hadoop.mapred.InputSplit;
33 | import org.apache.hadoop.mapred.JobClient;
34 | import org.apache.hadoop.mapred.JobConf;
35 | import org.apache.hadoop.mapred.Mapper;
36 | import org.apache.hadoop.mapred.MapReduceBase;
37 | import org.apache.hadoop.mapred.OutputCollector;
38 | import org.apache.hadoop.mapred.Reporter;
39 | import org.apache.hadoop.mapred.TextOutputFormat;
40 | import org.apache.hadoop.mapred.lib.LongSumReducer;
41 | import org.apache.hadoop.util.Progressable;
42 | import org.apache.hadoop.util.Tool;
43 | import org.apache.hadoop.util.ToolRunner;
44 |
45 | // Common Crawl classes
46 | import org.commoncrawl.hadoop.mapred.ArcInputFormat;
47 | import org.commoncrawl.hadoop.mapred.ArcRecord;
48 |
49 | // jsoup classes
50 | import org.jsoup.Jsoup;
51 | import org.jsoup.nodes.Document;
52 | import org.jsoup.nodes.Element;
53 | import org.jsoup.select.Elements;
54 |
55 | /**
56 | * An example showing how to analyze the Common Crawl ARC web content files.
57 | *
58 | * @author Chris Stephens RecordReader
for reading the arc file.
23 | *
24 | * @param split The InputSplit of the arc file to process.
25 | * @param job The job configuration.
26 | * @param reporter The progress reporter.
27 | */
28 | public RecordReader
158 | *
164 | *
For more information on the arc file format, see 166 | * {@link http://www.archive.org/web/researcher/ArcFileFormat.php}.
167 | * 168 | * @param arcRecordHeader The first line of an ARC file entry - the header 169 | * line for an ARC file item. 170 | */ 171 | public void setArcRecordHeader(String arcRecordHeader) 172 | throws IllegalArgumentException, ParseException { 173 | 174 | if (arcRecordHeader == null || arcRecordHeader.equals("")) 175 | throw new IllegalArgumentException("ARC v1 record header string is empty."); 176 | 177 | String[] metadata = arcRecordHeader.split(" "); 178 | 179 | if (metadata.length != 5) { 180 | LOG.info(" [ "+arcRecordHeader+" ] "); 181 | throw new IllegalArgumentException("ARC v1 record header must be 5 fields."); 182 | } 183 | 184 | SimpleDateFormat format = new SimpleDateFormat("yyyyMMddHHmmss"); 185 | 186 | this._url = metadata[0]; 187 | this._ipAddress = metadata[1]; 188 | this._archiveDate = format.parse(metadata[2]); 189 | this._contentType = metadata[3]; 190 | this._contentLength = (new Integer(metadata[4])).intValue(); 191 | } 192 | 193 | /** 194 | *Reads and sets the ARC record payload from an input stream.
195 | * 196 | * @param in An input stream positioned at the start of the ARC record payload. 197 | */ 198 | public void setPayload(InputStream in) 199 | throws IllegalArgumentException, ParseException, IOException { 200 | 201 | if (in == null) 202 | throw new IllegalArgumentException("ArcRecord cannot be created from NULL/missing input stream."); 203 | 204 | int bufferSize = this._contentLength; 205 | 206 | this._payload = new byte[bufferSize]; 207 | 208 | int n = in.read(this._payload, 0, this._payload.length); 209 | 210 | if (n < this._payload.length) { 211 | LOG.warn("Expecting "+bufferSize+" bytes in ARC record payload, found "+n+" bytes. Performing array copy."); 212 | this._payload = Arrays.copyOf(this._payload, n); 213 | } 214 | 215 | // After this, we should be at the end of this GZIP member. Let the 216 | // calling function verify the position of the stream. 217 | } 218 | 219 | public void addToPayload(byte[] data) { 220 | this.addToPayload(data, data.length); 221 | } 222 | 223 | public void addToPayload(byte[] data, int length) { 224 | 225 | LOG.warn("Content Length must have been incorrect - someone needed to add more data to the payload."); 226 | 227 | if (this._payload == null) { 228 | this._payload = Arrays.copyOf(data, length); 229 | } 230 | else { 231 | int i = this._payload.length; 232 | int n = this._payload.length + length; 233 | 234 | // resize the payload buffer 235 | this._payload = Arrays.copyOf(this._payload, n); 236 | 237 | // copy in the additional data 238 | System.arraycopy(data, 0, this._payload, i, length); 239 | } 240 | } 241 | 242 | /** 243 | * {@inheritDoc} 244 | */ 245 | public String toString() { 246 | return this._url + " - " + this._archiveDate.toString() + " - " + this._contentType; 247 | } 248 | 249 | /** 250 | * {@inheritDoc} 251 | */ 252 | public void write(DataOutput out) 253 | throws IOException { 254 | 255 | // write out ARC header info 256 | out.writeUTF(this._url); 257 | out.writeUTF(this._ipAddress); 258 | out.writeUTF(this._contentType); 259 | out.writeLong(this._archiveDate.getTime()); 260 | out.writeInt(this._contentLength); 261 | 262 | // write out the payload 263 | out.writeInt(this._payload.length); 264 | out.write(this._payload, 0, this._payload.length); 265 | } 266 | 267 | /** 268 | * {@inheritDoc} 269 | */ 270 | public void readFields(DataInput in) 271 | throws IOException { 272 | 273 | // read in ARC header info 274 | this._url = in.readUTF(); 275 | this._ipAddress = in.readUTF(); 276 | this._contentType = in.readUTF(); 277 | this._archiveDate = new Date(in.readLong()); 278 | this._contentLength = in.readInt(); 279 | 280 | // read in the payload 281 | int payloadLength = in.readInt(); 282 | 283 | // resize the payload buffer if necessary 284 | if (this._payload == null || this._payload.length != payloadLength) 285 | this._payload = new byte[payloadLength]; 286 | 287 | try { 288 | in.readFully(this._payload, 0, payloadLength); 289 | } 290 | catch (EOFException ex) { 291 | throw new IOException("End of input reached before payload was fully deserialized."); 292 | } 293 | 294 | // assume that if a new payload was loaded, HTTP response will need to be reparsed. 295 | this._httpResponse = null; 296 | } 297 | 298 | /** 299 | *Returns the full ARC record payload. This is usually a complete HTTP 300 | * response.
301 | * 302 | * @return The raw ARC record content. 303 | */ 304 | public byte[] getPayload() { 305 | return this._payload; 306 | } 307 | 308 | /** 309 | *Returns the URL from the ARC record header.
310 | * 311 | * @return The URL for this entry. 312 | */ 313 | public String getURL() { 314 | return this._url; 315 | } 316 | 317 | /** 318 | *Returns the IP address from the ARC record header.
319 | * 320 | * @return The IP address for this entry. 321 | */ 322 | public String getIpAddress() { 323 | return this._ipAddress; 324 | } 325 | 326 | /** 327 | *Returns the archive date from the ARC record header.
328 | * 329 | * @return The archive date for this entry. 330 | */ 331 | public Date getArchiveDate() { 332 | return this._archiveDate; 333 | } 334 | 335 | /** 336 | *Returns the MIME content type from the ARC record header.
337 | *Note: The MIME content type in the ARC record header is not necessarily the
338 | * same as the Content-Type
HTTP header inside the content body
339 | * (if one is present).
Returns the content length from the ARC record header.
349 | *Note: The content length in the ARC record header is not necessarily the
350 | * same as the Content-Length
HTTP header inside the content body
351 | * (if one is present).
Returns the HTTP status code.
361 | *If the payload could not be parsed as an HTTP response, returns -1.
362 | *Warning: if the payload has not yet been parsed as an HTTP response, 363 | * calling this function parses the full response. Parsing is only performed 364 | * once - parsed data is retained for subsequent calls.
365 | * 366 | * @return The HTTP status code. 367 | */ 368 | public int getHttpStatusCode() 369 | throws IOException, HttpException { 370 | 371 | HttpResponse httpResponse = this.getHttpResponse(); 372 | 373 | if (httpResponse == null) 374 | return -1; 375 | 376 | return httpResponse.getStatusLine().getStatusCode(); 377 | } 378 | 379 | /** 380 | *Returns an array of HTTP headers.
381 | *If the payload could not be parsed as an HTTP response, returns null
.
Warning: if the payload has not yet been parsed as an HTTP response, 383 | * calling this function parses the full response. Parsing is only performed 384 | * once - parsed data is retained for subsequent calls.
385 | * 386 | * @return An array of HTTP headers. 387 | */ 388 | public Header[] getHttpHeaders() 389 | throws IOException, HttpException { 390 | 391 | HttpResponse httpResponse = this.getHttpResponse(); 392 | 393 | if (httpResponse == null) 394 | return null; 395 | 396 | return httpResponse.getAllHeaders(); 397 | } 398 | 399 | /** 400 | * 401 | */ 402 | public static class ByteArraySessionInputBuffer 403 | extends AbstractSessionInputBuffer { 404 | 405 | public ByteArraySessionInputBuffer(byte[] buf) { 406 | BasicHttpParams params = new BasicHttpParams(); 407 | this.init(new ByteArrayInputStream(buf), 4096, params); 408 | } 409 | 410 | public ByteArraySessionInputBuffer(byte[] buf, int offset, int length) { 411 | BasicHttpParams params = new BasicHttpParams(); 412 | this.init(new ByteArrayInputStream(buf, offset, length), 4096, params); 413 | } 414 | 415 | public boolean isDataAvailable(int timeout) { 416 | return true; 417 | } 418 | } 419 | 420 | /** 421 | *Helper function to search a byte array for CR-LF-CR-LF (the end of 422 | * HTTP headers in the payload buffer).
423 | * 424 | * @return The offset of the end of HTTP headers, after the last CRLF. 425 | */ 426 | private int _searchForCRLFCRLF(byte[] data) { 427 | 428 | final byte CR = (byte)'\r'; 429 | final byte LF = (byte)'\n'; 430 | 431 | int i; 432 | int s = 0; 433 | 434 | for (i = 0; i < data.length; i++) { 435 | 436 | if (data[i] == CR) { 437 | if (s == 0) s = 1; 438 | else if (s == 1) s = 0; 439 | else if (s == 2) s = 3; 440 | else if (s == 3) s = 0; 441 | } 442 | else if (data[i] == LF) { 443 | if (s == 0) s = 0; 444 | else if (s == 1) s = 2; 445 | else if (s == 2) s = 0; 446 | else if (s == 3) s = 4; 447 | } 448 | else { 449 | s = 0; 450 | } 451 | 452 | if (s == 4) 453 | return i + 1; 454 | } 455 | 456 | return -1; 457 | } 458 | 459 | /** 460 | *Returns an HTTP response object parsed from the ARC record payload.
461 | *
Note: The payload is parsed on-demand, but is only parsed once. The 462 | * parsed data is saved for subsequent calls.
463 | * 464 | * @return The ARC record payload as an HTTP response object. See the Apache 465 | * HttpComponents project. 466 | */ 467 | public HttpResponse getHttpResponse() 468 | throws IOException, HttpException { 469 | 470 | if (this._httpResponse != null) 471 | return this._httpResponse; 472 | 473 | if (this._payload == null) { 474 | LOG.error("Unable to parse HTTP response: Payload has not been set"); return null; 475 | } 476 | 477 | if (this._url != null && !this._url.startsWith("http://") && !this._url.startsWith("https://")) { 478 | LOG.error("Unable to parse HTTP response: URL protocol is not HTTP"); return null; 479 | } 480 | 481 | this._httpResponse = null; 482 | 483 | // Find where the HTTP headers stop 484 | int end = this._searchForCRLFCRLF(this._payload); 485 | 486 | if (end == -1) { 487 | LOG.error("Unable to parse HTTP response: End of HTTP headers not found"); return null; 488 | } 489 | 490 | // Parse the HTTP status line and headers 491 | DefaultHttpResponseParser parser = 492 | new DefaultHttpResponseParser( 493 | new ByteArraySessionInputBuffer(this._payload, 0, end), 494 | new BasicLineParser(), 495 | new DefaultHttpResponseFactory(), 496 | new BasicHttpParams() 497 | ); 498 | 499 | this._httpResponse = parser.parse(); 500 | 501 | if (this._httpResponse == null) { 502 | LOG.error("Unable to parse HTTP response"); return null; 503 | } 504 | 505 | // Set the reset of the payload as the HTTP entity. Use an InputStreamEntity 506 | // to avoid a memory copy. 507 | InputStreamEntity entity = new InputStreamEntity(new ByteArrayInputStream(this._payload, end, this._payload.length - end), this._payload.length - end); 508 | entity.setContentType(this._httpResponse.getFirstHeader("Content-Type")); 509 | entity.setContentEncoding(this._httpResponse.getFirstHeader("Content-Encoding")); 510 | this._httpResponse.setEntity(entity); 511 | 512 | return this._httpResponse; 513 | } 514 | 515 | /** 516 | *Returns a Jsoup HTML document, parsed using the Charset in the 517 | * "Content-Type" header. If the document charset cannot be found, parse is 518 | * attempted using
519 | * 520 | * @return A Jsoup parsed HTML document from the HTTP response content. 521 | */ 522 | public Document getParsedHTML() 523 | throws IOException { 524 | 525 | if (this._url == null) { 526 | LOG.error("Unable to parse HTML: URL from ARC header has not been set"); 527 | return null; 528 | } 529 | 530 | // if response has not been parsed yet, this parses it 531 | try { 532 | this.getHttpResponse(); 533 | } 534 | catch (HttpException ex) { 535 | LOG.error("Unable to parse HTML: Exception during HTTP response parsing"); return null; 536 | } 537 | 538 | if (this._httpResponse == null) { 539 | LOG.error("Unable to parse HTML: Exception during HTTP response parsing"); return null; 540 | } 541 | 542 | if (this._httpResponse.getEntity() == null) { 543 | LOG.error("Unable to parse HTML: No HTTP response entity found"); return null; 544 | } 545 | 546 | if (!this._contentType.toLowerCase().contains("html")) { 547 | LOG.warn("Unable to parse HTML: Content is not HTML"); return null; 548 | } 549 | 550 | String charset = null; 551 | 552 | try { 553 | // Default value returned is "text/plain" with charset of ISO-8859-1. 554 | charset = ContentType.getOrDefault(this._httpResponse.getEntity()).getCharset().name(); 555 | } 556 | catch (Throwable ex) { 557 | 558 | } 559 | 560 | // if anything goes wrong, try ISO-8859-1 561 | if (charset == null) 562 | charset = "ISO-8859-1"; 563 | 564 | // parse the content using the derived charset and the URL from the ARC header 565 | return Jsoup.parse(this._httpResponse.getEntity().getContent(), charset, this._url); 566 | } 567 | } 568 | 569 | -------------------------------------------------------------------------------- /src/java/org/commoncrawl/hadoop/mapred/ArcRecordReader.java: -------------------------------------------------------------------------------- 1 | package org.commoncrawl.hadoop.mapred; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.EOFException; 5 | import java.io.IOException; 6 | import java.io.InputStream; 7 | import java.io.InputStreamReader; 8 | 9 | import java.lang.Math; 10 | import java.lang.StringBuffer; 11 | import java.util.Arrays; 12 | 13 | import org.apache.hadoop.conf.Configuration; 14 | import org.apache.hadoop.fs.FSDataInputStream; 15 | import org.apache.hadoop.fs.FileSystem; 16 | import org.apache.hadoop.fs.Path; 17 | import org.apache.hadoop.io.Text; 18 | import org.apache.hadoop.mapred.FileSplit; 19 | import org.apache.hadoop.mapred.RecordReader; 20 | 21 | import org.apache.log4j.Logger; 22 | 23 | import org.commoncrawl.compressors.gzip.GzipCompressorInputStream; 24 | 25 | /** 26 | * Reads ARC records. 27 | * 28 | * Set "io.file.buffer.size" to define the amount of data that should be 29 | * buffered from S3. 30 | */ 31 | public class ArcRecordReader 32 | implements RecordReaderRecordReader
for reading the arc file.
38 | *
39 | * @param split The InputSplit of the arc file to process.
40 | * @param job The job configuration.
41 | * @param reporter The progress reporter.
42 | */
43 | public RecordReaderThe ArcRecordReader
class provides a record reader which
41 | * reads records from arc files.
Arc files are essentially tars of gzips. Each record in an arc file is 44 | * a compressed gzip. Multiple records are concatenated together to form a 45 | * complete arc. For more information on the arc file format see 46 | * {@link http://www.archive.org/web/researcher/ArcFileFormat.php } .
47 | * 48 | *Arc files are used by the internet archive and grub projects.
49 | * 50 | * see {@link http://www.archive.org/ } 51 | * see {@link http://www.grub.org/ } 52 | */ 53 | public class ArcRecordReader 54 | implements RecordReaderReturns true if the byte array passed matches the gzip header magic 70 | * number.
71 | * 72 | * @param input The byte array to check. 73 | * 74 | * @return True if the byte array matches the gzip header magic number. 75 | */ 76 | public static boolean isMagic(byte[] input) { 77 | 78 | // check for null and incorrect length 79 | if (input == null || input.length != MAGIC.length) { 80 | return false; 81 | } 82 | 83 | // check byte by byte 84 | for (int i = 0; i < MAGIC.length; i++) { 85 | if (MAGIC[i] != input[i]) { 86 | return false; 87 | } 88 | } 89 | 90 | // must match 91 | return true; 92 | } 93 | 94 | /** 95 | * Constructor that sets the configuration and file split. 96 | * 97 | * @param conf The job configuration. 98 | * @param split The file split to read from. 99 | * 100 | * @throws IOException If an IO error occurs while initializing file split. 101 | */ 102 | public ArcRecordReader(Configuration conf, FileSplit split) 103 | throws IOException { 104 | 105 | Path path = split.getPath(); 106 | FileSystem fs = path.getFileSystem(conf); 107 | fileLen = fs.getFileStatus(split.getPath()).getLen(); 108 | this.conf = conf; 109 | this.in = fs.open(split.getPath()); 110 | this.splitStart = split.getStart(); 111 | this.splitEnd = splitStart + split.getLength(); 112 | this.splitLen = split.getLength(); 113 | in.seek(splitStart); 114 | } 115 | 116 | /** 117 | * Closes the record reader resources. 118 | */ 119 | public void close() 120 | throws IOException { 121 | this.in.close(); 122 | } 123 | 124 | /** 125 | * Creates a new instance of theText
object for the key.
126 | */
127 | public Text createKey() {
128 | return (Text)ReflectionUtils.newInstance(Text.class, conf);
129 | }
130 |
131 | /**
132 | * Creates a new instance of the BytesWritable
object for the key
133 | */
134 | public BytesWritable createValue() {
135 | return (BytesWritable)ReflectionUtils.newInstance(BytesWritable.class, conf);
136 | }
137 |
138 | /**
139 | * Returns the current position in the file.
140 | *
141 | * @return The long of the current position in the file.
142 | */
143 | public long getPos()
144 | throws IOException {
145 | return in.getPos();
146 | }
147 |
148 | /**
149 | * Returns the percentage of progress in processing the file. This will be
150 | * represented as a float from 0 to 1 with 1 being 100% completed.
151 | *
152 | * @return The percentage of progress as a float from 0 to 1.
153 | */
154 | public float getProgress()
155 | throws IOException {
156 |
157 | // if we haven't even started
158 | if (splitEnd == splitStart) {
159 | return 0.0f;
160 | }
161 | else {
162 | // the progress is current pos - where we started / length of the split
163 | return Math.min(1.0f, (getPos() - splitStart) / (float)splitLen);
164 | }
165 | }
166 |
167 | /**
168 | * Returns true if the next record in the split is read into the key and 169 | * value pair. The key will be the arc record header and the values will be 170 | * the raw content bytes of the arc record.
171 | * 172 | * @param key The record key 173 | * @param value The record value 174 | * 175 | * @return True if the next record is read. 176 | * 177 | * @throws IOException If an error occurs while reading the record value. 178 | */ 179 | public boolean next(Text key, BytesWritable value) 180 | throws IOException { 181 | 182 | try { 183 | 184 | // get the starting position on the input stream 185 | long startRead = in.getPos(); 186 | byte[] magicBuffer = null; 187 | 188 | // we need this loop to handle false positives in reading of gzip records 189 | while (true) { 190 | 191 | // while we haven't passed the end of the split 192 | if (startRead >= splitEnd) { 193 | return false; 194 | } 195 | 196 | // scanning for the gzip header 197 | boolean foundStart = false; 198 | while (!foundStart) { 199 | 200 | // start at the current file position and scan for 1K at time, break 201 | // if there is no more to read 202 | startRead = in.getPos(); 203 | magicBuffer = new byte[1024]; 204 | int read = in.read(magicBuffer); 205 | if (read < 0) { 206 | break; 207 | } 208 | 209 | // scan the byte array for the gzip header magic number. This happens 210 | // byte by byte 211 | for (int i = 0; i < read - 1; i++) { 212 | byte[] testMagic = new byte[2]; 213 | System.arraycopy(magicBuffer, i, testMagic, 0, 2); 214 | if (isMagic(testMagic)) { 215 | // set the next start to the current gzip header 216 | startRead += i; 217 | foundStart = true; 218 | break; 219 | } 220 | } 221 | } 222 | 223 | // seek to the start of the gzip header 224 | in.seek(startRead); 225 | ByteArrayOutputStream baos = null; 226 | int totalRead = 0; 227 | 228 | try { 229 | 230 | // read 4K of the gzip at a time putting into a byte array 231 | byte[] buffer = new byte[4096]; 232 | GZIPInputStream zin = new GZIPInputStream(in); 233 | int gzipRead = -1; 234 | baos = new ByteArrayOutputStream(); 235 | while ((gzipRead = zin.read(buffer, 0, buffer.length)) != -1) { 236 | baos.write(buffer, 0, gzipRead); 237 | totalRead += gzipRead; 238 | } 239 | } 240 | catch (Exception e) { 241 | 242 | // there are times we get false positives where the gzip header exists 243 | // but it is not an actual gzip record, so we ignore it and start 244 | // over seeking 245 | // LOG.debug("Ignoring position: " + (startRead)); 246 | if (startRead + 1 < fileLen) { 247 | in.seek(startRead + 1); 248 | } 249 | continue; 250 | } 251 | 252 | // change the output stream to a byte array 253 | byte[] content = baos.toByteArray(); 254 | 255 | // the first line of the raw content in arc files is the header 256 | int eol = 0; 257 | for (int i = 0; i < content.length; i++) { 258 | if (i > 0 && content[i] == '\n') { 259 | eol = i; 260 | break; 261 | } 262 | } 263 | 264 | // create the header and the raw content minus the header 265 | String header = new String(content, 0, eol).trim(); 266 | byte[] raw = new byte[(content.length - eol) - 1]; 267 | System.arraycopy(content, eol + 1, raw, 0, raw.length); 268 | 269 | // populate key and values with the header and raw content. 270 | Text keyText = (Text)key; 271 | keyText.set(header); 272 | BytesWritable valueBytes = (BytesWritable)value; 273 | valueBytes.set(raw, 0, raw.length); 274 | 275 | // TODO: It would be best to start at the end of the gzip read but 276 | // the bytes read in gzip don't match raw bytes in the file so we 277 | // overshoot the next header. With this current method you get 278 | // some false positives but don't miss records. 
279 | if (startRead + 1 < fileLen) { 280 | in.seek(startRead + 1); 281 | } 282 | 283 | // populated the record, now return 284 | return true; 285 | } 286 | } 287 | catch (Exception e) { 288 | LOG.equals(StringUtils.stringifyException(e)); 289 | } 290 | 291 | // couldn't populate the record or there is no next record to read 292 | return false; 293 | } 294 | } 295 | -------------------------------------------------------------------------------- /src/ruby/ExampleArcParseMap.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require 'rubygems' 4 | require 'open3' 5 | require 'uri' 6 | 7 | # Inline these classes so we don't have to copy a file while bootstrapping 8 | class ArcRecord 9 | attr_accessor :num, :url, :ip_address, :archive_date, :content_type, :content_length, :content 10 | end 11 | 12 | class ArcFile 13 | 14 | include Enumerable 15 | 16 | def initialize(input_stream) 17 | @handle=input_stream 18 | end 19 | 20 | def each 21 | return self.to_enum unless block_given? 22 | begin 23 | # See http://www.archive.org/web/researcher/ArcFileFormat.php 24 | # for information about the ARC format once it is decompressed 25 | file_header = @handle.readline.strip 26 | @handle.read(Integer(file_header.split.last)) 27 | i=1 28 | 29 | loop do 30 | begin 31 | fields = @handle.readline.strip.split(" ") 32 | raise "Invalid ARC record header found" if fields.length != 5 33 | warn("Invalid protocol in ARC record header") if not fields[0].to_s.start_with?("http://", "https://") 34 | 35 | record = ArcRecord.new 36 | record.num = i 37 | record.url = fields[0].to_s 38 | record.ip_address = fields[1].to_s 39 | record.archive_date = fields[2].to_s 40 | record.content_type = fields[3].to_s 41 | record.content_length = Integer(fields[4]) 42 | record.content = @handle.read(record.content_length) 43 | i = i+1 44 | 45 | yield record 46 | 47 | rescue EOFError 48 | break nil 49 | end 50 | end 51 | #rescue 52 | # raise "#{self.class}: Record ##{i} - Error - #{$!}" 53 | end 54 | end 55 | 56 | end 57 | 58 | CHUNKSIZE=1024*1024 59 | 60 | # All warnings will end up in the EMR stderr logs. 61 | warn("Starting up GZIP process, piping #{CHUNKSIZE/1024}KB chunks at a time") 62 | 63 | # Ruby GzipReader is unable to unzip these files, but unix gunzip can 64 | # Also means we don't need to eat much RAM, because everything is streaming. 65 | Open3.popen3('gunzip -c') {|sin,sout,serr,thr| 66 | 67 | # Create an ArcFile instance which will receive gunzip's stdout 68 | arcfile = ArcFile.new(sout) 69 | 70 | Thread.new do 71 | loop do 72 | begin 73 | chunk = STDIN.readpartial(CHUNKSIZE) 74 | sin.write(chunk) 75 | Thread.pass() 76 | rescue EOFError 77 | warn("End of input, flushing and closing stream to GZIP") 78 | sin.close() # which will send an EOF to the ArcFile 79 | break nil 80 | end 81 | end 82 | end 83 | 84 | # Now we have a lazy ArcFile that we can treat as an Enumerable. 85 | arcfile.each {|record| 86 | if record 87 | begin 88 | # work around Ruby URI library's lack of support for URLs with underscore 89 | uri = URI.parse(record.url.delete("_")) 90 | STDOUT.puts(uri.host.downcase()) 91 | rescue URI::InvalidURIError 92 | warn("ARC file contains invalid URL: "+record.url) 93 | next 94 | end 95 | end 96 | } 97 | } 98 | 99 | -------------------------------------------------------------------------------- /src/ruby/ExampleArcParseReduce.rb: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env ruby 2 | 3 | curr = nil 4 | sum = 0 5 | 6 | ARGF.each do |line| 7 | 8 | # the entire line is the key 9 | key = line.chomp 10 | 11 | # if the current key hasn't been set yet, set it 12 | if !curr 13 | 14 | curr = key 15 | sum = 0 16 | 17 | # if a new key is found, emit the current key ... 18 | elsif key != curr && sum > 0 19 | 20 | if sum > 2 21 | STDOUT.puts(curr + "\t" + sum.to_s()) 22 | end 23 | 24 | # ... then set up a new key 25 | curr = key 26 | sum = 0 27 | 28 | end 29 | 30 | # add to count for this current key 31 | sum += 1 32 | 33 | end 34 | -------------------------------------------------------------------------------- /src/ruby/README: -------------------------------------------------------------------------------- 1 | common_crawl_types 2 | 3 | Ben Nagy wrote the original code for this project, and posted it inline to the 4 | Common Crawl mailing list. I tidied it up and wrote a how-to guide: 5 | http://petewarden.typepad.com/searchbrowser/2012/03/twelve-steps-to-running-your-ruby-code-across-five-billion-web-pages.html 6 | 7 | Ben's original message is below. 8 | 9 | Pete Warden, pete@petewarden.com 10 | 11 | ------------------------------------------------------------------- 12 | 13 | Hi, 14 | 15 | So I found this a bit of a pain, so I thought I'd share. If you want 16 | to mess with the Common Crawl stuff but don't feel like learning Java, 17 | this might be for you. 18 | 19 | I'm sure that this could be easily adapted for other streaming 20 | languages, once you work out how to read requester-pays buckets. 21 | 22 | First up, see this: 23 | http://arfon.org/getting-started-with-elastic-mapreduce-and-hadoop-streaming 24 | 25 | Which has basic information and nice screenshots about EMR Streaming, 26 | setting up the job, bootstrapping and such. 27 | 28 | To install the AWS Ruby SDK on an EMR instance you'll need to 29 | bootstrap some stuff. Some of the packages might not be necessary, but 30 | it was a bit of a pain to trim down from a working set of basic 31 | packages. 32 | 33 | (see setup.sh) 34 | 35 | OK, now we're ready for the mapper. This example just collects 36 | mimetypes and URL extensions. The key bits are the ArcFile class and 37 | the monkeypatch to make requester-pays work. I'm not particularly 38 | proud of this monkeypatch, by the way, but the SDK code is a bit 39 | baffling, and it looked like too much work to patch it properly. 40 | 41 | This mapper expects a file manifest as input, one arc.gz url to read 42 | per line. By doing this you avoid the problem of weird splits, or 43 | having hadoop automatically trying to gunzip the file and failing. It 44 | should look like: 45 | 46 | s3://commoncrawl-crawl-002/2010/09/24/9/1285380159663_9.arc.gz 47 | s3://commoncrawl-crawl-002/2010/09/24/9/1285380179515_9.arc.gz 48 | s3://commoncrawl-crawl-002/2010/09/24/9/1285380199363_9.arc.gz 49 | 50 | You can get those names with the SDK, once you add the monkeypatch 51 | below, or with a patched version of s3cmd ls, the instructions for 52 | which have been posted here before. 
53 | 54 | (see extension_map.rb) 55 | 56 | And finally, a trivial reducer 57 | 58 | (see extension_reduce.rb) 59 | 60 | IMHO you only need one of these puppies, which you can achieve by 61 | adding '-D mapred.reduce.tasks=1' to your job args 62 | 63 | If it all worked you should get something like this in your output 64 | directory: 65 | 66 | text/html : 4365 67 | text/html .html : 4256 68 | text/xml : 43 69 | text/html .aspx : 16 70 | text/html .com : 2 71 | text/plain .txt : 1 72 | 73 | Except with more entries, that is just an example based on one file. 74 | 75 | For those interested in costs / timings, I finished 2010/9/24/9 (790 76 | files) in 5h57m, or 30 normalised instance hours of m1.small, with 1 77 | master and 4 core instances. The same job with 1 m1.small master and 78 | 2x cc1.4xlarge core was done in 1h31m, for 66 normalised instance 79 | hours. I'll let you do your individual maths and avoid drawing any 80 | conclusions. If anyone has additional (solid) performance data 81 | comparing various cluster configs for identical workloads then that 82 | might be useful. As an aside, my map tasks took from 9 minutes to 45 83 | minutes to complete, but the average was probably ~33 (eyeball). 84 | 85 | Anyway, hope this helps someone. 86 | 87 | Cheers, 88 | 89 | ben 90 | -------------------------------------------------------------------------------- /test/java/org/commoncrawl/hadoop/mapred/TestArcRecordCC.java: -------------------------------------------------------------------------------- 1 | package org.commoncrawl.hadoop.mapred; 2 | 3 | import java.io.ByteArrayInputStream; 4 | import java.io.InputStream; 5 | import java.io.IOException; 6 | import java.lang.StringBuilder; 7 | 8 | import junit.framework.TestCase; 9 | import static junit.framework.Assert.*; 10 | 11 | /** 12 | * Unit Tests for jUnit 3.8 13 | */ 14 | public class TestArcRecordCC extends TestCase { 15 | 16 | ArcRecordCC r; 17 | 18 | /* 19 | public static junit.framework.Test suite() { 20 | return new junit.framework.JUnit4TestAdapter(TestArcRecordCC.class); 21 | } 22 | */ 23 | 24 | public InputStream getPayload1() 25 | throws Exception { 26 | 27 | StringBuilder s = new StringBuilder(); 28 | 29 | s.setLength(0); 30 | s.append("\n"); 31 | s.append(" \n"); 32 | s.append("