├── .gitignore
├── README-Amazon-AMI
├── VERSION
├── bin
│   ├── ccCopyToHDFS
│   ├── ccListInvalidSegments
│   └── ccRunExample
├── build.properties
├── build.xml
├── conf
│   └── mapred.xml
├── lib
│   ├── gson-2.2.1.jar
│   ├── guava-12.0.jar
│   ├── httpcore-4.2.1.jar
│   └── jsoup-1.6.3.jar
├── src
│   ├── java
│   │   └── org
│   │       └── commoncrawl
│   │           ├── compressors
│   │           │   ├── CompressorInputStream.java
│   │           │   └── gzip
│   │           │       └── GzipCompressorInputStream.java
│   │           ├── examples
│   │           │   ├── ExampleArcMicroformat.java
│   │           │   ├── ExampleMetadataDomainPageCount.java
│   │           │   ├── ExampleMetadataStats.java
│   │           │   └── ExampleTextWordCount.java
│   │           ├── hadoop
│   │           │   └── mapred
│   │           │       ├── ArcInputFormat.java
│   │           │       ├── ArcRecord.java
│   │           │       └── ArcRecordReader.java
│   │           └── nutch
│   │               └── tools
│   │                   └── arc
│   │                       ├── ArcInputFormat.java
│   │                       └── ArcRecordReader.java
│   └── ruby
│       ├── ExampleArcParseMap.rb
│       ├── ExampleArcParseReduce.rb
│       └── README
└── test
    └── java
        └── org
            └── commoncrawl
                └── hadoop
                    └── mapred
                        └── TestArcRecordCC.java
/.gitignore: -------------------------------------------------------------------------------- 1 | build 2 | build-test 3 | dist 4 | output 5 | -------------------------------------------------------------------------------- /README-Amazon-AMI: -------------------------------------------------------------------------------- 1 | Common Crawl Quick Start Amazon AMI 2 | ----------------------------------- 3 | 4 | Welcome to the Common Crawl Quick Start Amazon AMI! 5 | 6 | The Common Crawl corpus is a copy of billions of web documents and their 7 | metadata, stored as an Amazon S3 Public Dataset and available for analysis. 8 | 9 | Here are the steps you need to follow to run your first job against the 10 | Common Crawl corpus: 11 | 12 | 1. Find your Amazon Access Credentials (Amazon Access ID & Amazon Secret Key) 13 | and save as two lines in this file: 14 | 15 | /home/ec2-user/.awssecret 16 | 17 | For example: 18 | 19 | JLASKHJFLKDHJLFKSJDF 20 | DFHSDJHhhoiaGKHDFa6sd42rwuhfapgfuAGSDAjh 21 | 22 | Change the permissions of this file to read/write only by 'ec2-user': 23 | 24 | chmod 600 /home/ec2-user/.awssecret 25 | 26 | Now you can use Tim Kay's AWS Command Line tool. Try this: 27 | 28 | aws ls -1 aws-publicdatasets/common-crawl/parse-output/segment/1341690167474/metadata- 29 | 30 | If you are planning on using the local Hadoop cluster, you should also consider 31 | setting these properties in /etc/hadoop/hadoop-site.xml: 32 | 33 | fs.s3n.awsAccessKeyId 34 | fs.s3n.awsSecretAccessKey 35 | 36 | 2. Move to the 'commoncrawl-examples' directory. Make sure it is up-to-date: 37 | 38 | cd ~/commoncrawl-examples; git pull 39 | 40 | 3. Compile the latest example code: 41 | 42 | ant 43 | 44 | 4. Run an example! Decide whether you want to run an example on the small local 45 | Hadoop instance or on Amazon Elastic MapReduce. 46 | 47 | Run this command to see your options: 48 | 49 | bin/ccRunExample 50 | 51 | then go ahead and run an example: 52 | 53 | bin/ccRunExample LocalHadoop ExampleMetadataDomainPageCount 54 | 55 | then look at the code: 56 | 57 | nano src/java/org/commoncrawl/examples/ExampleMetadataDomainPageCount.java 58 | 59 | Note: You need to have your own Amazon S3 bucket to run Amazon Elastic 60 | MapReduce jobs. 61 | 62 | ----------------------------------- 63 | 64 | You can read all of this again in $HOME/commoncrawl-examples/README-Amazon-AMI. 65 | 66 | Have fun!
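For example, here is a sketch of step 4 when targeting Amazon Elastic MapReduce instead of the local Hadoop cluster ('my-example-bucket' is a placeholder -- substitute an S3 bucket that your AWS credentials can write to):

   bin/ccRunExample AmazonEMR ExampleMetadataDomainPageCount my-example-bucket

Based on the paths used in bin/ccRunExample, the script uploads the example JAR to 's3n://my-example-bucket/emr/jars', writes job logs under 's3n://my-example-bucket/emr/logs', and leaves the results under 's3n://my-example-bucket/emr/output/ExampleMetadataDomainPageCount'. Once the job completes, the output can be merged into a local file with:

   hadoop fs -getmerge s3n://my-example-bucket/emr/output/ExampleMetadataDomainPageCount output/ExampleMetadataDomainPageCount.tsv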
67 | 68 | -------------------------------------------------------------------------------- /VERSION: -------------------------------------------------------------------------------- 1 | 1.0.1 2 | -------------------------------------------------------------------------------- /bin/ccCopyToHDFS: -------------------------------------------------------------------------------- 1 | #!/bin/bash -aeu 2 | 3 | usage() { 4 | echo "" 5 | echo "$(basename $0) ( Save To Path [ # of Files to Download ] )" 6 | echo "" 7 | echo "i.e. $(basename $0) hdfs://localhost/common-crawl 25" 8 | echo "" 9 | exit 1 10 | } 11 | 12 | echo 13 | echo "-----------------------------------------------------------------" 14 | echo "* " 15 | echo "* Common Crawl Data Downloader" 16 | echo "* " 17 | echo "-----------------------------------------------------------------" 18 | 19 | if [ ! -r ~/.awssecret ]; then 20 | echo "" 21 | echo "ERROR: Please create a readable '.awssecret' file in your home directory." 22 | echo "" 23 | echo "The first line should be your AWS Access ID." 24 | echo "" 25 | echo "The second line should be your AWS Secret Key." 26 | echo "" 27 | exit 1 28 | fi 29 | 30 | AWS_ACCESS_ID=$(head -n 1 ~/.awssecret) 31 | AWS_SECRET_KEY=$(tail -n 1 ~/.awssecret) 32 | 33 | CC_PATH="s3n://aws-publicdatasets/common-crawl/parse-output" 34 | 35 | if [ $# -le 0 ]; then 36 | usage 37 | exit 0 38 | fi 39 | 40 | if [ $# -ge 1 ]; then 41 | OUTPUT_PATH="$1" 42 | fi 43 | 44 | if [ $# -ge 2 ]; then 45 | FILE_LIMIT="$2" 46 | FILE_LIMIT_PARAM="-filelimit $2" 47 | else 48 | FILE_LIMIT="-1" 49 | FILE_LIMIT_PARAM="" 50 | fi 51 | 52 | echo "INFO: Downloading list of valid segments" 53 | rm -f /tmp/cc-valid.txt 54 | 55 | hadoop fs -get ${CC_PATH}/valid_segments.txt /tmp/cc-valid.txt 56 | 57 | if [ ! 
-s /tmp/cc-valid.txt ]; then 58 | echo "ERROR: Unable to download valid segments list" 59 | exit 1 60 | fi 61 | 62 | while read SEGMENT_ID; do 63 | SOURCE_PATH="${CC_PATH}/segment/${SEGMENT_ID}" 64 | TARGET_PATH="${OUTPUT_PATH}/segment/${SEGMENT_ID}" 65 | echo "INFO: Running copy command for segment ${SEGMENT_ID}" 66 | echo " 67 | hadoop distcp \\ 68 | -Dfs.s3n.awsAccessKeyId=\"**********\" -Dfs.s3n.awsSecretAccessKey=\"**********\" \\ 69 | -i ${FILE_LIMIT_PARAM} \\ 70 | ${SOURCE_PATH} \\ 71 | ${TARGET_PATH} 72 | " 73 | hadoop distcp \ 74 | -Dfs.s3n.awsAccessKeyId="${AWS_ACCESS_ID}" -Dfs.s3n.awsSecretAccessKey="${AWS_SECRET_KEY}" \ 75 | -i ${FILE_LIMIT_PARAM} \ 76 | ${SOURCE_PATH} \ 77 | ${TARGET_PATH} 78 | 79 | if [ ${FILE_LIMIT} -gt 0 ]; then 80 | break 81 | fi 82 | 83 | done < /tmp/cc-valid.txt 84 | 85 | -------------------------------------------------------------------------------- /bin/ccListInvalidSegments: -------------------------------------------------------------------------------- 1 | #!/bin/bash -aeu 2 | 3 | echo "" 4 | echo "> gathering valid segments" 5 | hadoop fs -ls s3n://aws-publicdatasets/common-crawl/parse-output/valid_segments | cut -d" " -f 17- | sort > /tmp/cc-valid.txt 6 | sed -i "s/valid_segments/segment/" /tmp/cc-valid.txt 7 | 8 | echo "> gathering all segments published" 9 | hadoop fs -ls s3n://aws-publicdatasets/common-crawl/parse-output/segment | cut -d" " -f 17- | sort > /tmp/cc-all.txt 10 | echo "" 11 | 12 | echo "* " 13 | echo "* List of Invalid Segments" 14 | echo "* " 15 | diff -b -w /tmp/cc-all.txt /tmp/cc-valid.txt | fgrep "segment" | sed "s/< /hadoop fs -rmr s3n:\/\/aws-publicdatasets/" 16 | 17 | #rm -f /tmp/cc-all.txt 18 | #rm -f /tmp/cc-valid.txt 19 | 20 | -------------------------------------------------------------------------------- /bin/ccRunExample: -------------------------------------------------------------------------------- 1 | #!/bin/bash -aeu 2 | 3 | BASE_PATH=`dirname $0`"/.." 4 | BASE_PATH=`cd ${BASE_PATH}; pwd` 5 | 6 | VERSION="$(cat ${BASE_PATH}/VERSION)" 7 | 8 | HDFS_LOCAL_HOSTNAME="localhost" 9 | MAIN_JAR="commoncrawl-examples-${VERSION}.jar" 10 | EXAMPLES_PATH="src/java/org/commoncrawl/examples" 11 | EXAMPLES_PKG="org.commoncrawl.examples" 12 | 13 | LOCAL_JAR_PATH="${BASE_PATH}/dist/lib" 14 | 15 | usage() { 16 | echo "" 17 | echo "$(basename $0) [ LocalHadoop | AmazonEMR ] [ ExampleName ] ( S3Bucket )" 18 | echo "" 19 | echo "Please pass in one of the following examples: " 20 | echo "" 21 | ls ${BASE_PATH}/${EXAMPLES_PATH} | sed 's/\.java$//' 22 | echo "" 23 | exit 1 24 | } 25 | 26 | echo 27 | echo "-----------------------------------------------------------------" 28 | echo "* " 29 | echo "* Common Crawl Example Library Runner" 30 | echo "* " 31 | echo "-----------------------------------------------------------------" 32 | 33 | if [ ! -r ~/.awssecret ]; then 34 | echo "" 35 | echo "ERROR: Please create a readable '.awssecret' file in your home directory." 36 | echo "" 37 | echo "The first line should be your AWS Access ID." 38 | echo "" 39 | echo "The second line should be your AWS Secret Key." 40 | echo "" 41 | exit 1 42 | fi 43 | 44 | AWS_ACCESS_ID=$(head -n 1 ~/.awssecret) 45 | AWS_SECRET_KEY=$(tail -n 1 ~/.awssecret) 46 | 47 | if [ ! -e ${LOCAL_JAR_PATH}/${MAIN_JAR} ]; then 48 | echo "" 49 | echo "ERROR: Please run the command 'ant' to build '${MAIN_JAR}' before attempting to run an example." 
50 | echo "" 51 | exit 1 52 | fi 53 | 54 | # run the example provided on the command line 55 | if [ $# -lt 2 ]; then 56 | usage 57 | fi 58 | 59 | RUN_TYPE="$1" 60 | EXAMPLE="$2" 61 | 62 | # run the selected example 63 | if [ ! -f ${BASE_PATH}/${EXAMPLES_PATH}/${EXAMPLE}.java ]; then 64 | echo "" 65 | echo "ERROR: Cannot run example '${EXAMPLE}' - not found." 66 | echo "" 67 | echo "Please run one of the following:" 68 | echo "" 69 | ls ${BASE_PATH}/${EXAMPLES_PATH} | sed 's/\.java$//' 70 | echo "" 71 | exit 1 72 | fi 73 | 74 | if [ "${RUN_TYPE}" = "AmazonEMR" ]; then 75 | 76 | if [ $# -lt 3 ]; then 77 | echo "" 78 | echo "ERROR: To run an Amazon Elastic MapReduce job, you must supply an S3 bucket " 79 | echo " that you have permissions to write files to." 80 | echo "" 81 | usage 82 | fi 83 | 84 | S3_USER_BUCKET="$3" 85 | 86 | EMR_JAR_PATH="${S3_USER_BUCKET}/emr/jars" 87 | EMR_LOG_PATH="${S3_USER_BUCKET}/emr/logs" 88 | EMR_OUTPUT_PATH="${S3_USER_BUCKET}/emr/output/${EXAMPLE}" 89 | 90 | echo "* " 91 | echo "* Uploading JAR + Config to S3 '${EMR_JAR_PATH}'" 92 | echo "* " 93 | echo aws put ${EMR_JAR_PATH}/${MAIN_JAR} ${LOCAL_JAR_PATH}/${MAIN_JAR} 94 | aws put ${EMR_JAR_PATH}/${MAIN_JAR} ${LOCAL_JAR_PATH}/${MAIN_JAR} 95 | echo "" 96 | 97 | LOCAL_OUTPUT_PATH="${BASE_PATH}/output/${EXAMPLE}.tsv" 98 | 99 | # We've found that a single, high-memory instance works well for the master, 100 | # which runs the JobTracker 101 | MASTER_TYPE="m1.large" # consider using MASTER_TYPE="m2.4xlarge" 102 | CORE_TYPE="m1.large" # consider using CORE_TYPE="m2.2xlarge" 103 | 104 | # We've found the 'c1.xlarge' instance type to be most efficient for EMR 105 | # jobs - though we are open to suggestions! 106 | TASK_TYPE="c1.xlarge" # EMR = +$0.12 per instance hour 107 | 108 | INSTANCES=4 109 | 110 | BID="0.08" 111 | 112 | TIMESTAMP=$(date +%Y%m%d_%H%M%S) 113 | JOBNAME="Common_Crawl_${EXAMPLE}__${TIMESTAMP}" 114 | 115 | echo "-----------------------------------------------------------------" 116 | echo "* " 117 | echo "* Running Example '${EXAMPLE}'" 118 | echo "* " 119 | echo "* Starting Amazon Elastic MapReduce Job" 120 | echo "* " 121 | echo "-----------------------------------------------------------------" 122 | 123 | # Add in this option to specify a certain number of reducers: 124 | # 125 | # --arg "-Dmapred.reduce.tasks=${REDUCERS}" \ 126 | # 127 | 128 | # if the line breaks don't work, join the following lines and remove all '\' 129 | echo \ 130 | /opt/aws/emr/elastic-mapreduce --create --plain-output --name "${JOBNAME}" --ami-version="2.1.1" --hadoop-version="0.20.205" \ 131 | --jar "s3n://${EMR_JAR_PATH}/${MAIN_JAR}" --step-name "Run_${EXAMPLE}" \ 132 | --log-uri "s3n://${EMR_LOG_PATH}" \ 133 | --main-class "${EXAMPLES_PKG}.${EXAMPLE}" \ 134 | --access-id "********" --private-key "********" \ 135 | --arg "-Dmapreduce.job.split.metainfo.maxsize=-1" \ 136 | --arg "-Dmapred.max.map.failures.percent=50" \ 137 | --arg "s3n://${EMR_OUTPUT_PATH}" \ 138 | --instance-group master --instance-type "${MASTER_TYPE}" --instance-count 1 \ 139 | --instance-group core --instance-type "${CORE_TYPE}" --instance-count 1 \ 140 | --instance-group task --instance-type "${TASK_TYPE}" --instance-count ${INSTANCES} --bid-price ${BID} 141 | echo "" 142 | 143 | set +e 144 | 145 | THIS_PID=$$ 146 | 147 | EMR_JOB_ID=$(/opt/aws/emr/elastic-mapreduce --create --plain-output --name "${JOBNAME}" --ami-version="2.1.1" --hadoop-version="0.20.205" \ 148 | --jar "s3n://${EMR_JAR_PATH}/${MAIN_JAR}" --step-name "Run_${EXAMPLE}" \ 149 | --log-uri 
"s3n://${EMR_LOG_PATH}" \ 150 | --main-class "${EXAMPLES_PKG}.${EXAMPLE}" \ 151 | --access-id "${AWS_ACCESS_ID}" --private-key "${AWS_SECRET_KEY}" \ 152 | --arg "-Dmapreduce.job.split.metainfo.maxsize=-1" \ 153 | --arg "-Dmapred.max.map.failures.percent=50" \ 154 | --arg "s3n://${EMR_OUTPUT_PATH}" \ 155 | --instance-group master --instance-type "${MASTER_TYPE}" --instance-count 1 \ 156 | --instance-group core --instance-type "${CORE_TYPE}" --instance-count 1 \ 157 | --instance-group task --instance-type "${TASK_TYPE}" --instance-count ${INSTANCES} --bid-price ${BID}) 158 | 159 | RC=$? 160 | 161 | set -e 162 | 163 | if [ $RC -ne 0 ]; then 164 | echo "WARNING: Amazon EMR returned non-zero status code: $RC" 165 | fi 166 | 167 | if [ -z "${EMR_JOB_ID}" ]; then 168 | echo "WARNING: Unable to determine EMR Job ID" 169 | EMR_JOB_ID="[Amazon EMR Job ID]" 170 | fi 171 | 172 | echo "" 173 | echo "-----------------------------------------------------------------" 174 | echo "* " 175 | echo "* Your Amazon Elastic MapReduce job has been launched. " 176 | echo "* " 177 | echo "* Please look for '${JOBNAME}'" 178 | echo "* in your AWS Web Console." 179 | echo "* " 180 | echo "* Once the job has completed, run the following command to view " 181 | echo "* log files: " 182 | echo "* " 183 | echo "* hadoop dfs -get s3n://${EMR_LOG_PATH}/${EMR_JOB_ID} ${BASE_PATH}/logs" 184 | echo "* " 185 | echo "* and the following command to pull down the output files: " 186 | echo "* " 187 | echo "* hadoop fs -getmerge s3n://${EMR_OUTPUT_PATH} ${LOCAL_OUTPUT_PATH}" 188 | echo "* " 189 | echo "-----------------------------------------------------------------" 190 | 191 | mkdir -p ${BASE_PATH}/logs 192 | 193 | exit ${RC} 194 | 195 | fi 196 | 197 | if [ "${RUN_TYPE}" = "LocalHadoop" ]; then 198 | 199 | MAPRED_OUTPUT_PATH="hdfs://${HDFS_LOCAL_HOSTNAME}/user/${USER}/output/${EXAMPLE}" 200 | LOCAL_OUTPUT_PATH="${BASE_PATH}/output/${EXAMPLE}.tsv" 201 | 202 | echo "* " 203 | echo "* Running Example '${EXAMPLE}'" 204 | echo "* " 205 | echo "-----------------------------------------------------------------" 206 | echo hadoop jar ${LOCAL_JAR_PATH}/${MAIN_JAR} ${EXAMPLES_PKG}.${EXAMPLE} \ 207 | ${MAPRED_OUTPUT_PATH} ${BASE_PATH}/conf/mapred.xml 208 | echo "" 209 | 210 | hadoop jar ${LOCAL_JAR_PATH}/${MAIN_JAR} ${EXAMPLES_PKG}.${EXAMPLE} \ 211 | -Dfs.s3.awsAccessKeyId="${AWS_ACCESS_ID}" -Dfs.s3.awsSecretAccessKey="${AWS_SECRET_KEY}" \ 212 | -Dfs.s3n.awsAccessKeyId="${AWS_ACCESS_ID}" -Dfs.s3n.awsSecretAccessKey="${AWS_SECRET_KEY}" \ 213 | ${MAPRED_OUTPUT_PATH} ${BASE_PATH}/conf/mapred.xml 214 | 215 | RC=$? 216 | 217 | if [ $RC -ne 0 ]; then 218 | echo "-----------------------------------------------------------------" 219 | echo "* " 220 | echo "* There was a problem running '${EXAMPLE}'." 221 | echo "* " 222 | echo "* Please contact 'info@commoncrawl.org'." 223 | echo "* " 224 | echo "-----------------------------------------------------------------" 225 | exit $RC 226 | fi 227 | 228 | echo "-----------------------------------------------------------------" 229 | echo "* " 230 | echo "* Your MapReduce job '${EXAMPLE}' completed successfully!" 
231 | echo "* " 232 | echo "* Copying output to the local file system:" 233 | echo "* " 234 | echo 235 | rm -f ${LOCAL_OUTPUT_PATH} 236 | echo hadoop fs -getmerge ${MAPRED_OUTPUT_PATH} ${LOCAL_OUTPUT_PATH} 237 | hadoop fs -getmerge ${MAPRED_OUTPUT_PATH} ${LOCAL_OUTPUT_PATH} 238 | echo 239 | echo "* " 240 | echo "* You can see the results of your job here:" 241 | echo "* " 242 | echo "* ${LOCAL_OUTPUT_PATH}" 243 | echo "* " 244 | echo "* Here are the first 15 lines of output:" 245 | echo "* " 246 | echo "-------------------------------------------------------------" 247 | echo 248 | head -n 15 ${LOCAL_OUTPUT_PATH} 249 | echo 250 | 251 | exit 0 252 | 253 | fi 254 | 255 | -------------------------------------------------------------------------------- /build.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Common Crawl Examples - Build Configuration Parameters 3 | # 4 | 5 | # Path to Hadoop libraries 6 | hadoop.path=/usr/share/hadoop 7 | 8 | -------------------------------------------------------------------------------- /build.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Common Crawl Examples Build File 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | -------------------------------------------------------------------------------- /conf/mapred.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | mapred.map.tasks 4 | 1 5 | 6 | 7 | mapred.max.map.failures.percent 8 | 10 9 | 10 | 11 | -------------------------------------------------------------------------------- /lib/gson-2.2.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/commoncrawl-examples/3223e656603cf6db14abdff5f5d08d34f9d12e61/lib/gson-2.2.1.jar -------------------------------------------------------------------------------- /lib/guava-12.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/commoncrawl-examples/3223e656603cf6db14abdff5f5d08d34f9d12e61/lib/guava-12.0.jar -------------------------------------------------------------------------------- /lib/httpcore-4.2.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/commoncrawl-examples/3223e656603cf6db14abdff5f5d08d34f9d12e61/lib/httpcore-4.2.1.jar -------------------------------------------------------------------------------- /lib/jsoup-1.6.3.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/commoncrawl-examples/3223e656603cf6db14abdff5f5d08d34f9d12e61/lib/jsoup-1.6.3.jar -------------------------------------------------------------------------------- /src/java/org/commoncrawl/compressors/CompressorInputStream.java: 
-------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | 20 | // package modified by Common Crawl - no other changes from 21 | // Apache Commons Compress 1.4.1 22 | package org.commoncrawl.compressors; 23 | 24 | import java.io.InputStream; 25 | 26 | public abstract class CompressorInputStream extends InputStream { 27 | private long bytesRead = 0; 28 | 29 | /** 30 | * Increments the counter of already read bytes. 31 | * Doesn't increment if the EOF has been hit (read == -1) 32 | * 33 | * @param read the number of bytes read 34 | * 35 | * @since 1.1 36 | */ 37 | protected void count(int read) { 38 | count((long) read); 39 | } 40 | 41 | /** 42 | * Increments the counter of already read bytes. 43 | * Doesn't increment if the EOF has been hit (read == -1) 44 | * 45 | * @param read the number of bytes read 46 | */ 47 | protected void count(long read) { 48 | if(read != -1) { 49 | bytesRead = bytesRead + read; 50 | } 51 | } 52 | 53 | /** 54 | * Returns the current number of bytes read from this stream. 55 | * @return the number of read bytes 56 | * @deprecated this method may yield wrong results for large 57 | * archives, use #getBytesRead instead 58 | */ 59 | @Deprecated 60 | public int getCount() { 61 | return (int) bytesRead; 62 | } 63 | 64 | /** 65 | * Returns the current number of bytes read from this stream. 66 | * @return the number of read bytes 67 | * 68 | * @since 1.1 69 | */ 70 | public long getBytesRead() { 71 | return bytesRead; 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /src/java/org/commoncrawl/compressors/gzip/GzipCompressorInputStream.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | 20 | // Source code taken from Apache Commons Compress 1.4.1. 
Feature is being 21 | // submitted to Apache project - patch will be applied if approved. 22 | package org.commoncrawl.compressors.gzip; 23 | 24 | import java.io.IOException; 25 | import java.io.EOFException; 26 | import java.io.InputStream; 27 | import java.io.DataInputStream; 28 | import java.io.BufferedInputStream; 29 | import java.util.zip.DataFormatException; 30 | import java.util.zip.Inflater; 31 | import java.util.zip.CRC32; 32 | 33 | import org.commoncrawl.compressors.CompressorInputStream; 34 | 35 | /** 36 | * Input stream that decompresses .gz files. 37 | * This supports decompressing concatenated .gz files which is important 38 | * when decompressing standalone .gz files. 39 | *
<p>
40 | * {@link java.util.zip.GZIPInputStream} doesn't decompress concatenated .gz 41 | * files: it stops after the first member and silently ignores the rest. 42 | * It doesn't leave the read position to point to the beginning of the next 43 | * member, which makes it difficult to work around the lack of concatenation 44 | * support. 45 | *
<p>
46 | * Instead of using GZIPInputStream, this class has its own .gz 47 | * container format decoder. The actual decompression is done with 48 | * {@link java.util.zip.Inflater}. 49 | */ 50 | public class GzipCompressorInputStream extends CompressorInputStream { 51 | // Header flags 52 | // private static final int FTEXT = 0x01; // Uninteresting for us 53 | private static final int FHCRC = 0x02; 54 | private static final int FEXTRA = 0x04; 55 | private static final int FNAME = 0x08; 56 | private static final int FCOMMENT = 0x10; 57 | private static final int FRESERVED = 0xE0; 58 | 59 | // Compressed input stream, possibly wrapped in a BufferedInputStream 60 | private final InputStream in; 61 | 62 | // True if decompressing multimember streams. 63 | private final boolean decompressConcatenated; 64 | 65 | // Buffer to hold the input data 66 | private final byte[] buf = new byte[8192]; 67 | 68 | // Amount of data in buf. 69 | private int bufUsed = 0; 70 | 71 | // Decompressor 72 | private Inflater inf = new Inflater(true); 73 | 74 | // CRC32 from uncompressed data 75 | private CRC32 crc = new CRC32(); 76 | 77 | private int memberSize; 78 | 79 | // True once the end of a member has been reached and 80 | // 'decompressConcatenated' is false. 81 | private boolean stoppedForEndOfMember = false; 82 | 83 | // True once the end of stream has been reached. 84 | private boolean endOfStream = false; 85 | 86 | /** 87 | * Constructs a new input stream that decompresses gzip-compressed data 88 | * from the specified input stream. 89 | *
<p>
90 | * This is equivalent to 91 | * GzipCompressorInputStream(inputStream, false) and thus 92 | * will not decompress concatenated .gz files. 93 | * 94 | * @param inputStream the InputStream from which this object should 95 | * be created of 96 | * 97 | * @throws IOException if the stream could not be created 98 | */ 99 | public GzipCompressorInputStream(InputStream inputStream) 100 | throws IOException { 101 | this(inputStream, false); 102 | } 103 | 104 | /** 105 | * Constructs a new input stream that decompresses gzip-compressed data 106 | * from the specified input stream. 107 | *
<p>
108 | * If decompressConcatenated is {@code false}: 109 | * This decompressor might read more input than it will actually use. 110 | * If inputStream supports mark and 111 | * reset, then the input position will be adjusted 112 | * so that it is right after the last byte of the compressed stream. 113 | * If mark isn't supported, the input position will be 114 | * undefined. 115 | * 116 | * @param inputStream the InputStream from which this object should 117 | * be created of 118 | * @param decompressConcatenated 119 | * if true, decompress until the end of the input; 120 | * if false, stop after the first .gz member 121 | * 122 | * @throws IOException if the stream could not be created 123 | */ 124 | public GzipCompressorInputStream(InputStream inputStream, 125 | boolean decompressConcatenated) 126 | throws IOException { 127 | // Mark support is strictly needed for concatenated files only, 128 | // but it's simpler if it is always available. 129 | if (inputStream.markSupported()) { 130 | in = inputStream; 131 | } else { 132 | in = new BufferedInputStream(inputStream); 133 | } 134 | 135 | this.decompressConcatenated = decompressConcatenated; 136 | init(true); 137 | } 138 | 139 | private boolean init(boolean isFirstMember) throws IOException { 140 | assert isFirstMember || decompressConcatenated; 141 | 142 | // Check the magic bytes without a possibility of EOFException. 143 | int magic0 = in.read(); 144 | int magic1 = in.read(); 145 | 146 | // If end of input was reached after decompressing at least 147 | // one .gz member, we have reached the end of the file successfully. 148 | if (magic0 == -1 && !isFirstMember) { 149 | endOfStream = true; 150 | return false; 151 | } 152 | 153 | if (magic0 != 31 || magic1 != 139) { 154 | throw new IOException(isFirstMember 155 | ? "Input is not in the .gz format" 156 | : "Garbage after a valid .gz stream"); 157 | } 158 | 159 | // Parsing the rest of the header may throw EOFException. 160 | DataInputStream inData = new DataInputStream(in); 161 | int method = inData.readUnsignedByte(); 162 | if (method != 8) { 163 | throw new IOException("Unsupported compression method " 164 | + method + " in the .gz header"); 165 | } 166 | 167 | int flg = inData.readUnsignedByte(); 168 | if ((flg & FRESERVED) != 0) { 169 | throw new IOException( 170 | "Reserved flags are set in the .gz header"); 171 | } 172 | 173 | inData.readInt(); // mtime, ignored 174 | inData.readUnsignedByte(); // extra flags, ignored 175 | inData.readUnsignedByte(); // operating system, ignored 176 | 177 | // Extra field, ignored 178 | if ((flg & FEXTRA) != 0) { 179 | int xlen = inData.readUnsignedByte(); 180 | xlen |= inData.readUnsignedByte() << 8; 181 | 182 | // This isn't as efficient as calling in.skip would be, 183 | // but it's lazier to handle unexpected end of input this way. 184 | // Most files don't have an extra field anyway. 185 | while (xlen-- > 0) { 186 | inData.readUnsignedByte(); 187 | } 188 | } 189 | 190 | // Original file name, ignored 191 | if ((flg & FNAME) != 0) { 192 | readToNull(inData); 193 | } 194 | 195 | // Comment, ignored 196 | if ((flg & FCOMMENT) != 0) { 197 | readToNull(inData); 198 | } 199 | 200 | // Header "CRC16" which is actually a truncated CRC32 (which isn't 201 | // as good as real CRC16). I don't know if any encoder implementation 202 | // sets this, so it's not worth trying to verify it. GNU gzip 1.4 203 | // doesn't support this field, but zlib seems to be able to at least 204 | // skip over it. 
205 | if ((flg & FHCRC) != 0) { 206 | inData.readShort(); 207 | } 208 | 209 | // Reset 210 | inf.reset(); 211 | crc.reset(); 212 | memberSize = 0; 213 | 214 | return true; 215 | } 216 | 217 | private void readToNull(DataInputStream inData) throws IOException { 218 | while (inData.readUnsignedByte() != 0x00) {} 219 | } 220 | 221 | /** {@inheritDoc} */ 222 | @Override 223 | public int read() throws IOException { 224 | byte[] buf = new byte[1]; 225 | return read(buf, 0, 1) == -1 ? -1 : (buf[0] & 0xFF); 226 | } 227 | 228 | /** 229 | * {@inheritDoc} 230 | * 231 | * @since 1.1 232 | */ 233 | @Override 234 | public int read(byte[] b, int off, int len) throws IOException { 235 | 236 | if (stoppedForEndOfMember || endOfStream) { 237 | return -1; 238 | } 239 | 240 | int size = 0; 241 | 242 | while (len > 0) { 243 | if (inf.needsInput()) { 244 | // Remember the current position because we may need to 245 | // rewind after reading too much input. 246 | in.mark(buf.length); 247 | 248 | bufUsed = in.read(buf); 249 | if (bufUsed == -1) { 250 | throw new EOFException(); 251 | } 252 | 253 | inf.setInput(buf, 0, bufUsed); 254 | } 255 | 256 | int ret; 257 | try { 258 | ret = inf.inflate(b, off, len); 259 | } catch (DataFormatException e) { 260 | throw new IOException("Gzip-compressed data is corrupt"); 261 | } 262 | 263 | crc.update(b, off, ret); 264 | memberSize += ret; 265 | off += ret; 266 | len -= ret; 267 | size += ret; 268 | count(ret); 269 | 270 | if (inf.finished()) { 271 | // We may have read too many bytes. Rewind the read 272 | // position to match the actual amount used. 273 | // 274 | // NOTE: The "if" is there just in case. Since we used 275 | // in.mark earler, it should always skip enough. 276 | in.reset(); 277 | 278 | int skipAmount = bufUsed - inf.getRemaining(); 279 | if (in.skip(skipAmount) != skipAmount) { 280 | throw new IOException(); 281 | } 282 | 283 | bufUsed = 0; 284 | 285 | DataInputStream inData = new DataInputStream(in); 286 | 287 | // CRC32 288 | long crcStored = 0; 289 | for (int i = 0; i < 4; ++i) { 290 | crcStored |= (long)inData.readUnsignedByte() << (i * 8); 291 | } 292 | 293 | if (crcStored != crc.getValue()) { 294 | throw new IOException("Gzip-compressed data is corrupt " 295 | + "(CRC32 error)"); 296 | } 297 | 298 | // Uncompressed size modulo 2^32 (ISIZE in the spec) 299 | int isize = 0; 300 | for (int i = 0; i < 4; ++i) { 301 | isize |= inData.readUnsignedByte() << (i * 8); 302 | } 303 | 304 | if (isize != memberSize) { 305 | throw new IOException("Gzip-compressed data is corrupt" 306 | + "(uncompressed size mismatch)"); 307 | } 308 | 309 | 310 | if (!decompressConcatenated) { 311 | stoppedForEndOfMember = true; 312 | } 313 | 314 | // See if this is the end of the file. 315 | endOfStream = !init(false); 316 | 317 | if (stoppedForEndOfMember || endOfStream) { 318 | return size == 0 ? -1 : size; 319 | } 320 | } 321 | } 322 | 323 | return size; 324 | } 325 | 326 | /** 327 | * Checks if the signature matches what is expected for a .gz file. 
328 | * 329 | * @param signature the bytes to check 330 | * @param length the number of bytes to check 331 | * @return true if this is a .gz stream, false otherwise 332 | * 333 | * @since 1.1 334 | */ 335 | public static boolean matches(byte[] signature, int length) { 336 | 337 | if (length < 2) { 338 | return false; 339 | } 340 | 341 | if (signature[0] != 31) { 342 | return false; 343 | } 344 | 345 | if (signature[1] != -117) { 346 | return false; 347 | } 348 | 349 | return true; 350 | } 351 | 352 | /** 353 | * Closes the input stream (unless it is System.in). 354 | * 355 | * @since 1.2 356 | */ 357 | @Override 358 | public void close() throws IOException { 359 | if (inf != null) { 360 | inf.end(); 361 | inf = null; 362 | } 363 | 364 | if (this.in != System.in) { 365 | this.in.close(); 366 | } 367 | } 368 | 369 | /** 370 | * Explicitly instructs the stream to allow an additional concatenated 371 | * member to be read. 372 | * 373 | * @since 1.x.x 374 | */ 375 | public boolean nextMember() { 376 | 377 | if (endOfStream) 378 | return false; 379 | 380 | stoppedForEndOfMember = false; 381 | 382 | return true; 383 | } 384 | } 385 | -------------------------------------------------------------------------------- /src/java/org/commoncrawl/examples/ExampleArcMicroformat.java: -------------------------------------------------------------------------------- 1 | package org.commoncrawl.examples; 2 | 3 | // Java classes 4 | import java.lang.IllegalArgumentException; 5 | import java.lang.Integer; 6 | import java.lang.Math; 7 | import java.lang.OutOfMemoryError; 8 | import java.io.BufferedReader; 9 | import java.io.ByteArrayInputStream; 10 | import java.io.DataOutputStream; 11 | import java.io.File; 12 | import java.io.FileReader; 13 | import java.io.IOException; 14 | import java.net.URI; 15 | import java.util.Arrays; 16 | 17 | // log4j classes 18 | import org.apache.log4j.Logger; 19 | 20 | // Hadoop classes 21 | import org.apache.hadoop.conf.Configured; 22 | import org.apache.hadoop.conf.Configuration; 23 | import org.apache.hadoop.fs.FSDataOutputStream; 24 | import org.apache.hadoop.fs.FileStatus; 25 | import org.apache.hadoop.fs.FileSystem; 26 | import org.apache.hadoop.fs.Path; 27 | import org.apache.hadoop.fs.PathFilter; 28 | import org.apache.hadoop.io.LongWritable; 29 | import org.apache.hadoop.io.Text; 30 | import org.apache.hadoop.mapred.FileInputFormat; 31 | import org.apache.hadoop.mapred.FileOutputFormat; 32 | import org.apache.hadoop.mapred.InputSplit; 33 | import org.apache.hadoop.mapred.JobClient; 34 | import org.apache.hadoop.mapred.JobConf; 35 | import org.apache.hadoop.mapred.Mapper; 36 | import org.apache.hadoop.mapred.MapReduceBase; 37 | import org.apache.hadoop.mapred.OutputCollector; 38 | import org.apache.hadoop.mapred.Reporter; 39 | import org.apache.hadoop.mapred.TextOutputFormat; 40 | import org.apache.hadoop.mapred.lib.LongSumReducer; 41 | import org.apache.hadoop.util.Progressable; 42 | import org.apache.hadoop.util.Tool; 43 | import org.apache.hadoop.util.ToolRunner; 44 | 45 | // Common Crawl classes 46 | import org.commoncrawl.hadoop.mapred.ArcInputFormat; 47 | import org.commoncrawl.hadoop.mapred.ArcRecord; 48 | 49 | // jsoup classes 50 | import org.jsoup.Jsoup; 51 | import org.jsoup.nodes.Document; 52 | import org.jsoup.nodes.Element; 53 | import org.jsoup.select.Elements; 54 | 55 | /** 56 | * An example showing how to analyze the Common Crawl ARC web content files. 
57 | * 58 | * @author Chris Stephens 59 | */ 60 | public class ExampleArcMicroformat 61 | extends Configured 62 | implements Tool { 63 | 64 | private static final Logger LOG = Logger.getLogger(ExampleArcMicroformat.class); 65 | 66 | /** 67 | * Maps incoming web documents to a list of Microformat 'itemtype' tags. 68 | * Filters out any non-HTML pages. 69 | * 70 | * @author Chris Stephens 71 | * 72 | * Inspired by: 73 | * 74 | * @author Manu Sporny 75 | * @author Steve Salevan 76 | */ 77 | public static class ExampleArcMicroformatMapper 78 | extends MapReduceBase 79 | implements Mapper { 80 | 81 | // create a counter group for Mapper-specific statistics 82 | private final String _counterGroup = "Custom Mapper Counters"; 83 | 84 | public void map(Text key, ArcRecord value, OutputCollector output, Reporter reporter) 85 | throws IOException { 86 | 87 | try { 88 | 89 | if (!value.getContentType().contains("html")) { 90 | reporter.incrCounter(this._counterGroup, "Skipped - Not HTML", 1); 91 | return; 92 | } 93 | 94 | // just curious how many of each content type we've seen 95 | reporter.incrCounter(this._counterGroup, "Content Type - "+value.getContentType(), 1); 96 | 97 | // ensure sample instances have enough memory to parse HTML 98 | if (value.getContentLength() > (5 * 1024 * 1024)) { 99 | reporter.incrCounter(this._counterGroup, "Skipped - HTML Too Long", 1); 100 | return; 101 | } 102 | 103 | // Count all 'itemtype' attributes referencing 'schema.org' 104 | Document doc = value.getParsedHTML(); 105 | 106 | if (doc == null) { 107 | reporter.incrCounter(this._counterGroup, "Skipped - Unable to Parse HTML", 1); 108 | return; 109 | } 110 | 111 | Elements mf = doc.select("[itemtype~=schema.org]"); 112 | 113 | if (mf.size() > 0) { 114 | for (Element e : mf) { 115 | if (e.hasAttr("itemtype")) { 116 | output.collect(new Text(e.attr("itemtype").toLowerCase().trim()), new LongWritable(1)); 117 | } 118 | } 119 | } 120 | } 121 | catch (Throwable e) { 122 | 123 | // occassionally Jsoup parser runs out of memory ... 124 | if (e.getClass().equals(OutOfMemoryError.class)) 125 | System.gc(); 126 | 127 | LOG.error("Caught Exception", e); 128 | reporter.incrCounter(this._counterGroup, "Skipped - Exception Thrown", 1); 129 | } 130 | } 131 | } 132 | 133 | /** 134 | * Hadoop FileSystem PathFilter for ARC files, allowing users to limit the 135 | * number of files processed. 136 | * 137 | * @author Chris Stephens 138 | */ 139 | public static class SampleFilter 140 | implements PathFilter { 141 | 142 | private static int count = 0; 143 | private static int max = 999999999; 144 | 145 | public boolean accept(Path path) { 146 | 147 | if (!path.getName().endsWith(".arc.gz")) 148 | return false; 149 | 150 | SampleFilter.count++; 151 | 152 | if (SampleFilter.count > SampleFilter.max) 153 | return false; 154 | 155 | return true; 156 | } 157 | } 158 | 159 | /** 160 | * Implmentation of Tool.run() method, which builds and runs the Hadoop job. 161 | * 162 | * @param args command line parameters, less common Hadoop job parameters stripped 163 | * out and interpreted by the Tool class. 164 | * @return 0 if the Hadoop job completes successfully, 1 if not. 165 | */ 166 | @Override 167 | public int run(String[] args) 168 | throws Exception { 169 | 170 | String outputPath = null; 171 | String configFile = null; 172 | 173 | // Read the command line arguments. 
174 | if (args.length < 1) 175 | throw new IllegalArgumentException("Example JAR must be passed an output path."); 176 | 177 | outputPath = args[0]; 178 | 179 | if (args.length >= 2) 180 | configFile = args[1]; 181 | 182 | // For this example, only look at a single ARC files. 183 | String inputPath = "s3n://aws-publicdatasets/common-crawl/parse-output/segment/1341690163490/1341782443295_1551.arc.gz"; 184 | 185 | // Switch to this if you'd like to look at all ARC files. May take many minutes just to read the file listing. 186 | //String inputPath = "s3n://aws-publicdatasets/common-crawl/parse-output/segment/*/*.arc.gz"; 187 | 188 | // Read in any additional config parameters. 189 | if (configFile != null) { 190 | LOG.info("adding config parameters from '"+ configFile + "'"); 191 | this.getConf().addResource(configFile); 192 | } 193 | 194 | // Creates a new job configuration for this Hadoop job. 195 | JobConf job = new JobConf(this.getConf()); 196 | 197 | job.setJarByClass(ExampleArcMicroformat.class); 198 | 199 | // Scan the provided input path for ARC files. 200 | LOG.info("setting input path to '"+ inputPath + "'"); 201 | FileInputFormat.addInputPath(job, new Path(inputPath)); 202 | FileInputFormat.setInputPathFilter(job, SampleFilter.class); 203 | 204 | // Delete the output path directory if it already exists. 205 | LOG.info("clearing the output path at '" + outputPath + "'"); 206 | 207 | FileSystem fs = FileSystem.get(new URI(outputPath), job); 208 | 209 | if (fs.exists(new Path(outputPath))) 210 | fs.delete(new Path(outputPath), true); 211 | 212 | // Set the path where final output 'part' files will be saved. 213 | LOG.info("setting output path to '" + outputPath + "'"); 214 | FileOutputFormat.setOutputPath(job, new Path(outputPath)); 215 | FileOutputFormat.setCompressOutput(job, false); 216 | 217 | // Set which InputFormat class to use. 218 | job.setInputFormat(ArcInputFormat.class); 219 | 220 | // Set which OutputFormat class to use. 221 | job.setOutputFormat(TextOutputFormat.class); 222 | 223 | // Set the output data types. 224 | job.setOutputKeyClass(Text.class); 225 | job.setOutputValueClass(LongWritable.class); 226 | 227 | // Set which Mapper and Reducer classes to use. 228 | job.setMapperClass(ExampleArcMicroformat.ExampleArcMicroformatMapper.class); 229 | job.setReducerClass(LongSumReducer.class); 230 | 231 | if (JobClient.runJob(job).isSuccessful()) 232 | return 0; 233 | else 234 | return 1; 235 | } 236 | 237 | /** 238 | * Main entry point that uses the {@link ToolRunner} class to run the example 239 | * Hadoop job. 
240 | */ 241 | public static void main(String[] args) 242 | throws Exception { 243 | int res = ToolRunner.run(new Configuration(), new ExampleArcMicroformat(), args); 244 | System.exit(res); 245 | } 246 | } 247 | -------------------------------------------------------------------------------- /src/java/org/commoncrawl/examples/ExampleMetadataDomainPageCount.java: -------------------------------------------------------------------------------- 1 | package org.commoncrawl.examples; 2 | 3 | // Java classes 4 | import java.io.BufferedReader; 5 | import java.io.DataOutputStream; 6 | import java.io.File; 7 | import java.io.FileReader; 8 | import java.io.IOException; 9 | import java.net.URI; 10 | 11 | // Apache Project classes 12 | import org.apache.log4j.Logger; 13 | 14 | // Hadoop classes 15 | import org.apache.hadoop.conf.Configured; 16 | import org.apache.hadoop.conf.Configuration; 17 | import org.apache.hadoop.fs.FSDataOutputStream; 18 | import org.apache.hadoop.fs.FileSystem; 19 | import org.apache.hadoop.fs.Path; 20 | import org.apache.hadoop.fs.PathFilter; 21 | import org.apache.hadoop.io.LongWritable; 22 | import org.apache.hadoop.io.Text; 23 | import org.apache.hadoop.mapred.FileInputFormat; 24 | import org.apache.hadoop.mapred.FileOutputFormat; 25 | import org.apache.hadoop.mapred.InputSplit; 26 | import org.apache.hadoop.mapred.JobClient; 27 | import org.apache.hadoop.mapred.JobConf; 28 | import org.apache.hadoop.mapred.Mapper; 29 | import org.apache.hadoop.mapred.MapReduceBase; 30 | import org.apache.hadoop.mapred.OutputCollector; 31 | import org.apache.hadoop.mapred.Reporter; 32 | import org.apache.hadoop.mapred.SequenceFileInputFormat; 33 | import org.apache.hadoop.mapred.TextOutputFormat; 34 | import org.apache.hadoop.mapred.lib.LongSumReducer; 35 | import org.apache.hadoop.util.Progressable; 36 | import org.apache.hadoop.util.Tool; 37 | import org.apache.hadoop.util.ToolRunner; 38 | 39 | // Google Gson classes 40 | import com.google.gson.Gson; 41 | import com.google.gson.GsonBuilder; 42 | import com.google.gson.JsonElement; 43 | import com.google.gson.JsonObject; 44 | import com.google.gson.JsonParser; 45 | 46 | // Google Guava classes 47 | import com.google.common.net.InternetDomainName; 48 | 49 | /** 50 | * An example showing how to use the Common Crawl 'metadata' files to quickly 51 | * gather high level information about the corpus' content. 52 | * 53 | * @author Chris Stephens 54 | */ 55 | public class ExampleMetadataDomainPageCount 56 | extends Configured 57 | implements Tool { 58 | 59 | private static final Logger LOG = Logger.getLogger(ExampleMetadataDomainPageCount.class); 60 | 61 | /** 62 | * Mapping class that produces the normalized domain name and a count of '1' 63 | * for every successfully retrieved URL in the Common Crawl corpus. 64 | */ 65 | public static class ExampleMetadataDomainPageCountMapper 66 | extends MapReduceBase 67 | implements Mapper { 68 | 69 | // create a counter group for Mapper-specific statistics 70 | private final String _counterGroup = "Custom Mapper Counters"; 71 | 72 | // implement the main "map" function 73 | public void map(Text key, Text value, OutputCollector output, Reporter reporter) 74 | throws IOException { 75 | 76 | // key & value are "Text" right now ... 
77 | String url = key.toString(); 78 | String json = value.toString(); 79 | 80 | try { 81 | 82 | // Get the base domain name 83 | URI uri = new URI(url); 84 | String host = uri.getHost(); 85 | 86 | if (host == null) { 87 | reporter.incrCounter(this._counterGroup, "Invalid URI", 1); 88 | return; 89 | } 90 | 91 | InternetDomainName domainObj = InternetDomainName.from(host); 92 | 93 | String domain = domainObj.topPrivateDomain().name(); 94 | 95 | if (domain == null) { 96 | reporter.incrCounter(this._counterGroup, "Invalid Domain", 1); 97 | return; 98 | } 99 | 100 | // See if the page has a successful HTTP code 101 | JsonParser jsonParser = new JsonParser(); 102 | JsonObject jsonObj = jsonParser.parse(json).getAsJsonObject(); 103 | 104 | int httpCode; 105 | 106 | if (jsonObj.has("http_result") == false) { 107 | reporter.incrCounter(this._counterGroup, "HTTP Code Missing", 1); 108 | return; 109 | } 110 | 111 | if (jsonObj.get("http_result").getAsInt() == 200) { 112 | reporter.incrCounter(this._counterGroup, "HTTP Success", 1); 113 | 114 | // only output counts for pages that were successfully retrieved 115 | output.collect(new Text(domain), new LongWritable(1)); 116 | } 117 | else { 118 | reporter.incrCounter(this._counterGroup, "HTTP Not Success", 1); 119 | } 120 | } 121 | catch (IOException ex) { 122 | throw ex; 123 | } 124 | catch (Exception ex) { 125 | LOG.error("Caught Exception", ex); 126 | reporter.incrCounter(this._counterGroup, "Exceptions", 1); 127 | } 128 | } 129 | } 130 | 131 | 132 | /** 133 | * Hadoop FileSystem PathFilter for ARC files, allowing users to limit the 134 | * number of files processed. 135 | * 136 | * @author Chris Stephens 137 | */ 138 | public static class SampleFilter 139 | implements PathFilter { 140 | 141 | private static int count = 0; 142 | private static int max = 999999999; 143 | 144 | public boolean accept(Path path) { 145 | 146 | if (!path.getName().startsWith("metadata-")) 147 | return false; 148 | 149 | SampleFilter.count++; 150 | 151 | if (SampleFilter.count > SampleFilter.max) 152 | return false; 153 | 154 | return true; 155 | } 156 | } 157 | 158 | /** 159 | * Implmentation of Tool.run() method, which builds and runs the Hadoop job. 160 | * 161 | * @param args command line parameters, less common Hadoop job parameters stripped 162 | * out and interpreted by the Tool class. 163 | * @return 0 if the Hadoop job completes successfully, 1 if not. 164 | */ 165 | @Override 166 | public int run(String[] args) 167 | throws Exception { 168 | 169 | String outputPath = null; 170 | String configFile = null; 171 | 172 | // Read the command line arguments. 173 | if (args.length < 1) 174 | throw new IllegalArgumentException("Example JAR must be passed an output path."); 175 | 176 | outputPath = args[0]; 177 | 178 | if (args.length >= 2) 179 | configFile = args[1]; 180 | 181 | // For this example, only look at a single metadata file. 182 | String inputPath = "s3n://aws-publicdatasets/common-crawl/parse-output/segment/1341690166822/metadata-01849"; 183 | 184 | // Switch to this if you'd like to look at all metadata files. May take many minutes just to read the file listing. 185 | // String inputPath = "s3n://aws-publicdatasets/common-crawl/parse-output/segment/*/metadata-*"; 186 | 187 | // Read in any additional config parameters. 188 | if (configFile != null) { 189 | LOG.info("adding config parameters from '"+ configFile + "'"); 190 | this.getConf().addResource(configFile); 191 | } 192 | 193 | // Creates a new job configuration for this Hadoop job. 
194 | JobConf job = new JobConf(this.getConf()); 195 | 196 | job.setJarByClass(ExampleMetadataDomainPageCount.class); 197 | 198 | // Scan the provided input path for ARC files. 199 | LOG.info("setting input path to '"+ inputPath + "'"); 200 | FileInputFormat.addInputPath(job, new Path(inputPath)); 201 | 202 | // Optionally, you can add in a custom input path filter 203 | // FileInputFormat.setInputPathFilter(job, SampleFilter.class); 204 | 205 | // Delete the output path directory if it already exists. 206 | LOG.info("clearing the output path at '" + outputPath + "'"); 207 | 208 | FileSystem fs = FileSystem.get(new URI(outputPath), job); 209 | 210 | if (fs.exists(new Path(outputPath))) 211 | fs.delete(new Path(outputPath), true); 212 | 213 | // Set the path where final output 'part' files will be saved. 214 | LOG.info("setting output path to '" + outputPath + "'"); 215 | FileOutputFormat.setOutputPath(job, new Path(outputPath)); 216 | FileOutputFormat.setCompressOutput(job, false); 217 | 218 | // Set which InputFormat class to use. 219 | job.setInputFormat(SequenceFileInputFormat.class); 220 | 221 | // Set which OutputFormat class to use. 222 | job.setOutputFormat(TextOutputFormat.class); 223 | 224 | // Set the output data types. 225 | job.setOutputKeyClass(Text.class); 226 | job.setOutputValueClass(LongWritable.class); 227 | 228 | // Set which Mapper and Reducer classes to use. 229 | job.setMapperClass(ExampleMetadataDomainPageCount.ExampleMetadataDomainPageCountMapper.class); 230 | job.setReducerClass(LongSumReducer.class); 231 | 232 | if (JobClient.runJob(job).isSuccessful()) 233 | return 0; 234 | else 235 | return 1; 236 | } 237 | 238 | /** 239 | * Main entry point that uses the {@link ToolRunner} class to run the example 240 | * Hadoop job. 241 | */ 242 | public static void main(String[] args) 243 | throws Exception { 244 | int res = ToolRunner.run(new Configuration(), new ExampleMetadataDomainPageCount(), args); 245 | System.exit(res); 246 | } 247 | } 248 | 249 | -------------------------------------------------------------------------------- /src/java/org/commoncrawl/examples/ExampleMetadataStats.java: -------------------------------------------------------------------------------- 1 | package org.commoncrawl.examples; 2 | 3 | // Java classes 4 | import java.io.BufferedReader; 5 | import java.io.DataOutputStream; 6 | import java.io.File; 7 | import java.io.FileReader; 8 | import java.io.IOException; 9 | import java.net.URI; 10 | import java.net.URISyntaxException; 11 | 12 | // Apache Project classes 13 | import org.apache.log4j.Logger; 14 | 15 | // Hadoop classes 16 | import org.apache.hadoop.conf.Configured; 17 | import org.apache.hadoop.conf.Configuration; 18 | import org.apache.hadoop.fs.FSDataOutputStream; 19 | import org.apache.hadoop.fs.FileStatus; 20 | import org.apache.hadoop.fs.FileSystem; 21 | import org.apache.hadoop.fs.Path; 22 | import org.apache.hadoop.fs.PathFilter; 23 | import org.apache.hadoop.io.LongWritable; 24 | import org.apache.hadoop.io.Text; 25 | import org.apache.hadoop.mapred.FileInputFormat; 26 | import org.apache.hadoop.mapred.FileOutputFormat; 27 | import org.apache.hadoop.mapred.InputSplit; 28 | import org.apache.hadoop.mapred.JobClient; 29 | import org.apache.hadoop.mapred.JobConf; 30 | import org.apache.hadoop.mapred.Mapper; 31 | import org.apache.hadoop.mapred.MapReduceBase; 32 | import org.apache.hadoop.mapred.OutputCollector; 33 | import org.apache.hadoop.mapred.Reporter; 34 | import org.apache.hadoop.mapred.SequenceFileInputFormat; 35 | import 
org.apache.hadoop.mapred.TextOutputFormat; 36 | import org.apache.hadoop.mapred.lib.LongSumReducer; 37 | import org.apache.hadoop.util.Progressable; 38 | import org.apache.hadoop.util.Tool; 39 | import org.apache.hadoop.util.ToolRunner; 40 | 41 | // Google Gson classes 42 | import com.google.gson.Gson; 43 | import com.google.gson.GsonBuilder; 44 | import com.google.gson.JsonElement; 45 | import com.google.gson.JsonObject; 46 | import com.google.gson.JsonParser; 47 | 48 | // Google Guava classes 49 | import com.google.common.net.InternetDomainName; 50 | 51 | /** 52 | * An example showing how to use the Common Crawl 'metadata' files to quickly 53 | * gather high level information about the corpus' content. 54 | * 55 | * @author Chris Stephens 56 | */ 57 | public class ExampleMetadataStats 58 | extends Configured 59 | implements Tool { 60 | 61 | private static final Logger LOG = Logger.getLogger(ExampleMetadataStats.class); 62 | 63 | /** 64 | * Mapping class that produces statistics about the Common Crawl corpus. 65 | */ 66 | public static class ExampleMetadataStatsMapper 67 | extends MapReduceBase 68 | implements Mapper { 69 | 70 | // create a counter group for Mapper-specific statistics 71 | private final String _counterGroup = "Custom Mapper Counters"; 72 | 73 | // implement the main "map" function 74 | public void map(Text key, Text value, OutputCollector output, Reporter reporter) 75 | throws IOException { 76 | 77 | // key & value are "Text" right now ... 78 | String url = key.toString(); 79 | String json = value.toString(); 80 | 81 | try { 82 | 83 | // See if the page has a successful HTTP code 84 | JsonParser jsonParser = new JsonParser(); 85 | JsonObject jsonObj = jsonParser.parse(json).getAsJsonObject(); 86 | 87 | boolean isSuccessful = false; 88 | 89 | String disposition = "[no status]"; 90 | 91 | if (jsonObj.has("disposition")) 92 | { 93 | disposition = jsonObj.get("disposition").getAsString().trim().toUpperCase(); 94 | 95 | if (disposition.equals("SUCCESS")) 96 | isSuccessful = true; 97 | } 98 | 99 | // Output a basic page count 100 | output.collect(new Text("Pages Requested\tTotal"), new LongWritable(1)); 101 | 102 | output.collect(new Text("Pages Requested\t"+disposition), new LongWritable(1)); 103 | 104 | // Output the HTTP result 105 | String httpResult = "[missing]"; 106 | 107 | if (jsonObj.has("http_result")) 108 | httpResult = jsonObj.get("http_result").getAsString().trim().toUpperCase(); 109 | 110 | output.collect(new Text("HTTP Code\t"+httpResult+" ("+disposition+")"), new LongWritable(1)); 111 | 112 | // If the request was not successful, move to the next record 113 | if (isSuccessful == false) 114 | return; 115 | 116 | // Gather the host name 117 | try { 118 | 119 | URI uri = new URI(url); 120 | String host = uri.getHost(); 121 | 122 | if (host == null || host.equals("")) 123 | throw new URISyntaxException(url, "Unable to gather host or no host found"); 124 | 125 | // Gather the domain object 126 | InternetDomainName domainObj = InternetDomainName.from(host); 127 | 128 | // Output the TLD 129 | String publicSuffix = "[none]"; 130 | 131 | if (domainObj.hasPublicSuffix()) 132 | publicSuffix = domainObj.publicSuffix().name().trim().toLowerCase(); 133 | 134 | output.collect(new Text("TLD\t"+publicSuffix), new LongWritable(1)); 135 | 136 | // Output the private domain 137 | // WARNING - This dramatically increases the size of the output. 
138 | String privateDomain = "[invalid]"; 139 | 140 | if (domainObj.topPrivateDomain() != null) 141 | privateDomain = domainObj.topPrivateDomain().name().trim().toLowerCase(); 142 | 143 | //output.collect(new Text("Domain\t"+privateDomain), new LongWritable(1)); 144 | } 145 | catch (URISyntaxException ex) { 146 | output.collect(new Text("TLD\t[invalid URL]"), new LongWritable(1)); 147 | reporter.incrCounter(this._counterGroup, "Invalid URLs", 1); 148 | } 149 | 150 | // Output MIME Type 151 | String mimeType = "[missing]"; 152 | 153 | if (jsonObj.has("mime_type")) 154 | mimeType = jsonObj.get("mime_type").getAsString().trim().toLowerCase(); 155 | 156 | output.collect(new Text("Type\t"+mimeType), new LongWritable(1)); 157 | 158 | // Output Charset 159 | String charset = "[missing]"; 160 | 161 | if (jsonObj.has("charset_detected")) 162 | charset = jsonObj.get("charset_detected").getAsString().trim().toUpperCase(); 163 | 164 | output.collect(new Text("Charset\t"+charset), new LongWritable(1)); 165 | 166 | // Download Size 167 | if (jsonObj.has("download_size") == true) 168 | output.collect(new Text("Content Size\t"), new LongWritable(jsonObj.get("download_size").getAsInt())); 169 | } 170 | catch (IOException ex) { 171 | throw ex; 172 | } 173 | catch (Exception ex) { 174 | LOG.error("Caught Exception", ex); 175 | reporter.incrCounter(this._counterGroup, "Exceptions", 1); 176 | } 177 | } 178 | } 179 | 180 | /** 181 | * Implmentation of Tool.run() method, which builds and runs the Hadoop job. 182 | * 183 | * @param args command line parameters, less common Hadoop job parameters stripped 184 | * out and interpreted by the Tool class. 185 | * @return 0 if the Hadoop job completes successfully, 1 if not. 186 | */ 187 | @Override 188 | public int run(String[] args) 189 | throws Exception { 190 | 191 | String baseInputPath = null; 192 | String outputPath = null; 193 | 194 | // Read the command line arguments. 195 | if (args.length < 1) 196 | throw new IllegalArgumentException("'run()' must be passed an output path."); 197 | 198 | outputPath = args[0]; 199 | 200 | // Creates a new job configuration for this Hadoop job. 201 | JobConf job = new JobConf(this.getConf()); 202 | 203 | job.setJarByClass(ExampleMetadataStats.class); 204 | 205 | baseInputPath = "s3n://aws-publicdatasets/common-crawl/parse-output/segment"; 206 | 207 | FileSystem fs = null; 208 | 209 | // If you would like to process all segments, comment this out and 210 | // uncomment the block of code below 211 | String inputPath = baseInputPath + "/1341690154994/metadata-00062"; 212 | 213 | LOG.info("adding input path '" + inputPath + "'"); 214 | FileInputFormat.addInputPath(job, new Path(inputPath)); 215 | /* 216 | fs = FileSystem.get(new URI("s3n://aws-publicdatasets"), job); 217 | 218 | for (FileStatus fileStatus : fs.globStatus(new Path("/common-crawl/parse-output/valid_segments/[0-9]*"))) { 219 | String[] parts = fileStatus.getPath().toString().split("/"); 220 | String inputPath = baseInputPath + "/" + parts[parts.length-1] + "/metadata-*"; 221 | LOG.info("adding input path '" + inputPath + "'"); 222 | FileInputFormat.addInputPath(job, new Path(inputPath)); 223 | } 224 | */ 225 | 226 | // Delete the output path directory if it already exists. 227 | LOG.info("clearing the output path at '" + outputPath + "'"); 228 | 229 | fs = FileSystem.get(new URI(outputPath), job); 230 | 231 | if (fs.exists(new Path(outputPath))) 232 | fs.delete(new Path(outputPath), true); 233 | 234 | // Set the path where final output 'part' files will be saved. 
235 | LOG.info("setting output path to '" + outputPath + "'"); 236 | FileOutputFormat.setOutputPath(job, new Path(outputPath)); 237 | FileOutputFormat.setCompressOutput(job, false); 238 | 239 | // Set which InputFormat class to use. 240 | job.setInputFormat(SequenceFileInputFormat.class); 241 | 242 | // Set which OutputFormat class to use. 243 | job.setOutputFormat(TextOutputFormat.class); 244 | 245 | // Set the output data types. 246 | job.setOutputKeyClass(Text.class); 247 | job.setOutputValueClass(LongWritable.class); 248 | 249 | // Set which Mapper and Reducer classes to use. 250 | job.setMapperClass(ExampleMetadataStats.ExampleMetadataStatsMapper.class); 251 | job.setCombinerClass(LongSumReducer.class); 252 | job.setReducerClass(LongSumReducer.class); 253 | 254 | if (JobClient.runJob(job).isSuccessful()) 255 | return 0; 256 | else 257 | return 1; 258 | } 259 | 260 | /** 261 | * Main entry point that uses the {@link ToolRunner} class to run the example 262 | * Hadoop job. 263 | */ 264 | public static void main(String[] args) 265 | throws Exception { 266 | 267 | int res = ToolRunner.run(new Configuration(), new ExampleMetadataStats(), args); 268 | System.exit(res); 269 | } 270 | } 271 | 272 | -------------------------------------------------------------------------------- /src/java/org/commoncrawl/examples/ExampleTextWordCount.java: -------------------------------------------------------------------------------- 1 | package org.commoncrawl.examples; 2 | 3 | // Java classes 4 | import java.lang.Math; 5 | import java.io.BufferedReader; 6 | import java.io.DataOutputStream; 7 | import java.io.File; 8 | import java.io.FileReader; 9 | import java.io.IOException; 10 | import java.net.URI; 11 | 12 | // Apache Project classes 13 | import org.apache.log4j.Logger; 14 | 15 | // Hadoop classes 16 | import org.apache.hadoop.conf.Configured; 17 | import org.apache.hadoop.conf.Configuration; 18 | import org.apache.hadoop.fs.FSDataOutputStream; 19 | import org.apache.hadoop.fs.FileSystem; 20 | import org.apache.hadoop.fs.Path; 21 | import org.apache.hadoop.fs.PathFilter; 22 | import org.apache.hadoop.io.LongWritable; 23 | import org.apache.hadoop.io.Text; 24 | import org.apache.hadoop.mapred.FileInputFormat; 25 | import org.apache.hadoop.mapred.FileOutputFormat; 26 | import org.apache.hadoop.mapred.InputSplit; 27 | import org.apache.hadoop.mapred.JobClient; 28 | import org.apache.hadoop.mapred.JobConf; 29 | import org.apache.hadoop.mapred.Mapper; 30 | import org.apache.hadoop.mapred.MapReduceBase; 31 | import org.apache.hadoop.mapred.OutputCollector; 32 | import org.apache.hadoop.mapred.Reporter; 33 | import org.apache.hadoop.mapred.SequenceFileInputFormat; 34 | import org.apache.hadoop.mapred.TextOutputFormat; 35 | import org.apache.hadoop.mapred.lib.LongSumReducer; 36 | import org.apache.hadoop.util.Progressable; 37 | import org.apache.hadoop.util.Tool; 38 | import org.apache.hadoop.util.ToolRunner; 39 | 40 | /** 41 | * An example showing how to use the Common Crawl 'textData' files to efficiently 42 | * work with Common Crawl corpus text content. 43 | * 44 | * @author Chris Stephens 45 | */ 46 | public class ExampleTextWordCount extends Configured implements Tool { 47 | 48 | private static final Logger LOG = Logger.getLogger(ExampleTextWordCount.class); 49 | 50 | /** 51 | * Perform a simple word count mapping on text data from the Common Crawl corpus. 
52 | */ 53 | public static class ExampleTextWordCountMapper 54 | extends MapReduceBase 55 | implements Mapper { 56 | 57 | // create a counter group for Mapper-specific statistics 58 | private final String _counterGroup = "Custom Mapper Counters"; 59 | 60 | public void map(Text key, Text value, OutputCollector output, Reporter reporter) 61 | throws IOException { 62 | 63 | reporter.incrCounter(this._counterGroup, "Records In", 1); 64 | 65 | try { 66 | 67 | // Get the text content as a string. 68 | String pageText = value.toString(); 69 | 70 | // Removes all punctuation. 71 | pageText = pageText.replaceAll("[^a-zA-Z0-9 ]", ""); 72 | 73 | // Normalizes whitespace to single spaces. 74 | pageText = pageText.replaceAll("\\s+", " "); 75 | 76 | if (pageText == null || pageText == "") { 77 | reporter.incrCounter(this._counterGroup, "Skipped - Empty Page Text", 1); 78 | } 79 | 80 | // Splits by space and outputs to OutputCollector. 81 | for (String word : pageText.split(" ")) { 82 | output.collect(new Text(word.toLowerCase()), new LongWritable(1)); 83 | } 84 | } 85 | catch (Exception ex) { 86 | LOG.error("Caught Exception", ex); 87 | reporter.incrCounter(this._counterGroup, "Exceptions", 1); 88 | } 89 | } 90 | } 91 | 92 | /** 93 | * Hadoop FileSystem PathFilter for ARC files, allowing users to limit the 94 | * number of files processed. 95 | * 96 | * @author Chris Stephens 97 | */ 98 | public static class SampleFilter 99 | implements PathFilter { 100 | 101 | private static int count = 0; 102 | private static int max = 999999999; 103 | 104 | public boolean accept(Path path) { 105 | 106 | if (!path.getName().startsWith("textData-")) 107 | return false; 108 | 109 | SampleFilter.count++; 110 | 111 | if (SampleFilter.count > SampleFilter.max) 112 | return false; 113 | 114 | return true; 115 | } 116 | } 117 | 118 | /** 119 | * Implmentation of Tool.run() method, which builds and runs the Hadoop job. 120 | * 121 | * @param args command line parameters, less common Hadoop job parameters stripped 122 | * out and interpreted by the Tool class. 123 | * @return 0 if the Hadoop job completes successfully, 1 if not. 124 | */ 125 | @Override 126 | public int run(String[] args) 127 | throws Exception { 128 | 129 | String outputPath = null; 130 | String configFile = null; 131 | 132 | // Read the command line arguments. 133 | if (args.length < 1) 134 | throw new IllegalArgumentException("Example JAR must be passed an output path."); 135 | 136 | outputPath = args[0]; 137 | 138 | if (args.length >= 2) 139 | configFile = args[1]; 140 | 141 | // For this example, only look at a single text file. 142 | String inputPath = "s3n://aws-publicdatasets/common-crawl/parse-output/segment/1341690166822/textData-01666"; 143 | 144 | // Switch to this if you'd like to look at all text files. May take many minutes just to read the file listing. 145 | //String inputPath = "s3n://aws-publicdatasets/common-crawl/parse-output/segment/*/textData-*"; 146 | 147 | // Read in any additional config parameters. 148 | if (configFile != null) { 149 | LOG.info("adding config parameters from '"+ configFile + "'"); 150 | this.getConf().addResource(configFile); 151 | } 152 | 153 | // Creates a new job configuration for this Hadoop job. 154 | JobConf job = new JobConf(this.getConf()); 155 | 156 | job.setJarByClass(ExampleTextWordCount.class); 157 | 158 | // Scan the provided input path for ARC files. 
159 | LOG.info("setting input path to '"+ inputPath + "'"); 160 | FileInputFormat.addInputPath(job, new Path(inputPath)); 161 | FileInputFormat.setInputPathFilter(job, SampleFilter.class); 162 | 163 | // Delete the output path directory if it already exists. 164 | LOG.info("clearing the output path at '" + outputPath + "'"); 165 | 166 | FileSystem fs = FileSystem.get(new URI(outputPath), job); 167 | 168 | if (fs.exists(new Path(outputPath))) 169 | fs.delete(new Path(outputPath), true); 170 | 171 | // Set the path where final output 'part' files will be saved. 172 | LOG.info("setting output path to '" + outputPath + "'"); 173 | FileOutputFormat.setOutputPath(job, new Path(outputPath)); 174 | FileOutputFormat.setCompressOutput(job, false); 175 | 176 | // Set which InputFormat class to use. 177 | job.setInputFormat(SequenceFileInputFormat.class); 178 | 179 | // Set which OutputFormat class to use. 180 | job.setOutputFormat(TextOutputFormat.class); 181 | 182 | // Set the output data types. 183 | job.setOutputKeyClass(Text.class); 184 | job.setOutputValueClass(LongWritable.class); 185 | 186 | // Set which Mapper and Reducer classes to use. 187 | job.setMapperClass(ExampleTextWordCount.ExampleTextWordCountMapper.class); 188 | job.setReducerClass(LongSumReducer.class); 189 | 190 | if (JobClient.runJob(job).isSuccessful()) 191 | return 0; 192 | else 193 | return 1; 194 | } 195 | 196 | /** 197 | * Main entry point that uses the {@link ToolRunner} class to run the example 198 | * Hadoop job. 199 | */ 200 | public static void main(String[] args) 201 | throws Exception { 202 | int res = ToolRunner.run(new Configuration(), new ExampleTextWordCount(), args); 203 | System.exit(res); 204 | } 205 | } 206 | 207 | -------------------------------------------------------------------------------- /src/java/org/commoncrawl/hadoop/mapred/ArcInputFormat.java: -------------------------------------------------------------------------------- 1 | package org.commoncrawl.hadoop.mapred; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.fs.FileSystem; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.Text; 8 | import org.apache.hadoop.mapred.FileInputFormat; 9 | import org.apache.hadoop.mapred.FileSplit; 10 | import org.apache.hadoop.mapred.InputSplit; 11 | import org.apache.hadoop.mapred.JobConf; 12 | import org.apache.hadoop.mapred.RecordReader; 13 | import org.apache.hadoop.mapred.Reporter; 14 | 15 | /** 16 | * A input format the reads arc files. 17 | */ 18 | public class ArcInputFormat 19 | extends FileInputFormat { 20 | 21 | /** 22 | * Returns the RecordReader for reading the arc file. 23 | * 24 | * @param split The InputSplit of the arc file to process. 25 | * @param job The job configuration. 26 | * @param reporter The progress reporter. 27 | */ 28 | public RecordReader getRecordReader(InputSplit split, JobConf job, Reporter reporter) 29 | throws IOException { 30 | reporter.setStatus(split.toString()); 31 | return new ArcRecordReader(job, (FileSplit)split); 32 | } 33 | 34 | /** 35 | *
Always returns false to indicate that ARC files are not splittable. 36 | * ARC files are stored in 100MB files, meaning they will be stored in at 37 | * most 3 blocks (2 blocks on Hadoop systems with 128MB block size).
38 | */ 39 | protected boolean isSplitable(FileSystem fs, Path filename) { 40 | return false; 41 | } 42 | } 43 | 44 | 45 | -------------------------------------------------------------------------------- /src/java/org/commoncrawl/hadoop/mapred/ArcRecord.java: -------------------------------------------------------------------------------- 1 | package org.commoncrawl.hadoop.mapred; 2 | 3 | import java.io.ByteArrayInputStream; 4 | import java.io.DataInput; 5 | import java.io.DataOutput; 6 | import java.io.EOFException; 7 | import java.io.InputStream; 8 | import java.io.IOException; 9 | import java.lang.IllegalArgumentException; 10 | import java.lang.Integer; 11 | import java.lang.Math; 12 | import java.lang.NumberFormatException; 13 | import java.text.ParseException; 14 | import java.text.SimpleDateFormat; 15 | import java.util.Arrays; 16 | import java.util.Date; 17 | 18 | // Hadoop classes 19 | import org.apache.hadoop.io.Writable; 20 | 21 | // Apache log4j classes 22 | import org.apache.log4j.Logger; 23 | 24 | // Apache HTTP Components classes 25 | import org.apache.http.Header; 26 | import org.apache.http.HeaderElement; 27 | import org.apache.http.HttpException; 28 | import org.apache.http.HttpResponse; 29 | import org.apache.http.StatusLine; 30 | import org.apache.http.entity.InputStreamEntity; 31 | import org.apache.http.entity.ContentType; 32 | import org.apache.http.impl.DefaultHttpResponseFactory; 33 | import org.apache.http.impl.io.AbstractSessionInputBuffer; 34 | import org.apache.http.impl.io.DefaultHttpResponseParser; 35 | import org.apache.http.io.SessionInputBuffer; 36 | import org.apache.http.message.BasicHeader; 37 | import org.apache.http.message.BasicHeaderValueParser; 38 | import org.apache.http.message.BasicLineParser; 39 | import org.apache.http.params.BasicHttpParams; 40 | 41 | // Jsoup classes 42 | import org.jsoup.Jsoup; 43 | import org.jsoup.nodes.Document; 44 | 45 | /** 46 | * An entry in an ARC (Internet Archive) data file. 47 | * 48 | * @author Chris Stephens 49 | */ 50 | public class ArcRecord 51 | implements Writable { 52 | 53 | private static final Logger LOG = Logger.getLogger(ArcRecord.class); 54 | 55 | // ARC v1 metadata 56 | private String _url; 57 | private String _ipAddress; 58 | private Date _archiveDate; 59 | private String _contentType; 60 | private int _contentLength; 61 | 62 | // ARC v2 metadata 63 | //private int _resultCode; 64 | //private String _checksum; 65 | //private String _location; 66 | //private long _offset; 67 | //private String _filename; 68 | 69 | private byte[] _payload; 70 | 71 | private HttpResponse _httpResponse; 72 | 73 | private int _httpContentStart; 74 | 75 | /** 76 | *
Creates an empty ARC record.
77 | */ 78 | public ArcRecord() { } 79 | 80 | private void _clear() { 81 | this._url = null; 82 | this._ipAddress = null; 83 | this._archiveDate = null; 84 | this._contentType = null; 85 | this._contentLength = 0; 86 | this._payload = null; 87 | this._httpResponse = null; 88 | } 89 | 90 | private String _readLine(InputStream in) 91 | throws IOException, EOFException { 92 | 93 | StringBuffer line = new StringBuffer(128); 94 | 95 | // read a line of content 96 | int b = in.read(); 97 | int n = 1; 98 | 99 | // if -1 is returned, we are at EOF 100 | if (b == -1) 101 | throw new EOFException(); 102 | 103 | // read until an NL 104 | do { 105 | 106 | if (((char) b) == '\n') 107 | break; 108 | 109 | line.append((char) b); 110 | 111 | b = in.read(); 112 | n++; 113 | } 114 | while (b != -1); 115 | 116 | return line.toString(); 117 | } 118 | 119 | /** 120 | *
Parses the ARC record header and payload (content) from a stream.
121 | * 122 | * @return TRUE if the ARC record was parsed and loaded successfully, FALSE if not. 123 | */ 124 | public boolean readFrom(InputStream in) 125 | throws IOException, EOFException { 126 | 127 | if (in == null) { 128 | LOG.error("ArcRecord cannot be created from NULL/missing input stream."); 129 | return false; 130 | } 131 | 132 | // Clear any current values assigned to the object. 133 | this._clear(); 134 | 135 | // Read the ARC header from the stream. 136 | String arcRecordHeader = this._readLine(in); 137 | 138 | try { 139 | this.setArcRecordHeader(arcRecordHeader); 140 | this.setPayload(in); 141 | } 142 | catch (IOException ex) { 143 | throw ex; 144 | } 145 | catch (Exception ex) { 146 | LOG.error("Exception thrown while parsing ARC record", ex); 147 | return false; 148 | } 149 | 150 | return true; 151 | } 152 | 153 | /** 154 | *
Parses and sets the ARC record header fields.
155 | *

Currently, this method expects the ARC record header string to contain 156 | * the following fields, in order, separated by space: 157 | *
164 | * 165 | * For more information on the arc file format, see 166 | * {@link http://www.archive.org/web/researcher/ArcFileFormat.php}.
167 | * 168 | * @param arcRecordHeader The first line of an ARC file entry - the header 169 | * line for an ARC file item. 170 | */ 171 | public void setArcRecordHeader(String arcRecordHeader) 172 | throws IllegalArgumentException, ParseException { 173 | 174 | if (arcRecordHeader == null || arcRecordHeader.equals("")) 175 | throw new IllegalArgumentException("ARC v1 record header string is empty."); 176 | 177 | String[] metadata = arcRecordHeader.split(" "); 178 | 179 | if (metadata.length != 5) { 180 | LOG.info(" [ "+arcRecordHeader+" ] "); 181 | throw new IllegalArgumentException("ARC v1 record header must be 5 fields."); 182 | } 183 | 184 | SimpleDateFormat format = new SimpleDateFormat("yyyyMMddHHmmss"); 185 | 186 | this._url = metadata[0]; 187 | this._ipAddress = metadata[1]; 188 | this._archiveDate = format.parse(metadata[2]); 189 | this._contentType = metadata[3]; 190 | this._contentLength = (new Integer(metadata[4])).intValue(); 191 | } 192 | 193 | /** 194 | *
Reads and sets the ARC record payload from an input stream.
195 | * 196 | * @param in An input stream positioned at the start of the ARC record payload. 197 | */ 198 | public void setPayload(InputStream in) 199 | throws IllegalArgumentException, ParseException, IOException { 200 | 201 | if (in == null) 202 | throw new IllegalArgumentException("ArcRecord cannot be created from NULL/missing input stream."); 203 | 204 | int bufferSize = this._contentLength; 205 | 206 | this._payload = new byte[bufferSize]; 207 | 208 | int n = in.read(this._payload, 0, this._payload.length); 209 | 210 | if (n < this._payload.length) { 211 | LOG.warn("Expecting "+bufferSize+" bytes in ARC record payload, found "+n+" bytes. Performing array copy."); 212 | this._payload = Arrays.copyOf(this._payload, n); 213 | } 214 | 215 | // After this, we should be at the end of this GZIP member. Let the 216 | // calling function verify the position of the stream. 217 | } 218 | 219 | public void addToPayload(byte[] data) { 220 | this.addToPayload(data, data.length); 221 | } 222 | 223 | public void addToPayload(byte[] data, int length) { 224 | 225 | LOG.warn("Content Length must have been incorrect - someone needed to add more data to the payload."); 226 | 227 | if (this._payload == null) { 228 | this._payload = Arrays.copyOf(data, length); 229 | } 230 | else { 231 | int i = this._payload.length; 232 | int n = this._payload.length + length; 233 | 234 | // resize the payload buffer 235 | this._payload = Arrays.copyOf(this._payload, n); 236 | 237 | // copy in the additional data 238 | System.arraycopy(data, 0, this._payload, i, length); 239 | } 240 | } 241 | 242 | /** 243 | * {@inheritDoc} 244 | */ 245 | public String toString() { 246 | return this._url + " - " + this._archiveDate.toString() + " - " + this._contentType; 247 | } 248 | 249 | /** 250 | * {@inheritDoc} 251 | */ 252 | public void write(DataOutput out) 253 | throws IOException { 254 | 255 | // write out ARC header info 256 | out.writeUTF(this._url); 257 | out.writeUTF(this._ipAddress); 258 | out.writeUTF(this._contentType); 259 | out.writeLong(this._archiveDate.getTime()); 260 | out.writeInt(this._contentLength); 261 | 262 | // write out the payload 263 | out.writeInt(this._payload.length); 264 | out.write(this._payload, 0, this._payload.length); 265 | } 266 | 267 | /** 268 | * {@inheritDoc} 269 | */ 270 | public void readFields(DataInput in) 271 | throws IOException { 272 | 273 | // read in ARC header info 274 | this._url = in.readUTF(); 275 | this._ipAddress = in.readUTF(); 276 | this._contentType = in.readUTF(); 277 | this._archiveDate = new Date(in.readLong()); 278 | this._contentLength = in.readInt(); 279 | 280 | // read in the payload 281 | int payloadLength = in.readInt(); 282 | 283 | // resize the payload buffer if necessary 284 | if (this._payload == null || this._payload.length != payloadLength) 285 | this._payload = new byte[payloadLength]; 286 | 287 | try { 288 | in.readFully(this._payload, 0, payloadLength); 289 | } 290 | catch (EOFException ex) { 291 | throw new IOException("End of input reached before payload was fully deserialized."); 292 | } 293 | 294 | // assume that if a new payload was loaded, HTTP response will need to be reparsed. 295 | this._httpResponse = null; 296 | } 297 | 298 | /** 299 | *
Returns the full ARC record payload. This is usually a complete HTTP 300 | * response.
301 | * 302 | * @return The raw ARC record content. 303 | */ 304 | public byte[] getPayload() { 305 | return this._payload; 306 | } 307 | 308 | /** 309 | *
Returns the URL from the ARC record header.
310 | * 311 | * @return The URL for this entry. 312 | */ 313 | public String getURL() { 314 | return this._url; 315 | } 316 | 317 | /** 318 | *
Returns the IP address from the ARC record header.
319 | * 320 | * @return The IP address for this entry. 321 | */ 322 | public String getIpAddress() { 323 | return this._ipAddress; 324 | } 325 | 326 | /** 327 | *
Returns the archive date from the ARC record header.
328 | * 329 | * @return The archive date for this entry. 330 | */ 331 | public Date getArchiveDate() { 332 | return this._archiveDate; 333 | } 334 | 335 | /** 336 | *
Returns the MIME content type from the ARC record header. 337 | * Note: The MIME content type in the ARC record header is not necessarily the 338 | * same as the Content-Type HTTP header inside the content body 339 | * (if one is present).
340 | * 341 | * @return The MIME content type for this entry. 342 | */ 343 | public String getContentType() { 344 | return this._contentType; 345 | } 346 | 347 | /** 348 | *
Returns the content length from the ARC record header. 349 | * Note: The content length in the ARC record header is not necessarily the 350 | * same as the Content-Length HTTP header inside the content body 351 | * (if one is present).
352 | * 353 | * @return The content length for this entry. 354 | */ 355 | public int getContentLength() { 356 | return this._contentLength; 357 | } 358 | 359 | /** 360 | *
Returns the HTTP status code. 361 | * If the payload could not be parsed as an HTTP response, returns -1. 362 | * Warning: if the payload has not yet been parsed as an HTTP response, 363 | * calling this function parses the full response. Parsing is only performed 364 | * once - parsed data is retained for subsequent calls.
365 | * 366 | * @return The HTTP status code. 367 | */ 368 | public int getHttpStatusCode() 369 | throws IOException, HttpException { 370 | 371 | HttpResponse httpResponse = this.getHttpResponse(); 372 | 373 | if (httpResponse == null) 374 | return -1; 375 | 376 | return httpResponse.getStatusLine().getStatusCode(); 377 | } 378 | 379 | /** 380 | *
Returns an array of HTTP headers. 381 | * If the payload could not be parsed as an HTTP response, returns null. 382 | * Warning: if the payload has not yet been parsed as an HTTP response, 383 | * calling this function parses the full response. Parsing is only performed 384 | * once - parsed data is retained for subsequent calls.
385 | * 386 | * @return An array of HTTP headers. 387 | */ 388 | public Header[] getHttpHeaders() 389 | throws IOException, HttpException { 390 | 391 | HttpResponse httpResponse = this.getHttpResponse(); 392 | 393 | if (httpResponse == null) 394 | return null; 395 | 396 | return httpResponse.getAllHeaders(); 397 | } 398 | 399 | /** 400 | * 401 | */ 402 | public static class ByteArraySessionInputBuffer 403 | extends AbstractSessionInputBuffer { 404 | 405 | public ByteArraySessionInputBuffer(byte[] buf) { 406 | BasicHttpParams params = new BasicHttpParams(); 407 | this.init(new ByteArrayInputStream(buf), 4096, params); 408 | } 409 | 410 | public ByteArraySessionInputBuffer(byte[] buf, int offset, int length) { 411 | BasicHttpParams params = new BasicHttpParams(); 412 | this.init(new ByteArrayInputStream(buf, offset, length), 4096, params); 413 | } 414 | 415 | public boolean isDataAvailable(int timeout) { 416 | return true; 417 | } 418 | } 419 | 420 | /** 421 | *
Helper function to search a byte array for CR-LF-CR-LF (the end of 422 | * HTTP headers in the payload buffer).
423 | * 424 | * @return The offset of the end of HTTP headers, after the last CRLF. 425 | */ 426 | private int _searchForCRLFCRLF(byte[] data) { 427 | 428 | final byte CR = (byte)'\r'; 429 | final byte LF = (byte)'\n'; 430 | 431 | int i; 432 | int s = 0; 433 | 434 | for (i = 0; i < data.length; i++) { 435 | 436 | if (data[i] == CR) { 437 | if (s == 0) s = 1; 438 | else if (s == 1) s = 0; 439 | else if (s == 2) s = 3; 440 | else if (s == 3) s = 0; 441 | } 442 | else if (data[i] == LF) { 443 | if (s == 0) s = 0; 444 | else if (s == 1) s = 2; 445 | else if (s == 2) s = 0; 446 | else if (s == 3) s = 4; 447 | } 448 | else { 449 | s = 0; 450 | } 451 | 452 | if (s == 4) 453 | return i + 1; 454 | } 455 | 456 | return -1; 457 | } 458 | 459 | /** 460 | *
Returns an HTTP response object parsed from the ARC record payload. 461 | * Note: The payload is parsed on-demand, but is only parsed once. The 462 | * parsed data is saved for subsequent calls.
463 | * 464 | * @return The ARC record payload as an HTTP response object. See the Apache 465 | * HttpComponents project. 466 | */ 467 | public HttpResponse getHttpResponse() 468 | throws IOException, HttpException { 469 | 470 | if (this._httpResponse != null) 471 | return this._httpResponse; 472 | 473 | if (this._payload == null) { 474 | LOG.error("Unable to parse HTTP response: Payload has not been set"); return null; 475 | } 476 | 477 | if (this._url != null && !this._url.startsWith("http://") && !this._url.startsWith("https://")) { 478 | LOG.error("Unable to parse HTTP response: URL protocol is not HTTP"); return null; 479 | } 480 | 481 | this._httpResponse = null; 482 | 483 | // Find where the HTTP headers stop 484 | int end = this._searchForCRLFCRLF(this._payload); 485 | 486 | if (end == -1) { 487 | LOG.error("Unable to parse HTTP response: End of HTTP headers not found"); return null; 488 | } 489 | 490 | // Parse the HTTP status line and headers 491 | DefaultHttpResponseParser parser = 492 | new DefaultHttpResponseParser( 493 | new ByteArraySessionInputBuffer(this._payload, 0, end), 494 | new BasicLineParser(), 495 | new DefaultHttpResponseFactory(), 496 | new BasicHttpParams() 497 | ); 498 | 499 | this._httpResponse = parser.parse(); 500 | 501 | if (this._httpResponse == null) { 502 | LOG.error("Unable to parse HTTP response"); return null; 503 | } 504 | 505 | // Set the reset of the payload as the HTTP entity. Use an InputStreamEntity 506 | // to avoid a memory copy. 507 | InputStreamEntity entity = new InputStreamEntity(new ByteArrayInputStream(this._payload, end, this._payload.length - end), this._payload.length - end); 508 | entity.setContentType(this._httpResponse.getFirstHeader("Content-Type")); 509 | entity.setContentEncoding(this._httpResponse.getFirstHeader("Content-Encoding")); 510 | this._httpResponse.setEntity(entity); 511 | 512 | return this._httpResponse; 513 | } 514 | 515 | /** 516 | *
Returns a Jsoup HTML document, parsed using the Charset in the 517 | * "Content-Type" header. If the document charset cannot be found, parse is 518 | * attempted using ISO-8859-1.
519 | * 520 | * @return A Jsoup parsed HTML document from the HTTP response content. 521 | */ 522 | public Document getParsedHTML() 523 | throws IOException { 524 | 525 | if (this._url == null) { 526 | LOG.error("Unable to parse HTML: URL from ARC header has not been set"); 527 | return null; 528 | } 529 | 530 | // if response has not been parsed yet, this parses it 531 | try { 532 | this.getHttpResponse(); 533 | } 534 | catch (HttpException ex) { 535 | LOG.error("Unable to parse HTML: Exception during HTTP response parsing"); return null; 536 | } 537 | 538 | if (this._httpResponse == null) { 539 | LOG.error("Unable to parse HTML: Exception during HTTP response parsing"); return null; 540 | } 541 | 542 | if (this._httpResponse.getEntity() == null) { 543 | LOG.error("Unable to parse HTML: No HTTP response entity found"); return null; 544 | } 545 | 546 | if (!this._contentType.toLowerCase().contains("html")) { 547 | LOG.warn("Unable to parse HTML: Content is not HTML"); return null; 548 | } 549 | 550 | String charset = null; 551 | 552 | try { 553 | // Default value returned is "text/plain" with charset of ISO-8859-1. 554 | charset = ContentType.getOrDefault(this._httpResponse.getEntity()).getCharset().name(); 555 | } 556 | catch (Throwable ex) { 557 | 558 | } 559 | 560 | // if anything goes wrong, try ISO-8859-1 561 | if (charset == null) 562 | charset = "ISO-8859-1"; 563 | 564 | // parse the content using the derived charset and the URL from the ARC header 565 | return Jsoup.parse(this._httpResponse.getEntity().getContent(), charset, this._url); 566 | } 567 | } 568 | 569 | -------------------------------------------------------------------------------- /src/java/org/commoncrawl/hadoop/mapred/ArcRecordReader.java: -------------------------------------------------------------------------------- 1 | package org.commoncrawl.hadoop.mapred; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.EOFException; 5 | import java.io.IOException; 6 | import java.io.InputStream; 7 | import java.io.InputStreamReader; 8 | 9 | import java.lang.Math; 10 | import java.lang.StringBuffer; 11 | import java.util.Arrays; 12 | 13 | import org.apache.hadoop.conf.Configuration; 14 | import org.apache.hadoop.fs.FSDataInputStream; 15 | import org.apache.hadoop.fs.FileSystem; 16 | import org.apache.hadoop.fs.Path; 17 | import org.apache.hadoop.io.Text; 18 | import org.apache.hadoop.mapred.FileSplit; 19 | import org.apache.hadoop.mapred.RecordReader; 20 | 21 | import org.apache.log4j.Logger; 22 | 23 | import org.commoncrawl.compressors.gzip.GzipCompressorInputStream; 24 | 25 | /** 26 | * Reads ARC records. 27 | * 28 | * Set "io.file.buffer.size" to define the amount of data that should be 29 | * buffered from S3. 
30 | */ 31 | public class ArcRecordReader 32 | implements RecordReader { 33 | 34 | private static final Logger LOG = Logger.getLogger(ArcRecordReader.class); 35 | 36 | private FSDataInputStream _fsin; 37 | private GzipCompressorInputStream _gzip; 38 | private long _fileLength; 39 | 40 | /** 41 | * 42 | */ 43 | public ArcRecordReader(Configuration job, FileSplit split) 44 | throws IOException { 45 | 46 | if (split.getStart() != 0) { 47 | IOException ex = new IOException("Invalid ARC file split start " + split.getStart() + ": ARC files are not splittable"); 48 | LOG.error(ex.getMessage()); 49 | throw ex; 50 | } 51 | 52 | // open the file and seek to the start of the split 53 | final Path file = split.getPath(); 54 | 55 | FileSystem fs = file.getFileSystem(job); 56 | 57 | this._fsin = fs.open(file); 58 | 59 | // create a GZIP stream that *does not* automatically read through members 60 | this._gzip = new GzipCompressorInputStream(this._fsin, false); 61 | 62 | this._fileLength = fs.getFileStatus(file).getLen(); 63 | 64 | // First record should be an ARC file header record. Skip it. 65 | this._skipRecord(); 66 | } 67 | 68 | /** 69 | * Skips the current record, and advances to the next GZIP member. 70 | */ 71 | private void _skipRecord() 72 | throws IOException { 73 | 74 | long n = 0; 75 | 76 | do { 77 | n = this._gzip.skip(999999999); 78 | } 79 | while (n > 0); 80 | 81 | this._gzip.nextMember(); 82 | } 83 | 84 | /** 85 | * @inheritDoc 86 | */ 87 | public Text createKey() { 88 | return new Text(); 89 | } 90 | 91 | /** 92 | * @inheritDoc 93 | */ 94 | public ArcRecord createValue() { 95 | return new ArcRecord(); 96 | } 97 | 98 | private static byte[] _checkBuffer = new byte[64]; 99 | 100 | /** 101 | * 102 | */ 103 | public synchronized boolean next(Text key, ArcRecord value) 104 | throws IOException { 105 | 106 | boolean isValid = true; 107 | 108 | // try reading an ARC record from the stream 109 | try { 110 | isValid = value.readFrom(this._gzip); 111 | } 112 | catch (EOFException ex) { 113 | return false; 114 | } 115 | 116 | // if the record is not valid, skip it 117 | if (isValid == false) { 118 | LOG.error("Invalid ARC record found at GZIP position "+this._gzip.getBytesRead()+". Skipping ..."); 119 | this._skipRecord(); 120 | return true; 121 | } 122 | 123 | if (value.getURL() != null) 124 | key.set(value.getURL()); 125 | 126 | // check to make sure we've reached the end of the GZIP member 127 | int n = this._gzip.read(_checkBuffer, 0, 64); 128 | 129 | if (n != -1) { 130 | LOG.error(n+" bytes of unexpected content found at end of ARC record. 
Skipping ..."); 131 | this._skipRecord(); 132 | } 133 | else { 134 | this._gzip.nextMember(); 135 | } 136 | 137 | return true; 138 | } 139 | 140 | /** 141 | * @inheritDoc 142 | */ 143 | public float getProgress() 144 | throws IOException { 145 | return Math.min(1.0f, this._gzip.getBytesRead() / (float) this._fileLength); 146 | } 147 | 148 | /** 149 | * @inheritDoc 150 | */ 151 | public synchronized long getPos() 152 | throws IOException { 153 | return this._gzip.getBytesRead(); 154 | } 155 | 156 | /** 157 | * @inheritDoc 158 | */ 159 | public synchronized void close() 160 | throws IOException { 161 | 162 | if (this._gzip != null) 163 | this._gzip.close(); 164 | } 165 | 166 | } 167 | -------------------------------------------------------------------------------- /src/java/org/commoncrawl/nutch/tools/arc/ArcInputFormat.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.commoncrawl.nutch.tools.arc; 18 | 19 | import java.io.IOException; 20 | 21 | import org.apache.hadoop.io.BytesWritable; 22 | import org.apache.hadoop.io.Text; 23 | import org.apache.hadoop.mapred.FileInputFormat; 24 | import org.apache.hadoop.mapred.FileSplit; 25 | import org.apache.hadoop.mapred.InputSplit; 26 | import org.apache.hadoop.mapred.JobConf; 27 | import org.apache.hadoop.mapred.RecordReader; 28 | import org.apache.hadoop.mapred.Reporter; 29 | 30 | /** 31 | * A input format the reads arc files. 32 | */ 33 | public class ArcInputFormat 34 | extends FileInputFormat { 35 | 36 | /** 37 | * Returns the RecordReader for reading the arc file. 38 | * 39 | * @param split The InputSplit of the arc file to process. 40 | * @param job The job configuration. 41 | * @param reporter The progress reporter. 42 | */ 43 | public RecordReader getRecordReader(InputSplit split, 44 | JobConf job, Reporter reporter) 45 | throws IOException { 46 | reporter.setStatus(split.toString()); 47 | return new ArcRecordReader(job, (FileSplit)split); 48 | } 49 | 50 | } 51 | -------------------------------------------------------------------------------- /src/java/org/commoncrawl/nutch/tools/arc/ArcRecordReader.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.commoncrawl.nutch.tools.arc; 18 | 19 | import java.io.ByteArrayOutputStream; 20 | import java.io.IOException; 21 | import java.util.zip.GZIPInputStream; 22 | 23 | // - modified by Common Crawl - 24 | //import org.slf4j.Logger; 25 | //import org.slf4j.LoggerFactory; 26 | import org.apache.log4j.Logger; 27 | 28 | import org.apache.hadoop.conf.Configuration; 29 | import org.apache.hadoop.fs.FSDataInputStream; 30 | import org.apache.hadoop.fs.FileSystem; 31 | import org.apache.hadoop.fs.Path; 32 | import org.apache.hadoop.io.BytesWritable; 33 | import org.apache.hadoop.io.Text; 34 | import org.apache.hadoop.mapred.FileSplit; 35 | import org.apache.hadoop.mapred.RecordReader; 36 | import org.apache.hadoop.util.ReflectionUtils; 37 | import org.apache.hadoop.util.StringUtils; 38 | 39 | /** 40 | *
The ArcRecordReader class provides a record reader which 41 | * reads records from arc files. 42 | * 43 | * Arc files are essentially tars of gzips. Each record in an arc file is 44 | * a compressed gzip. Multiple records are concatenated together to form a 45 | * complete arc. For more information on the arc file format see 46 | * {@link http://www.archive.org/web/researcher/ArcFileFormat.php } . 47 | * 48 | * Arc files are used by the internet archive and grub projects.
49 | * 50 | * see {@link http://www.archive.org/ } 51 | * see {@link http://www.grub.org/ } 52 | */ 53 | public class ArcRecordReader 54 | implements RecordReader { 55 | 56 | private static final Logger LOG = Logger.getLogger(ArcRecordReader.class); 57 | 58 | protected Configuration conf; 59 | protected long splitStart = 0; 60 | protected long pos = 0; 61 | protected long splitEnd = 0; 62 | protected long splitLen = 0; 63 | protected long fileLen = 0; 64 | protected FSDataInputStream in; 65 | 66 | private static byte[] MAGIC = {(byte)0x1F, (byte)0x8B}; 67 | 68 | /** 69 | *
Returns true if the byte array passed matches the gzip header magic 70 | * number.
71 | * 72 | * @param input The byte array to check. 73 | * 74 | * @return True if the byte array matches the gzip header magic number. 75 | */ 76 | public static boolean isMagic(byte[] input) { 77 | 78 | // check for null and incorrect length 79 | if (input == null || input.length != MAGIC.length) { 80 | return false; 81 | } 82 | 83 | // check byte by byte 84 | for (int i = 0; i < MAGIC.length; i++) { 85 | if (MAGIC[i] != input[i]) { 86 | return false; 87 | } 88 | } 89 | 90 | // must match 91 | return true; 92 | } 93 | 94 | /** 95 | * Constructor that sets the configuration and file split. 96 | * 97 | * @param conf The job configuration. 98 | * @param split The file split to read from. 99 | * 100 | * @throws IOException If an IO error occurs while initializing file split. 101 | */ 102 | public ArcRecordReader(Configuration conf, FileSplit split) 103 | throws IOException { 104 | 105 | Path path = split.getPath(); 106 | FileSystem fs = path.getFileSystem(conf); 107 | fileLen = fs.getFileStatus(split.getPath()).getLen(); 108 | this.conf = conf; 109 | this.in = fs.open(split.getPath()); 110 | this.splitStart = split.getStart(); 111 | this.splitEnd = splitStart + split.getLength(); 112 | this.splitLen = split.getLength(); 113 | in.seek(splitStart); 114 | } 115 | 116 | /** 117 | * Closes the record reader resources. 118 | */ 119 | public void close() 120 | throws IOException { 121 | this.in.close(); 122 | } 123 | 124 | /** 125 | * Creates a new instance of the Text object for the key. 126 | */ 127 | public Text createKey() { 128 | return (Text)ReflectionUtils.newInstance(Text.class, conf); 129 | } 130 | 131 | /** 132 | * Creates a new instance of the BytesWritable object for the key 133 | */ 134 | public BytesWritable createValue() { 135 | return (BytesWritable)ReflectionUtils.newInstance(BytesWritable.class, conf); 136 | } 137 | 138 | /** 139 | * Returns the current position in the file. 140 | * 141 | * @return The long of the current position in the file. 142 | */ 143 | public long getPos() 144 | throws IOException { 145 | return in.getPos(); 146 | } 147 | 148 | /** 149 | * Returns the percentage of progress in processing the file. This will be 150 | * represented as a float from 0 to 1 with 1 being 100% completed. 151 | * 152 | * @return The percentage of progress as a float from 0 to 1. 153 | */ 154 | public float getProgress() 155 | throws IOException { 156 | 157 | // if we haven't even started 158 | if (splitEnd == splitStart) { 159 | return 0.0f; 160 | } 161 | else { 162 | // the progress is current pos - where we started / length of the split 163 | return Math.min(1.0f, (getPos() - splitStart) / (float)splitLen); 164 | } 165 | } 166 | 167 | /** 168 | *
Returns true if the next record in the split is read into the key and 169 | * value pair. The key will be the arc record header and the values will be 170 | * the raw content bytes of the arc record.
171 | * 172 | * @param key The record key 173 | * @param value The record value 174 | * 175 | * @return True if the next record is read. 176 | * 177 | * @throws IOException If an error occurs while reading the record value. 178 | */ 179 | public boolean next(Text key, BytesWritable value) 180 | throws IOException { 181 | 182 | try { 183 | 184 | // get the starting position on the input stream 185 | long startRead = in.getPos(); 186 | byte[] magicBuffer = null; 187 | 188 | // we need this loop to handle false positives in reading of gzip records 189 | while (true) { 190 | 191 | // while we haven't passed the end of the split 192 | if (startRead >= splitEnd) { 193 | return false; 194 | } 195 | 196 | // scanning for the gzip header 197 | boolean foundStart = false; 198 | while (!foundStart) { 199 | 200 | // start at the current file position and scan for 1K at time, break 201 | // if there is no more to read 202 | startRead = in.getPos(); 203 | magicBuffer = new byte[1024]; 204 | int read = in.read(magicBuffer); 205 | if (read < 0) { 206 | break; 207 | } 208 | 209 | // scan the byte array for the gzip header magic number. This happens 210 | // byte by byte 211 | for (int i = 0; i < read - 1; i++) { 212 | byte[] testMagic = new byte[2]; 213 | System.arraycopy(magicBuffer, i, testMagic, 0, 2); 214 | if (isMagic(testMagic)) { 215 | // set the next start to the current gzip header 216 | startRead += i; 217 | foundStart = true; 218 | break; 219 | } 220 | } 221 | } 222 | 223 | // seek to the start of the gzip header 224 | in.seek(startRead); 225 | ByteArrayOutputStream baos = null; 226 | int totalRead = 0; 227 | 228 | try { 229 | 230 | // read 4K of the gzip at a time putting into a byte array 231 | byte[] buffer = new byte[4096]; 232 | GZIPInputStream zin = new GZIPInputStream(in); 233 | int gzipRead = -1; 234 | baos = new ByteArrayOutputStream(); 235 | while ((gzipRead = zin.read(buffer, 0, buffer.length)) != -1) { 236 | baos.write(buffer, 0, gzipRead); 237 | totalRead += gzipRead; 238 | } 239 | } 240 | catch (Exception e) { 241 | 242 | // there are times we get false positives where the gzip header exists 243 | // but it is not an actual gzip record, so we ignore it and start 244 | // over seeking 245 | // LOG.debug("Ignoring position: " + (startRead)); 246 | if (startRead + 1 < fileLen) { 247 | in.seek(startRead + 1); 248 | } 249 | continue; 250 | } 251 | 252 | // change the output stream to a byte array 253 | byte[] content = baos.toByteArray(); 254 | 255 | // the first line of the raw content in arc files is the header 256 | int eol = 0; 257 | for (int i = 0; i < content.length; i++) { 258 | if (i > 0 && content[i] == '\n') { 259 | eol = i; 260 | break; 261 | } 262 | } 263 | 264 | // create the header and the raw content minus the header 265 | String header = new String(content, 0, eol).trim(); 266 | byte[] raw = new byte[(content.length - eol) - 1]; 267 | System.arraycopy(content, eol + 1, raw, 0, raw.length); 268 | 269 | // populate key and values with the header and raw content. 270 | Text keyText = (Text)key; 271 | keyText.set(header); 272 | BytesWritable valueBytes = (BytesWritable)value; 273 | valueBytes.set(raw, 0, raw.length); 274 | 275 | // TODO: It would be best to start at the end of the gzip read but 276 | // the bytes read in gzip don't match raw bytes in the file so we 277 | // overshoot the next header. With this current method you get 278 | // some false positives but don't miss records. 
279 | if (startRead + 1 < fileLen) { 280 | in.seek(startRead + 1); 281 | } 282 | 283 | // populated the record, now return 284 | return true; 285 | } 286 | } 287 | catch (Exception e) { 288 | LOG.equals(StringUtils.stringifyException(e)); 289 | } 290 | 291 | // couldn't populate the record or there is no next record to read 292 | return false; 293 | } 294 | } 295 | -------------------------------------------------------------------------------- /src/ruby/ExampleArcParseMap.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require 'rubygems' 4 | require 'open3' 5 | require 'uri' 6 | 7 | # Inline these classes so we don't have to copy a file while bootstrapping 8 | class ArcRecord 9 | attr_accessor :num, :url, :ip_address, :archive_date, :content_type, :content_length, :content 10 | end 11 | 12 | class ArcFile 13 | 14 | include Enumerable 15 | 16 | def initialize(input_stream) 17 | @handle=input_stream 18 | end 19 | 20 | def each 21 | return self.to_enum unless block_given? 22 | begin 23 | # See http://www.archive.org/web/researcher/ArcFileFormat.php 24 | # for information about the ARC format once it is decompressed 25 | file_header = @handle.readline.strip 26 | @handle.read(Integer(file_header.split.last)) 27 | i=1 28 | 29 | loop do 30 | begin 31 | fields = @handle.readline.strip.split(" ") 32 | raise "Invalid ARC record header found" if fields.length != 5 33 | warn("Invalid protocol in ARC record header") if not fields[0].to_s.start_with?("http://", "https://") 34 | 35 | record = ArcRecord.new 36 | record.num = i 37 | record.url = fields[0].to_s 38 | record.ip_address = fields[1].to_s 39 | record.archive_date = fields[2].to_s 40 | record.content_type = fields[3].to_s 41 | record.content_length = Integer(fields[4]) 42 | record.content = @handle.read(record.content_length) 43 | i = i+1 44 | 45 | yield record 46 | 47 | rescue EOFError 48 | break nil 49 | end 50 | end 51 | #rescue 52 | # raise "#{self.class}: Record ##{i} - Error - #{$!}" 53 | end 54 | end 55 | 56 | end 57 | 58 | CHUNKSIZE=1024*1024 59 | 60 | # All warnings will end up in the EMR stderr logs. 61 | warn("Starting up GZIP process, piping #{CHUNKSIZE/1024}KB chunks at a time") 62 | 63 | # Ruby GzipReader is unable to unzip these files, but unix gunzip can 64 | # Also means we don't need to eat much RAM, because everything is streaming. 65 | Open3.popen3('gunzip -c') {|sin,sout,serr,thr| 66 | 67 | # Create an ArcFile instance which will receive gunzip's stdout 68 | arcfile = ArcFile.new(sout) 69 | 70 | Thread.new do 71 | loop do 72 | begin 73 | chunk = STDIN.readpartial(CHUNKSIZE) 74 | sin.write(chunk) 75 | Thread.pass() 76 | rescue EOFError 77 | warn("End of input, flushing and closing stream to GZIP") 78 | sin.close() # which will send an EOF to the ArcFile 79 | break nil 80 | end 81 | end 82 | end 83 | 84 | # Now we have a lazy ArcFile that we can treat as an Enumerable. 85 | arcfile.each {|record| 86 | if record 87 | begin 88 | # work around Ruby URI library's lack of support for URLs with underscore 89 | uri = URI.parse(record.url.delete("_")) 90 | STDOUT.puts(uri.host.downcase()) 91 | rescue URI::InvalidURIError 92 | warn("ARC file contains invalid URL: "+record.url) 93 | next 94 | end 95 | end 96 | } 97 | } 98 | 99 | -------------------------------------------------------------------------------- /src/ruby/ExampleArcParseReduce.rb: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env ruby 2 | 3 | curr = nil 4 | sum = 0 5 | 6 | ARGF.each do |line| 7 | 8 | # the entire line is the key 9 | key = line.chomp 10 | 11 | # if the current key hasn't been set yet, set it 12 | if !curr 13 | 14 | curr = key 15 | sum = 0 16 | 17 | # if a new key is found, emit the current key ... 18 | elsif key != curr && sum > 0 19 | 20 | if sum > 2 21 | STDOUT.puts(curr + "\t" + sum.to_s()) 22 | end 23 | 24 | # ... then set up a new key 25 | curr = key 26 | sum = 0 27 | 28 | end 29 | 30 | # add to count for this current key 31 | sum += 1 32 | 33 | end 34 | -------------------------------------------------------------------------------- /src/ruby/README: -------------------------------------------------------------------------------- 1 | common_crawl_types 2 | 3 | Ben Nagy wrote the original code for this project, and posted it inline to the 4 | Common Crawl mailing list. I tidied it up and wrote a how-to guide: 5 | http://petewarden.typepad.com/searchbrowser/2012/03/twelve-steps-to-running-your-ruby-code-across-five-billion-web-pages.html 6 | 7 | Ben's original message is below. 8 | 9 | Pete Warden, pete@petewarden.com 10 | 11 | ------------------------------------------------------------------- 12 | 13 | Hi, 14 | 15 | So I found this a bit of a pain, so I thought I'd share. If you want 16 | to mess with the Common Crawl stuff but don't feel like learning Java, 17 | this might be for you. 18 | 19 | I'm sure that this could be easily adapted for other streaming 20 | languages, once you work out how to read requester-pays buckets. 21 | 22 | First up, see this: 23 | http://arfon.org/getting-started-with-elastic-mapreduce-and-hadoop-streaming 24 | 25 | Which has basic information and nice screenshots about EMR Streaming, 26 | setting up the job, bootstrapping and such. 27 | 28 | To install the AWS Ruby SDK on an EMR instance you'll need to 29 | bootstrap some stuff. Some of the packages might not be necessary, but 30 | it was a bit of a pain to trim down from a working set of basic 31 | packages. 32 | 33 | (see setup.sh) 34 | 35 | OK, now we're ready for the mapper. This example just collects 36 | mimetypes and URL extensions. The key bits are the ArcFile class and 37 | the monkeypatch to make requester-pays work. I'm not particularly 38 | proud of this monkeypatch, by the way, but the SDK code is a bit 39 | baffling, and it looked like too much work to patch it properly. 40 | 41 | This mapper expects a file manifest as input, one arc.gz url to read 42 | per line. By doing this you avoid the problem of weird splits, or 43 | having hadoop automatically trying to gunzip the file and failing. It 44 | should look like: 45 | 46 | s3://commoncrawl-crawl-002/2010/09/24/9/1285380159663_9.arc.gz 47 | s3://commoncrawl-crawl-002/2010/09/24/9/1285380179515_9.arc.gz 48 | s3://commoncrawl-crawl-002/2010/09/24/9/1285380199363_9.arc.gz 49 | 50 | You can get those names with the SDK, once you add the monkeypatch 51 | below, or with a patched version of s3cmd ls, the instructions for 52 | which have been posted here before. 
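
For illustration, a manifest like that could be built with the aws-sdk gem. This is
a minimal sketch, assuming the requester-pays monkeypatch mentioned below has already
been applied and that your credentials are in the usual environment variables; the
bucket, prefix and output file name are only examples:

  #!/usr/bin/env ruby
  # Sketch: write a manifest of arc.gz URLs for one crawl prefix.
  require 'rubygems'
  require 'aws-sdk'

  bucket = 'commoncrawl-crawl-002'   # example bucket
  prefix = '2010/09/24/9/'           # example prefix

  s3 = AWS::S3.new(
    :access_key_id     => ENV['AWS_ACCESS_KEY_ID'],
    :secret_access_key => ENV['AWS_SECRET_ACCESS_KEY']
  )

  File.open('manifest.txt', 'w') do |out|
    s3.buckets[bucket].objects.with_prefix(prefix).each do |obj|
      # one s3:// URL per line, gzipped ARC files only
      out.puts("s3://#{bucket}/#{obj.key}") if obj.key.end_with?('.arc.gz')
    end
  end

The resulting manifest.txt is what you feed to the streaming job as its input.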
53 | 54 | (see extension_map.rb) 55 | 56 | And finally, a trivial reducer 57 | 58 | (see extension_reduce.rb) 59 | 60 | IMHO you only need one of these puppies, which you can achieve by 61 | adding '-D mapred.reduce.tasks=1' to your job args 62 | 63 | If it all worked you should get something like this in your output 64 | directory: 65 | 66 | text/html : 4365 67 | text/html .html : 4256 68 | text/xml : 43 69 | text/html .aspx : 16 70 | text/html .com : 2 71 | text/plain .txt : 1 72 | 73 | Except with more entries, that is just an example based on one file. 74 | 75 | For those interested in costs / timings, I finished 2010/9/24/9 (790 76 | files) in 5h57m, or 30 normalised instance hours of m1.small, with 1 77 | master and 4 core instances. The same job with 1 m1.small master and 78 | 2x cc1.4xlarge core was done in 1h31m, for 66 normalised instance 79 | hours. I'll let you do your individual maths and avoid drawing any 80 | conclusions. If anyone has additional (solid) performance data 81 | comparing various cluster configs for identical workloads then that 82 | might be useful. As an aside, my map tasks took from 9 minutes to 45 83 | minutes to complete, but the average was probably ~33 (eyeball). 84 | 85 | Anyway, hope this helps someone. 86 | 87 | Cheers, 88 | 89 | ben 90 | -------------------------------------------------------------------------------- /test/java/org/commoncrawl/hadoop/mapred/TestArcRecordCC.java: -------------------------------------------------------------------------------- 1 | package org.commoncrawl.hadoop.mapred; 2 | 3 | import java.io.ByteArrayInputStream; 4 | import java.io.InputStream; 5 | import java.io.IOException; 6 | import java.lang.StringBuilder; 7 | 8 | import junit.framework.TestCase; 9 | import static junit.framework.Assert.*; 10 | 11 | /** 12 | * Unit Tests for jUnit 3.8 13 | */ 14 | public class TestArcRecordCC extends TestCase { 15 | 16 | ArcRecordCC r; 17 | 18 | /* 19 | public static junit.framework.Test suite() { 20 | return new junit.framework.JUnit4TestAdapter(TestArcRecordCC.class); 21 | } 22 | */ 23 | 24 | public InputStream getPayload1() 25 | throws Exception { 26 | 27 | StringBuilder s = new StringBuilder(); 28 | 29 | s.setLength(0); 30 | s.append("\n"); 31 | s.append(" \n"); 32 | s.append(" This is a web page!\n"); 33 | s.append(" \n"); 34 | s.append(" \n"); 35 | s.append("
This is some content!
\n"); 36 | s.append(" \n"); 37 | s.append(""); 38 | 39 | String content = s.toString(); 40 | 41 | s.setLength(0); 42 | s.append("HTTP/1.1 200 OK\r\n"); 43 | s.append("Date: Fri, 31 Dec 1999 23:59:59 GMT\r\n"); 44 | s.append("Content-Type: text/html; charset=utf-8\r\n"); 45 | s.append("\r\n"); 46 | s.append(content); 47 | s.insert(0, "http://www.example.com/path/file.php?param=123,456%20789 123.123.123.123 20120235131415 text/html "+(s.length()-3)+"\n"); 48 | 49 | return new ByteArrayInputStream(s.toString().getBytes("UTF-8")); 50 | } 51 | 52 | public void setUp() { 53 | r = new ArcRecordCC(); 54 | } 55 | 56 | public void test_getIpAddress() 57 | throws Exception { 58 | r.readFrom(this.getPayload1()); 59 | assertEquals(r.getIpAddress(), "123.123.123.123"); 60 | } 61 | 62 | public void test_getHttpHeaders() 63 | throws Exception { 64 | r.readFrom(this.getPayload1()); 65 | assertEquals(r.getHttpResponse().getFirstHeader("Content-Type").getValue(), "text/html; charset=utf-8"); 66 | } 67 | 68 | public void test_getHttpResponse_getEntity() 69 | throws Exception { 70 | 71 | r.readFrom(this.getPayload1()); 72 | assertNotNull(r.getHttpResponse().getEntity()); 73 | 74 | byte[] buffer = new byte[1000]; 75 | r.getHttpResponse().getEntity().getContent().read(buffer, 0, 1000); 76 | 77 | StringBuilder s = new StringBuilder(); 78 | s.append("\n"); 79 | s.append(" \n"); 80 | s.append(" This is a web page!\n"); 81 | s.append(" \n"); 82 | s.append(" \n"); 83 | s.append("
This is some content!
\n"); 84 | s.append(" \n"); 85 | s.append(""); 86 | 87 | String v1 = s.toString(); 88 | String v2 = new String(buffer, "UTF-8"); 89 | 90 | assertEquals(v1.trim(), v2.trim()); 91 | } 92 | 93 | public void test_getParsedHTML() 94 | throws Exception { 95 | r.readFrom(this.getPayload1()); 96 | assertNotNull(r.getParsedHTML()); 97 | } 98 | } 99 | --------------------------------------------------------------------------------