├── .gitignore
├── README-Amazon-AMI
├── VERSION
├── bin
│   ├── ccCopyToHDFS
│   ├── ccListInvalidSegments
│   └── ccRunExample
├── build.properties
├── build.xml
├── conf
│   └── mapred.xml
├── lib
│   ├── gson-2.2.1.jar
│   ├── guava-12.0.jar
│   ├── httpcore-4.2.1.jar
│   └── jsoup-1.6.3.jar
├── src
│   ├── java
│   │   └── org
│   │       └── commoncrawl
│   │           ├── compressors
│   │           │   ├── CompressorInputStream.java
│   │           │   └── gzip
│   │           │       └── GzipCompressorInputStream.java
│   │           ├── examples
│   │           │   ├── ExampleArcMicroformat.java
│   │           │   ├── ExampleMetadataDomainPageCount.java
│   │           │   ├── ExampleMetadataStats.java
│   │           │   └── ExampleTextWordCount.java
│   │           ├── hadoop
│   │           │   └── mapred
│   │           │       ├── ArcInputFormat.java
│   │           │       ├── ArcRecord.java
│   │           │       └── ArcRecordReader.java
│   │           └── nutch
│   │               └── tools
│   │                   └── arc
│   │                       ├── ArcInputFormat.java
│   │                       └── ArcRecordReader.java
│   └── ruby
│       ├── ExampleArcParseMap.rb
│       ├── ExampleArcParseReduce.rb
│       └── README
└── test
    └── java
        └── org
            └── commoncrawl
                └── hadoop
                    └── mapred
                        └── TestArcRecordCC.java
/.gitignore:
--------------------------------------------------------------------------------
1 | build
2 | build-test
3 | dist
4 | output
5 |
--------------------------------------------------------------------------------
/README-Amazon-AMI:
--------------------------------------------------------------------------------
1 | Common Crawl Quick Start Amazon AMI
2 | -----------------------------------
3 |
4 | Welcome to the Common Crawl Quick Start Amazon AMI!
5 |
6 | The Common Crawl corpus is a copy of billions of web documents and their
7 | metadata, stored as an Amazon S3 Public Dataset and available for analysis.
8 |
9 | Here are the steps you need to follow to run your first job against the
10 | Common Crawl corpus:
11 |
12 | 1. Find your Amazon Access Credentials (Amazon Access ID & Amazon Secret Key)
13 | and save them as two lines in this file:
14 |
15 | /home/ec2-user/.awssecret
16 |
17 | For example:
18 |
19 | JLASKHJFLKDHJLFKSJDF
20 | DFHSDJHhhoiaGKHDFa6sd42rwuhfapgfuAGSDAjh
21 |
22 | Change the permissions of this file to read/write only by 'ec2-user':
23 |
24 | chmod 600 /home/ec2-user/.awssecret
25 |
26 | Now you can use Tim Kay's AWS Command Line tool. Try this:
27 |
28 | aws ls -1 aws-publicdatasets/common-crawl/parse-output/segment/1341690167474/metadata-
29 |
30 | If you are planning on using the local Hadoop cluster, you should also consider
31 | setting these properties in /etc/hadoop/hadoop-site.xml:
32 |
33 | fs.s3n.awsAccessKeyId
34 | fs.s3n.awsSecretAccessKey
35 |
36 | 2. Move to the 'commoncrawl-examples' directory. Make sure it is up-to-date:
37 |
38 | cd ~/commoncrawl-examples; git pull
39 |
40 | 3. Compile the latest example code:
41 |
42 | ant
43 |
44 | 4. Run an example! Decide whether you want to run an example on the small local
45 | Hadoop instance or on Amazon Elastic MapReduce.
46 |
47 | Run this command to see your options:
48 |
49 | bin/ccRunExample
50 |
51 | then go ahead and run an example:
52 |
53 | bin/ccRunExample LocalHadoop ExampleMetadataDomainPageCount
54 |
55 | then look at the code:
56 |
57 | nano src/java/org/commoncrawl/examples/ExampleMetadataDomainPageCount.java
58 |
59 | Note: You need to have your own Amazon S3 bucket to run Amazon Elastic
60 | MapReduce jobs.
61 |
62 | -----------------------------------
63 |
64 | You can read all of this again in $HOME/commoncrawl-examples/README-Amazon-AMI.
65 |
66 | Have fun!
67 |
68 |
--------------------------------------------------------------------------------
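The README's quick-start path goes through bin/ccRunExample, but the same wiring can be written as a plain Hadoop 0.20 (mapred API) driver. The sketch below is illustrative only and is not one of the bundled examples: the class and mapper names are invented, the credential values are placeholders for the two lines in ~/.awssecret, and it assumes the metadata files are SequenceFiles of Text keys (URLs) and Text values (JSON metadata), which is the convention the bundled metadata examples appear to follow.

    import java.io.IOException;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.conf.Configured;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapred.FileInputFormat;
    import org.apache.hadoop.mapred.FileOutputFormat;
    import org.apache.hadoop.mapred.JobClient;
    import org.apache.hadoop.mapred.JobConf;
    import org.apache.hadoop.mapred.MapReduceBase;
    import org.apache.hadoop.mapred.Mapper;
    import org.apache.hadoop.mapred.OutputCollector;
    import org.apache.hadoop.mapred.Reporter;
    import org.apache.hadoop.mapred.SequenceFileInputFormat;
    import org.apache.hadoop.mapred.TextOutputFormat;
    import org.apache.hadoop.mapred.lib.LongSumReducer;
    import org.apache.hadoop.util.Tool;
    import org.apache.hadoop.util.ToolRunner;

    public class QuickStartSketch extends Configured implements Tool {

      // Counts metadata records; key is the page URL, value is the JSON metadata.
      public static class RecordCountMapper extends MapReduceBase
          implements Mapper<Text, Text, Text, LongWritable> {
        public void map(Text key, Text value,
                        OutputCollector<Text, LongWritable> output, Reporter reporter)
            throws IOException {
          output.collect(new Text("pages"), new LongWritable(1));
        }
      }

      public int run(String[] args) throws Exception {
        JobConf job = new JobConf(getConf(), QuickStartSketch.class);

        // Step 1 of the README: the two lines of ~/.awssecret (placeholders here).
        job.set("fs.s3n.awsAccessKeyId",     "YOUR_ACCESS_ID");
        job.set("fs.s3n.awsSecretAccessKey", "YOUR_SECRET_KEY");

        // One segment's metadata files, as listed in step 1 of the README.
        FileInputFormat.addInputPath(job, new Path(
            "s3n://aws-publicdatasets/common-crawl/parse-output/segment/1341690167474/metadata-*"));
        FileOutputFormat.setOutputPath(job, new Path(args[0]));

        job.setInputFormat(SequenceFileInputFormat.class);
        job.setOutputFormat(TextOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        job.setMapperClass(RecordCountMapper.class);
        job.setReducerClass(LongSumReducer.class);

        JobClient.runJob(job);
        return 0;
      }

      public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new Configuration(), new QuickStartSketch(), args));
      }
    }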
/VERSION:
--------------------------------------------------------------------------------
1 | 1.0.1
2 |
--------------------------------------------------------------------------------
/bin/ccCopyToHDFS:
--------------------------------------------------------------------------------
1 | #!/bin/bash -aeu
2 |
3 | usage() {
4 | echo ""
5 | echo "$(basename $0) ( Save To Path [ # of Files to Download ] )"
6 | echo ""
7 | echo "i.e. $(basename $0) hdfs://localhost/common-crawl 25"
8 | echo ""
9 | exit 1
10 | }
11 |
12 | echo
13 | echo "-----------------------------------------------------------------"
14 | echo "* "
15 | echo "* Common Crawl Data Downloader"
16 | echo "* "
17 | echo "-----------------------------------------------------------------"
18 |
19 | if [ ! -r ~/.awssecret ]; then
20 | echo ""
21 | echo "ERROR: Please create a readable '.awssecret' file in your home directory."
22 | echo ""
23 | echo "The first line should be your AWS Access ID."
24 | echo ""
25 | echo "The second line should be your AWS Secret Key."
26 | echo ""
27 | exit 1
28 | fi
29 |
30 | AWS_ACCESS_ID=$(head -n 1 ~/.awssecret)
31 | AWS_SECRET_KEY=$(tail -n 1 ~/.awssecret)
32 |
33 | CC_PATH="s3n://aws-publicdatasets/common-crawl/parse-output"
34 |
35 | if [ $# -le 0 ]; then
36 | usage
37 | exit 0
38 | fi
39 |
40 | if [ $# -ge 1 ]; then
41 | OUTPUT_PATH="$1"
42 | fi
43 |
44 | if [ $# -ge 2 ]; then
45 | FILE_LIMIT="$2"
46 | FILE_LIMIT_PARAM="-filelimit $2"
47 | else
48 | FILE_LIMIT="-1"
49 | FILE_LIMIT_PARAM=""
50 | fi
51 |
52 | echo "INFO: Downloading list of valid segments"
53 | rm -f /tmp/cc-valid.txt
54 |
55 | hadoop fs -get ${CC_PATH}/valid_segments.txt /tmp/cc-valid.txt
56 |
57 | if [ ! -s /tmp/cc-valid.txt ]; then
58 | echo "ERROR: Unable to download valid segments list"
59 | exit 1
60 | fi
61 |
62 | while read SEGMENT_ID; do
63 | SOURCE_PATH="${CC_PATH}/segment/${SEGMENT_ID}"
64 | TARGET_PATH="${OUTPUT_PATH}/segment/${SEGMENT_ID}"
65 | echo "INFO: Running copy command for segment ${SEGMENT_ID}"
66 | echo "
67 | hadoop distcp \\
68 | -Dfs.s3n.awsAccessKeyId=\"**********\" -Dfs.s3n.awsSecretAccessKey=\"**********\" \\
69 | -i ${FILE_LIMIT_PARAM} \\
70 | ${SOURCE_PATH} \\
71 | ${TARGET_PATH}
72 | "
73 | hadoop distcp \
74 | -Dfs.s3n.awsAccessKeyId="${AWS_ACCESS_ID}" -Dfs.s3n.awsSecretAccessKey="${AWS_SECRET_KEY}" \
75 | -i ${FILE_LIMIT_PARAM} \
76 | ${SOURCE_PATH} \
77 | ${TARGET_PATH}
78 |
79 | if [ ${FILE_LIMIT} -gt 0 ]; then
80 | break
81 | fi
82 |
83 | done < /tmp/cc-valid.txt
84 |
85 |
--------------------------------------------------------------------------------
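ccCopyToHDFS reads valid_segments.txt and then runs one distcp per segment. For anyone scripting that step from Java instead of bash, the sketch below (not part of this repository; the class name is invented) reads the same list with the Hadoop FileSystem API and prints each source/target pair the script would hand to distcp. It assumes the s3n credentials are already present in the Hadoop configuration.

    import java.io.BufferedReader;
    import java.io.InputStreamReader;
    import java.net.URI;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    public class ListSegmentCopies {
      public static void main(String[] args) throws Exception {
        String ccPath = "s3n://aws-publicdatasets/common-crawl/parse-output";
        String target = args[0];   // e.g. hdfs://localhost/common-crawl

        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(URI.create(ccPath), conf);

        BufferedReader reader = new BufferedReader(new InputStreamReader(
            fs.open(new Path(ccPath + "/valid_segments.txt"))));

        String segmentId;
        while ((segmentId = reader.readLine()) != null) {
          segmentId = segmentId.trim();
          if (segmentId.length() == 0) continue;
          System.out.println(ccPath + "/segment/" + segmentId
              + "  ->  " + target + "/segment/" + segmentId);
        }
        reader.close();
      }
    }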
/bin/ccListInvalidSegments:
--------------------------------------------------------------------------------
1 | #!/bin/bash -aeu
2 |
3 | echo ""
4 | echo "> gathering valid segments"
5 | hadoop fs -ls s3n://aws-publicdatasets/common-crawl/parse-output/valid_segments | cut -d" " -f 17- | sort > /tmp/cc-valid.txt
6 | sed -i "s/valid_segments/segment/" /tmp/cc-valid.txt
7 |
8 | echo "> gathering all segments published"
9 | hadoop fs -ls s3n://aws-publicdatasets/common-crawl/parse-output/segment | cut -d" " -f 17- | sort > /tmp/cc-all.txt
10 | echo ""
11 |
12 | echo "* "
13 | echo "* List of Invalid Segments"
14 | echo "* "
15 | diff -b -w /tmp/cc-all.txt /tmp/cc-valid.txt | fgrep "segment" | sed "s/< /hadoop fs -rmr s3n:\/\/aws-publicdatasets/"
16 |
17 | #rm -f /tmp/cc-all.txt
18 | #rm -f /tmp/cc-valid.txt
19 |
20 |
--------------------------------------------------------------------------------
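ccListInvalidSegments does its comparison with 'hadoop fs -ls', sort, and diff. The equivalent in Java is a small set difference over FileSystem.listStatus results. The sketch below is illustrative only (the class name is invented) and assumes s3n credentials are already configured.

    import java.net.URI;
    import java.util.HashSet;
    import java.util.Set;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileStatus;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    public class ListInvalidSegments {
      public static void main(String[] args) throws Exception {
        String base = "s3n://aws-publicdatasets/common-crawl/parse-output";

        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(URI.create(base), conf);

        // Segment IDs that have been marked valid.
        Set<String> valid = new HashSet<String>();
        for (FileStatus f : fs.listStatus(new Path(base + "/valid_segments"))) {
          valid.add(f.getPath().getName());
        }

        // Anything under /segment that is not in the valid set is invalid.
        for (FileStatus f : fs.listStatus(new Path(base + "/segment"))) {
          if (!valid.contains(f.getPath().getName())) {
            System.out.println("invalid: " + f.getPath());
          }
        }
      }
    }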
/bin/ccRunExample:
--------------------------------------------------------------------------------
1 | #!/bin/bash -aeu
2 |
3 | BASE_PATH=`dirname $0`"/.."
4 | BASE_PATH=`cd ${BASE_PATH}; pwd`
5 |
6 | VERSION="$(cat ${BASE_PATH}/VERSION)"
7 |
8 | HDFS_LOCAL_HOSTNAME="localhost"
9 | MAIN_JAR="commoncrawl-examples-${VERSION}.jar"
10 | EXAMPLES_PATH="src/java/org/commoncrawl/examples"
11 | EXAMPLES_PKG="org.commoncrawl.examples"
12 |
13 | LOCAL_JAR_PATH="${BASE_PATH}/dist/lib"
14 |
15 | usage() {
16 | echo ""
17 | echo "$(basename $0) [ LocalHadoop | AmazonEMR ] [ ExampleName ] ( S3Bucket )"
18 | echo ""
19 | echo "Please pass in one of the following examples: "
20 | echo ""
21 | ls ${BASE_PATH}/${EXAMPLES_PATH} | sed 's/\.java$//'
22 | echo ""
23 | exit 1
24 | }
25 |
26 | echo
27 | echo "-----------------------------------------------------------------"
28 | echo "* "
29 | echo "* Common Crawl Example Library Runner"
30 | echo "* "
31 | echo "-----------------------------------------------------------------"
32 |
33 | if [ ! -r ~/.awssecret ]; then
34 | echo ""
35 | echo "ERROR: Please create a readable '.awssecret' file in your home directory."
36 | echo ""
37 | echo "The first line should be your AWS Access ID."
38 | echo ""
39 | echo "The second line should be your AWS Secret Key."
40 | echo ""
41 | exit 1
42 | fi
43 |
44 | AWS_ACCESS_ID=$(head -n 1 ~/.awssecret)
45 | AWS_SECRET_KEY=$(tail -n 1 ~/.awssecret)
46 |
47 | if [ ! -e ${LOCAL_JAR_PATH}/${MAIN_JAR} ]; then
48 | echo ""
49 | echo "ERROR: Please run the command 'ant' to build '${MAIN_JAR}' before attempting to run an example."
50 | echo ""
51 | exit 1
52 | fi
53 |
54 | # run the example provided on the command line
55 | if [ $# -lt 2 ]; then
56 | usage
57 | fi
58 |
59 | RUN_TYPE="$1"
60 | EXAMPLE="$2"
61 |
62 | # run the selected example
63 | if [ ! -f ${BASE_PATH}/${EXAMPLES_PATH}/${EXAMPLE}.java ]; then
64 | echo ""
65 | echo "ERROR: Cannot run example '${EXAMPLE}' - not found."
66 | echo ""
67 | echo "Please run one of the following:"
68 | echo ""
69 | ls ${BASE_PATH}/${EXAMPLES_PATH} | sed 's/\.java$//'
70 | echo ""
71 | exit 1
72 | fi
73 |
74 | if [ "${RUN_TYPE}" = "AmazonEMR" ]; then
75 |
76 | if [ $# -lt 3 ]; then
77 | echo ""
78 | echo "ERROR: To run an Amazon Elastic MapReduce job, you must supply an S3 bucket "
79 | echo " that you have permissions to write files to."
80 | echo ""
81 | usage
82 | fi
83 |
84 | S3_USER_BUCKET="$3"
85 |
86 | EMR_JAR_PATH="${S3_USER_BUCKET}/emr/jars"
87 | EMR_LOG_PATH="${S3_USER_BUCKET}/emr/logs"
88 | EMR_OUTPUT_PATH="${S3_USER_BUCKET}/emr/output/${EXAMPLE}"
89 |
90 | echo "* "
91 | echo "* Uploading JAR + Config to S3 '${EMR_JAR_PATH}'"
92 | echo "* "
93 | echo aws put ${EMR_JAR_PATH}/${MAIN_JAR} ${LOCAL_JAR_PATH}/${MAIN_JAR}
94 | aws put ${EMR_JAR_PATH}/${MAIN_JAR} ${LOCAL_JAR_PATH}/${MAIN_JAR}
95 | echo ""
96 |
97 | LOCAL_OUTPUT_PATH="${BASE_PATH}/output/${EXAMPLE}.tsv"
98 |
99 | # We've found that a single, high-memory instance works well for the master,
100 | # which runs the JobTracker
101 | MASTER_TYPE="m1.large" # consider using MASTER_TYPE="m2.4xlarge"
102 | CORE_TYPE="m1.large" # consider using CORE_TYPE="m2.2xlarge"
103 |
104 | # We've found the 'c1.xlarge' instance type to be most efficient for EMR
105 | # jobs - though we are open to suggestions!
106 | TASK_TYPE="c1.xlarge" # EMR = +$0.12 per instance hour
107 |
108 | INSTANCES=4
109 |
110 | BID="0.08"
111 |
112 | TIMESTAMP=$(date +%Y%m%d_%H%M%S)
113 | JOBNAME="Common_Crawl_${EXAMPLE}__${TIMESTAMP}"
114 |
115 | echo "-----------------------------------------------------------------"
116 | echo "* "
117 | echo "* Running Example '${EXAMPLE}'"
118 | echo "* "
119 | echo "* Starting Amazon Elastic MapReduce Job"
120 | echo "* "
121 | echo "-----------------------------------------------------------------"
122 |
123 | # Add in this option to specify a certain number of reducers:
124 | #
125 | # --arg "-Dmapred.reduce.tasks=${REDUCERS}" \
126 | #
127 |
128 | # if the line breaks don't work, join the following lines and remove all '\'
129 | echo \
130 | /opt/aws/emr/elastic-mapreduce --create --plain-output --name "${JOBNAME}" --ami-version="2.1.1" --hadoop-version="0.20.205" \
131 | --jar "s3n://${EMR_JAR_PATH}/${MAIN_JAR}" --step-name "Run_${EXAMPLE}" \
132 | --log-uri "s3n://${EMR_LOG_PATH}" \
133 | --main-class "${EXAMPLES_PKG}.${EXAMPLE}" \
134 | --access-id "********" --private-key "********" \
135 | --arg "-Dmapreduce.job.split.metainfo.maxsize=-1" \
136 | --arg "-Dmapred.max.map.failures.percent=50" \
137 | --arg "s3n://${EMR_OUTPUT_PATH}" \
138 | --instance-group master --instance-type "${MASTER_TYPE}" --instance-count 1 \
139 | --instance-group core --instance-type "${CORE_TYPE}" --instance-count 1 \
140 | --instance-group task --instance-type "${TASK_TYPE}" --instance-count ${INSTANCES} --bid-price ${BID}
141 | echo ""
142 |
143 | set +e
144 |
145 | THIS_PID=$$
146 |
147 | EMR_JOB_ID=$(/opt/aws/emr/elastic-mapreduce --create --plain-output --name "${JOBNAME}" --ami-version="2.1.1" --hadoop-version="0.20.205" \
148 | --jar "s3n://${EMR_JAR_PATH}/${MAIN_JAR}" --step-name "Run_${EXAMPLE}" \
149 | --log-uri "s3n://${EMR_LOG_PATH}" \
150 | --main-class "${EXAMPLES_PKG}.${EXAMPLE}" \
151 | --access-id "${AWS_ACCESS_ID}" --private-key "${AWS_SECRET_KEY}" \
152 | --arg "-Dmapreduce.job.split.metainfo.maxsize=-1" \
153 | --arg "-Dmapred.max.map.failures.percent=50" \
154 | --arg "s3n://${EMR_OUTPUT_PATH}" \
155 | --instance-group master --instance-type "${MASTER_TYPE}" --instance-count 1 \
156 | --instance-group core --instance-type "${CORE_TYPE}" --instance-count 1 \
157 | --instance-group task --instance-type "${TASK_TYPE}" --instance-count ${INSTANCES} --bid-price ${BID})
158 |
159 | RC=$?
160 |
161 | set -e
162 |
163 | if [ $RC -ne 0 ]; then
164 | echo "WARNING: Amazon EMR returned non-zero status code: $RC"
165 | fi
166 |
167 | if [ -z "${EMR_JOB_ID}" ]; then
168 | echo "WARNING: Unable to determine EMR Job ID"
169 | EMR_JOB_ID="[Amazon EMR Job ID]"
170 | fi
171 |
172 | echo ""
173 | echo "-----------------------------------------------------------------"
174 | echo "* "
175 | echo "* Your Amazon Elastic MapReduce job has been launched. "
176 | echo "* "
177 | echo "* Please look for '${JOBNAME}'"
178 | echo "* in your AWS Web Console."
179 | echo "* "
180 | echo "* Once the job has completed, run the following command to view "
181 | echo "* log files: "
182 | echo "* "
183 | echo "* hadoop dfs -get s3n://${EMR_LOG_PATH}/${EMR_JOB_ID} ${BASE_PATH}/logs"
184 | echo "* "
185 | echo "* and the following command to pull down the output files: "
186 | echo "* "
187 | echo "* hadoop fs -getmerge s3n://${EMR_OUTPUT_PATH} ${LOCAL_OUTPUT_PATH}"
188 | echo "* "
189 | echo "-----------------------------------------------------------------"
190 |
191 | mkdir -p ${BASE_PATH}/logs
192 |
193 | exit ${RC}
194 |
195 | fi
196 |
197 | if [ "${RUN_TYPE}" = "LocalHadoop" ]; then
198 |
199 | MAPRED_OUTPUT_PATH="hdfs://${HDFS_LOCAL_HOSTNAME}/user/${USER}/output/${EXAMPLE}"
200 | LOCAL_OUTPUT_PATH="${BASE_PATH}/output/${EXAMPLE}.tsv"
201 |
202 | echo "* "
203 | echo "* Running Example '${EXAMPLE}'"
204 | echo "* "
205 | echo "-----------------------------------------------------------------"
206 | echo hadoop jar ${LOCAL_JAR_PATH}/${MAIN_JAR} ${EXAMPLES_PKG}.${EXAMPLE} \
207 | ${MAPRED_OUTPUT_PATH} ${BASE_PATH}/conf/mapred.xml
208 | echo ""
209 |
210 | hadoop jar ${LOCAL_JAR_PATH}/${MAIN_JAR} ${EXAMPLES_PKG}.${EXAMPLE} \
211 | -Dfs.s3.awsAccessKeyId="${AWS_ACCESS_ID}" -Dfs.s3.awsSecretAccessKey="${AWS_SECRET_KEY}" \
212 | -Dfs.s3n.awsAccessKeyId="${AWS_ACCESS_ID}" -Dfs.s3n.awsSecretAccessKey="${AWS_SECRET_KEY}" \
213 | ${MAPRED_OUTPUT_PATH} ${BASE_PATH}/conf/mapred.xml
214 |
215 | RC=$?
216 |
217 | if [ $RC -ne 0 ]; then
218 | echo "-----------------------------------------------------------------"
219 | echo "* "
220 | echo "* There was a problem running '${EXAMPLE}'."
221 | echo "* "
222 | echo "* Please contact 'info@commoncrawl.org'."
223 | echo "* "
224 | echo "-----------------------------------------------------------------"
225 | exit $RC
226 | fi
227 |
228 | echo "-----------------------------------------------------------------"
229 | echo "* "
230 | echo "* Your MapReduce job '${EXAMPLE}' completed successfully!"
231 | echo "* "
232 | echo "* Copying output to the local file system:"
233 | echo "* "
234 | echo
235 | rm -f ${LOCAL_OUTPUT_PATH}
236 | echo hadoop fs -getmerge ${MAPRED_OUTPUT_PATH} ${LOCAL_OUTPUT_PATH}
237 | hadoop fs -getmerge ${MAPRED_OUTPUT_PATH} ${LOCAL_OUTPUT_PATH}
238 | echo
239 | echo "* "
240 | echo "* You can see the results of your job here:"
241 | echo "* "
242 | echo "* ${LOCAL_OUTPUT_PATH}"
243 | echo "* "
244 | echo "* Here are the first 15 lines of output:"
245 | echo "* "
246 | echo "-------------------------------------------------------------"
247 | echo
248 | head -n 15 ${LOCAL_OUTPUT_PATH}
249 | echo
250 |
251 | exit 0
252 |
253 | fi
254 |
255 |
--------------------------------------------------------------------------------
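For the LocalHadoop case above, ccRunExample ends up calling 'hadoop jar' with the example class name, -D properties for the S3 credentials, an output path, and the path to conf/mapred.xml. The sketch below shows how a ToolRunner-based driver generally picks those pieces up: GenericOptionsParser moves the -D options into the Configuration and leaves the positional arguments for run(). It is a generic illustration, not the code of the bundled examples, and DriverArgsSketch is an invented name.

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.conf.Configured;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.util.Tool;
    import org.apache.hadoop.util.ToolRunner;

    public class DriverArgsSketch extends Configured implements Tool {

      public int run(String[] args) throws Exception {
        Configuration conf = getConf();        // already holds the -D settings

        String outputPath = args[0];           // e.g. hdfs://localhost/user/<user>/output/<Example>
        if (args.length > 1) {
          conf.addResource(new Path(args[1])); // e.g. <base path>/conf/mapred.xml
        }

        System.out.println("Job output will go to " + outputPath);
        return 0;
      }

      public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new Configuration(), new DriverArgsSketch(), args));
      }
    }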
/build.properties:
--------------------------------------------------------------------------------
1 | #
2 | # Common Crawl Examples - Build Configuration Parameters
3 | #
4 |
5 | # Path to Hadoop libraries
6 | hadoop.path=/usr/share/hadoop
7 |
8 |
--------------------------------------------------------------------------------
/build.xml:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/java/org/commoncrawl/compressors/gzip/GzipCompressorInputStream.java:
--------------------------------------------------------------------------------
40 | * {@link java.util.zip.GZIPInputStream} doesn't decompress concatenated .gz
41 | * files: it stops after the first member and silently ignores the rest.
42 | * It doesn't leave the read position to point to the beginning of the next
43 | * member, which makes it difficult to work around the lack of concatenation
44 | * support.
45 | *
46 | * Instead of using GZIPInputStream, this class has its own .gz
47 | * container format decoder. The actual decompression is done with
48 | * {@link java.util.zip.Inflater}.
49 | */
50 | public class GzipCompressorInputStream extends CompressorInputStream {
51 | // Header flags
52 | // private static final int FTEXT = 0x01; // Uninteresting for us
53 | private static final int FHCRC = 0x02;
54 | private static final int FEXTRA = 0x04;
55 | private static final int FNAME = 0x08;
56 | private static final int FCOMMENT = 0x10;
57 | private static final int FRESERVED = 0xE0;
58 |
59 | // Compressed input stream, possibly wrapped in a BufferedInputStream
60 | private final InputStream in;
61 |
62 | // True if decompressing multimember streams.
63 | private final boolean decompressConcatenated;
64 |
65 | // Buffer to hold the input data
66 | private final byte[] buf = new byte[8192];
67 |
68 | // Amount of data in buf.
69 | private int bufUsed = 0;
70 |
71 | // Decompressor
72 | private Inflater inf = new Inflater(true);
73 |
74 | // CRC32 from uncompressed data
75 | private CRC32 crc = new CRC32();
76 |
77 | private int memberSize;
78 |
79 | // True once the end of a member has been reached and
80 | // 'decompressConcatenated' is false.
81 | private boolean stoppedForEndOfMember = false;
82 |
83 | // True once the end of stream has been reached.
84 | private boolean endOfStream = false;
85 |
86 | /**
87 | * Constructs a new input stream that decompresses gzip-compressed data
88 | * from the specified input stream.
89 | *
90 | * This is equivalent to
91 | * GzipCompressorInputStream(inputStream, false) and thus
92 | * will not decompress concatenated .gz files.
93 | *
94 | * @param inputStream the InputStream from which this object should
95 | * be created
96 | *
97 | * @throws IOException if the stream could not be created
98 | */
99 | public GzipCompressorInputStream(InputStream inputStream)
100 | throws IOException {
101 | this(inputStream, false);
102 | }
103 |
104 | /**
105 | * Constructs a new input stream that decompresses gzip-compressed data
106 | * from the specified input stream.
107 | *
108 | * If Always returns false to indicate that ARC files are not splittable. ARC files are stored in 100MB files, meaning they will be stored in at
37 | * most 3 blocks (2 blocks on Hadoop systems with 128MB block size). Creates an empty ARC record. Parses the ARC record header and payload (content) from a stream. Parses and sets the ARC record header fields. Currently, this method expects the ARC record header string to contain
156 | * the following fields, in order, separated by space:
157 | * decompressConcatenated
is {@code false}:
109 | * This decompressor might read more input than it will actually use.
110 | * If inputStream supports mark and
111 | * reset, then the input position will be adjusted
112 | * so that it is right after the last byte of the compressed stream.
113 | * If mark isn't supported, the input position will be
114 | * undefined.
115 | *
116 | * @param inputStream the InputStream from which this object should
117 | * be created
118 | * @param decompressConcatenated
119 | * if true, decompress until the end of the input;
120 | * if false, stop after the first .gz member
121 | *
122 | * @throws IOException if the stream could not be created
123 | */
124 | public GzipCompressorInputStream(InputStream inputStream,
125 | boolean decompressConcatenated)
126 | throws IOException {
127 | // Mark support is strictly needed for concatenated files only,
128 | // but it's simpler if it is always available.
129 | if (inputStream.markSupported()) {
130 | in = inputStream;
131 | } else {
132 | in = new BufferedInputStream(inputStream);
133 | }
134 |
135 | this.decompressConcatenated = decompressConcatenated;
136 | init(true);
137 | }
138 |
139 | private boolean init(boolean isFirstMember) throws IOException {
140 | assert isFirstMember || decompressConcatenated;
141 |
142 | // Check the magic bytes without a possibility of EOFException.
143 | int magic0 = in.read();
144 | int magic1 = in.read();
145 |
146 | // If end of input was reached after decompressing at least
147 | // one .gz member, we have reached the end of the file successfully.
148 | if (magic0 == -1 && !isFirstMember) {
149 | endOfStream = true;
150 | return false;
151 | }
152 |
153 | if (magic0 != 31 || magic1 != 139) {
154 | throw new IOException(isFirstMember
155 | ? "Input is not in the .gz format"
156 | : "Garbage after a valid .gz stream");
157 | }
158 |
159 | // Parsing the rest of the header may throw EOFException.
160 | DataInputStream inData = new DataInputStream(in);
161 | int method = inData.readUnsignedByte();
162 | if (method != 8) {
163 | throw new IOException("Unsupported compression method "
164 | + method + " in the .gz header");
165 | }
166 |
167 | int flg = inData.readUnsignedByte();
168 | if ((flg & FRESERVED) != 0) {
169 | throw new IOException(
170 | "Reserved flags are set in the .gz header");
171 | }
172 |
173 | inData.readInt(); // mtime, ignored
174 | inData.readUnsignedByte(); // extra flags, ignored
175 | inData.readUnsignedByte(); // operating system, ignored
176 |
177 | // Extra field, ignored
178 | if ((flg & FEXTRA) != 0) {
179 | int xlen = inData.readUnsignedByte();
180 | xlen |= inData.readUnsignedByte() << 8;
181 |
182 | // This isn't as efficient as calling in.skip would be,
183 | // but it's lazier to handle unexpected end of input this way.
184 | // Most files don't have an extra field anyway.
185 | while (xlen-- > 0) {
186 | inData.readUnsignedByte();
187 | }
188 | }
189 |
190 | // Original file name, ignored
191 | if ((flg & FNAME) != 0) {
192 | readToNull(inData);
193 | }
194 |
195 | // Comment, ignored
196 | if ((flg & FCOMMENT) != 0) {
197 | readToNull(inData);
198 | }
199 |
200 | // Header "CRC16" which is actually a truncated CRC32 (which isn't
201 | // as good as real CRC16). I don't know if any encoder implementation
202 | // sets this, so it's not worth trying to verify it. GNU gzip 1.4
203 | // doesn't support this field, but zlib seems to be able to at least
204 | // skip over it.
205 | if ((flg & FHCRC) != 0) {
206 | inData.readShort();
207 | }
208 |
209 | // Reset
210 | inf.reset();
211 | crc.reset();
212 | memberSize = 0;
213 |
214 | return true;
215 | }
216 |
217 | private void readToNull(DataInputStream inData) throws IOException {
218 | while (inData.readUnsignedByte() != 0x00) {}
219 | }
220 |
221 | /** {@inheritDoc} */
222 | @Override
223 | public int read() throws IOException {
224 | byte[] buf = new byte[1];
225 | return read(buf, 0, 1) == -1 ? -1 : (buf[0] & 0xFF);
226 | }
227 |
228 | /**
229 | * {@inheritDoc}
230 | *
231 | * @since 1.1
232 | */
233 | @Override
234 | public int read(byte[] b, int off, int len) throws IOException {
235 |
236 | if (stoppedForEndOfMember || endOfStream) {
237 | return -1;
238 | }
239 |
240 | int size = 0;
241 |
242 | while (len > 0) {
243 | if (inf.needsInput()) {
244 | // Remember the current position because we may need to
245 | // rewind after reading too much input.
246 | in.mark(buf.length);
247 |
248 | bufUsed = in.read(buf);
249 | if (bufUsed == -1) {
250 | throw new EOFException();
251 | }
252 |
253 | inf.setInput(buf, 0, bufUsed);
254 | }
255 |
256 | int ret;
257 | try {
258 | ret = inf.inflate(b, off, len);
259 | } catch (DataFormatException e) {
260 | throw new IOException("Gzip-compressed data is corrupt");
261 | }
262 |
263 | crc.update(b, off, ret);
264 | memberSize += ret;
265 | off += ret;
266 | len -= ret;
267 | size += ret;
268 | count(ret);
269 |
270 | if (inf.finished()) {
271 | // We may have read too many bytes. Rewind the read
272 | // position to match the actual amount used.
273 | //
274 | // NOTE: The "if" is there just in case. Since we used
275 | // in.mark earlier, it should always skip enough.
276 | in.reset();
277 |
278 | int skipAmount = bufUsed - inf.getRemaining();
279 | if (in.skip(skipAmount) != skipAmount) {
280 | throw new IOException();
281 | }
282 |
283 | bufUsed = 0;
284 |
285 | DataInputStream inData = new DataInputStream(in);
286 |
287 | // CRC32
288 | long crcStored = 0;
289 | for (int i = 0; i < 4; ++i) {
290 | crcStored |= (long)inData.readUnsignedByte() << (i * 8);
291 | }
292 |
293 | if (crcStored != crc.getValue()) {
294 | throw new IOException("Gzip-compressed data is corrupt "
295 | + "(CRC32 error)");
296 | }
297 |
298 | // Uncompressed size modulo 2^32 (ISIZE in the spec)
299 | int isize = 0;
300 | for (int i = 0; i < 4; ++i) {
301 | isize |= inData.readUnsignedByte() << (i * 8);
302 | }
303 |
304 | if (isize != memberSize) {
305 | throw new IOException("Gzip-compressed data is corrupt "
306 | + "(uncompressed size mismatch)");
307 | }
308 |
309 |
310 | if (!decompressConcatenated) {
311 | stoppedForEndOfMember = true;
312 | }
313 |
314 | // See if this is the end of the file.
315 | endOfStream = !init(false);
316 |
317 | if (stoppedForEndOfMember || endOfStream) {
318 | return size == 0 ? -1 : size;
319 | }
320 | }
321 | }
322 |
323 | return size;
324 | }
325 |
326 | /**
327 | * Checks if the signature matches what is expected for a .gz file.
328 | *
329 | * @param signature the bytes to check
330 | * @param length the number of bytes to check
331 | * @return true if this is a .gz stream, false otherwise
332 | *
333 | * @since 1.1
334 | */
335 | public static boolean matches(byte[] signature, int length) {
336 |
337 | if (length < 2) {
338 | return false;
339 | }
340 |
341 | if (signature[0] != 31) {
342 | return false;
343 | }
344 |
345 | if (signature[1] != -117) {
346 | return false;
347 | }
348 |
349 | return true;
350 | }
351 |
352 | /**
353 | * Closes the input stream (unless it is System.in).
354 | *
355 | * @since 1.2
356 | */
357 | @Override
358 | public void close() throws IOException {
359 | if (inf != null) {
360 | inf.end();
361 | inf = null;
362 | }
363 |
364 | if (this.in != System.in) {
365 | this.in.close();
366 | }
367 | }
368 |
369 | /**
370 | * Explicitly instructs the stream to allow an additional concatenated
371 | * member to be read.
372 | *
373 | * @since 1.x.x
374 | */
375 | public boolean nextMember() {
376 |
377 | if (endOfStream)
378 | return false;
379 |
380 | stoppedForEndOfMember = false;
381 |
382 | return true;
383 | }
384 | }
385 |
--------------------------------------------------------------------------------
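A short usage sketch for the class above (not part of the repository): reading a concatenated .gz file, such as a Common Crawl ARC file, one gzip member at a time by constructing the stream with decompressConcatenated set to false and calling nextMember() between members. The class name and input path are placeholders.

    import java.io.FileInputStream;
    import java.io.InputStream;

    import org.commoncrawl.compressors.gzip.GzipCompressorInputStream;

    public class ReadMembers {
      public static void main(String[] args) throws Exception {
        InputStream raw = new FileInputStream(args[0]);   // e.g. a local *.arc.gz file
        GzipCompressorInputStream gzip = new GzipCompressorInputStream(raw, false);

        byte[] buffer = new byte[8192];
        int member = 0;

        do {
          member++;
          long bytes = 0;
          int n;
          // A return of -1 here means "end of the current member", not end of file.
          while ((n = gzip.read(buffer, 0, buffer.length)) != -1) {
            bytes += n;
          }
          System.out.println("member " + member + ": " + bytes + " uncompressed bytes");
        } while (gzip.nextMember());                      // false once the file is exhausted

        gzip.close();
      }
    }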
/src/java/org/commoncrawl/examples/ExampleArcMicroformat.java:
--------------------------------------------------------------------------------
1 | package org.commoncrawl.examples;
2 |
3 | // Java classes
4 | import java.lang.IllegalArgumentException;
5 | import java.lang.Integer;
6 | import java.lang.Math;
7 | import java.lang.OutOfMemoryError;
8 | import java.io.BufferedReader;
9 | import java.io.ByteArrayInputStream;
10 | import java.io.DataOutputStream;
11 | import java.io.File;
12 | import java.io.FileReader;
13 | import java.io.IOException;
14 | import java.net.URI;
15 | import java.util.Arrays;
16 |
17 | // log4j classes
18 | import org.apache.log4j.Logger;
19 |
20 | // Hadoop classes
21 | import org.apache.hadoop.conf.Configured;
22 | import org.apache.hadoop.conf.Configuration;
23 | import org.apache.hadoop.fs.FSDataOutputStream;
24 | import org.apache.hadoop.fs.FileStatus;
25 | import org.apache.hadoop.fs.FileSystem;
26 | import org.apache.hadoop.fs.Path;
27 | import org.apache.hadoop.fs.PathFilter;
28 | import org.apache.hadoop.io.LongWritable;
29 | import org.apache.hadoop.io.Text;
30 | import org.apache.hadoop.mapred.FileInputFormat;
31 | import org.apache.hadoop.mapred.FileOutputFormat;
32 | import org.apache.hadoop.mapred.InputSplit;
33 | import org.apache.hadoop.mapred.JobClient;
34 | import org.apache.hadoop.mapred.JobConf;
35 | import org.apache.hadoop.mapred.Mapper;
36 | import org.apache.hadoop.mapred.MapReduceBase;
37 | import org.apache.hadoop.mapred.OutputCollector;
38 | import org.apache.hadoop.mapred.Reporter;
39 | import org.apache.hadoop.mapred.TextOutputFormat;
40 | import org.apache.hadoop.mapred.lib.LongSumReducer;
41 | import org.apache.hadoop.util.Progressable;
42 | import org.apache.hadoop.util.Tool;
43 | import org.apache.hadoop.util.ToolRunner;
44 |
45 | // Common Crawl classes
46 | import org.commoncrawl.hadoop.mapred.ArcInputFormat;
47 | import org.commoncrawl.hadoop.mapred.ArcRecord;
48 |
49 | // jsoup classes
50 | import org.jsoup.Jsoup;
51 | import org.jsoup.nodes.Document;
52 | import org.jsoup.nodes.Element;
53 | import org.jsoup.select.Elements;
54 |
55 | /**
56 | * An example showing how to analyze the Common Crawl ARC web content files.
57 | *
58 | * @author Chris Stephens RecordReader
for reading the arc file.
23 | *
24 | * @param split The InputSplit of the arc file to process.
25 | * @param job The job configuration.
26 | * @param reporter The progress reporter.
27 | */
28 | public RecordReader
158 | *
164 | *
For more information on the arc file format, see 166 | * {@link http://www.archive.org/web/researcher/ArcFileFormat.php}.
167 | * 168 | * @param arcRecordHeader The first line of an ARC file entry - the header 169 | * line for an ARC file item. 170 | */ 171 | public void setArcRecordHeader(String arcRecordHeader) 172 | throws IllegalArgumentException, ParseException { 173 | 174 | if (arcRecordHeader == null || arcRecordHeader.equals("")) 175 | throw new IllegalArgumentException("ARC v1 record header string is empty."); 176 | 177 | String[] metadata = arcRecordHeader.split(" "); 178 | 179 | if (metadata.length != 5) { 180 | LOG.info(" [ "+arcRecordHeader+" ] "); 181 | throw new IllegalArgumentException("ARC v1 record header must be 5 fields."); 182 | } 183 | 184 | SimpleDateFormat format = new SimpleDateFormat("yyyyMMddHHmmss"); 185 | 186 | this._url = metadata[0]; 187 | this._ipAddress = metadata[1]; 188 | this._archiveDate = format.parse(metadata[2]); 189 | this._contentType = metadata[3]; 190 | this._contentLength = (new Integer(metadata[4])).intValue(); 191 | } 192 | 193 | /** 194 | *Reads and sets the ARC record payload from an input stream.
195 | * 196 | * @param in An input stream positioned at the start of the ARC record payload. 197 | */ 198 | public void setPayload(InputStream in) 199 | throws IllegalArgumentException, ParseException, IOException { 200 | 201 | if (in == null) 202 | throw new IllegalArgumentException("ArcRecord cannot be created from NULL/missing input stream."); 203 | 204 | int bufferSize = this._contentLength; 205 | 206 | this._payload = new byte[bufferSize]; 207 | 208 | int n = in.read(this._payload, 0, this._payload.length); 209 | 210 | if (n < this._payload.length) { 211 | LOG.warn("Expecting "+bufferSize+" bytes in ARC record payload, found "+n+" bytes. Performing array copy."); 212 | this._payload = Arrays.copyOf(this._payload, n); 213 | } 214 | 215 | // After this, we should be at the end of this GZIP member. Let the 216 | // calling function verify the position of the stream. 217 | } 218 | 219 | public void addToPayload(byte[] data) { 220 | this.addToPayload(data, data.length); 221 | } 222 | 223 | public void addToPayload(byte[] data, int length) { 224 | 225 | LOG.warn("Content Length must have been incorrect - someone needed to add more data to the payload."); 226 | 227 | if (this._payload == null) { 228 | this._payload = Arrays.copyOf(data, length); 229 | } 230 | else { 231 | int i = this._payload.length; 232 | int n = this._payload.length + length; 233 | 234 | // resize the payload buffer 235 | this._payload = Arrays.copyOf(this._payload, n); 236 | 237 | // copy in the additional data 238 | System.arraycopy(data, 0, this._payload, i, length); 239 | } 240 | } 241 | 242 | /** 243 | * {@inheritDoc} 244 | */ 245 | public String toString() { 246 | return this._url + " - " + this._archiveDate.toString() + " - " + this._contentType; 247 | } 248 | 249 | /** 250 | * {@inheritDoc} 251 | */ 252 | public void write(DataOutput out) 253 | throws IOException { 254 | 255 | // write out ARC header info 256 | out.writeUTF(this._url); 257 | out.writeUTF(this._ipAddress); 258 | out.writeUTF(this._contentType); 259 | out.writeLong(this._archiveDate.getTime()); 260 | out.writeInt(this._contentLength); 261 | 262 | // write out the payload 263 | out.writeInt(this._payload.length); 264 | out.write(this._payload, 0, this._payload.length); 265 | } 266 | 267 | /** 268 | * {@inheritDoc} 269 | */ 270 | public void readFields(DataInput in) 271 | throws IOException { 272 | 273 | // read in ARC header info 274 | this._url = in.readUTF(); 275 | this._ipAddress = in.readUTF(); 276 | this._contentType = in.readUTF(); 277 | this._archiveDate = new Date(in.readLong()); 278 | this._contentLength = in.readInt(); 279 | 280 | // read in the payload 281 | int payloadLength = in.readInt(); 282 | 283 | // resize the payload buffer if necessary 284 | if (this._payload == null || this._payload.length != payloadLength) 285 | this._payload = new byte[payloadLength]; 286 | 287 | try { 288 | in.readFully(this._payload, 0, payloadLength); 289 | } 290 | catch (EOFException ex) { 291 | throw new IOException("End of input reached before payload was fully deserialized."); 292 | } 293 | 294 | // assume that if a new payload was loaded, HTTP response will need to be reparsed. 295 | this._httpResponse = null; 296 | } 297 | 298 | /** 299 | *Returns the full ARC record payload. This is usually a complete HTTP 300 | * response.
301 | * 302 | * @return The raw ARC record content. 303 | */ 304 | public byte[] getPayload() { 305 | return this._payload; 306 | } 307 | 308 | /** 309 | *Returns the URL from the ARC record header.
310 | * 311 | * @return The URL for this entry. 312 | */ 313 | public String getURL() { 314 | return this._url; 315 | } 316 | 317 | /** 318 | *Returns the IP address from the ARC record header.
319 | * 320 | * @return The IP address for this entry. 321 | */ 322 | public String getIpAddress() { 323 | return this._ipAddress; 324 | } 325 | 326 | /** 327 | *Returns the archive date from the ARC record header.
328 | * 329 | * @return The archive date for this entry. 330 | */ 331 | public Date getArchiveDate() { 332 | return this._archiveDate; 333 | } 334 | 335 | /** 336 | *Returns the MIME content type from the ARC record header.
337 | *Note: The MIME content type in the ARC record header is not necessarily the
338 | * same as the Content-Type
HTTP header inside the content body
339 | * (if one is present).
Returns the content length from the ARC record header.
349 | *Note: The content length in the ARC record header is not necessarily the
350 | * same as the Content-Length
HTTP header inside the content body
351 | * (if one is present).
Returns the HTTP status code.
361 | *If the payload could not be parsed as an HTTP response, returns -1.
362 | *Warning: if the payload has not yet been parsed as an HTTP response, 363 | * calling this function parses the full response. Parsing is only performed 364 | * once - parsed data is retained for subsequent calls.
365 | * 366 | * @return The HTTP status code. 367 | */ 368 | public int getHttpStatusCode() 369 | throws IOException, HttpException { 370 | 371 | HttpResponse httpResponse = this.getHttpResponse(); 372 | 373 | if (httpResponse == null) 374 | return -1; 375 | 376 | return httpResponse.getStatusLine().getStatusCode(); 377 | } 378 | 379 | /** 380 | *Returns an array of HTTP headers.
381 | *If the payload could not be parsed as an HTTP response, returns null
.
Warning: if the payload has not yet been parsed as an HTTP response, 383 | * calling this function parses the full response. Parsing is only performed 384 | * once - parsed data is retained for subsequent calls.
385 | * 386 | * @return An array of HTTP headers. 387 | */ 388 | public Header[] getHttpHeaders() 389 | throws IOException, HttpException { 390 | 391 | HttpResponse httpResponse = this.getHttpResponse(); 392 | 393 | if (httpResponse == null) 394 | return null; 395 | 396 | return httpResponse.getAllHeaders(); 397 | } 398 | 399 | /** 400 | * 401 | */ 402 | public static class ByteArraySessionInputBuffer 403 | extends AbstractSessionInputBuffer { 404 | 405 | public ByteArraySessionInputBuffer(byte[] buf) { 406 | BasicHttpParams params = new BasicHttpParams(); 407 | this.init(new ByteArrayInputStream(buf), 4096, params); 408 | } 409 | 410 | public ByteArraySessionInputBuffer(byte[] buf, int offset, int length) { 411 | BasicHttpParams params = new BasicHttpParams(); 412 | this.init(new ByteArrayInputStream(buf, offset, length), 4096, params); 413 | } 414 | 415 | public boolean isDataAvailable(int timeout) { 416 | return true; 417 | } 418 | } 419 | 420 | /** 421 | *Helper function to search a byte array for CR-LF-CR-LF (the end of 422 | * HTTP headers in the payload buffer).
423 | * 424 | * @return The offset of the end of HTTP headers, after the last CRLF. 425 | */ 426 | private int _searchForCRLFCRLF(byte[] data) { 427 | 428 | final byte CR = (byte)'\r'; 429 | final byte LF = (byte)'\n'; 430 | 431 | int i; 432 | int s = 0; 433 | 434 | for (i = 0; i < data.length; i++) { 435 | 436 | if (data[i] == CR) { 437 | if (s == 0) s = 1; 438 | else if (s == 1) s = 0; 439 | else if (s == 2) s = 3; 440 | else if (s == 3) s = 0; 441 | } 442 | else if (data[i] == LF) { 443 | if (s == 0) s = 0; 444 | else if (s == 1) s = 2; 445 | else if (s == 2) s = 0; 446 | else if (s == 3) s = 4; 447 | } 448 | else { 449 | s = 0; 450 | } 451 | 452 | if (s == 4) 453 | return i + 1; 454 | } 455 | 456 | return -1; 457 | } 458 | 459 | /** 460 | *Returns an HTTP response object parsed from the ARC record payload.
461 | *
Note: The payload is parsed on-demand, but is only parsed once. The 462 | * parsed data is saved for subsequent calls.
463 | * 464 | * @return The ARC record payload as an HTTP response object. See the Apache 465 | * HttpComponents project. 466 | */ 467 | public HttpResponse getHttpResponse() 468 | throws IOException, HttpException { 469 | 470 | if (this._httpResponse != null) 471 | return this._httpResponse; 472 | 473 | if (this._payload == null) { 474 | LOG.error("Unable to parse HTTP response: Payload has not been set"); return null; 475 | } 476 | 477 | if (this._url != null && !this._url.startsWith("http://") && !this._url.startsWith("https://")) { 478 | LOG.error("Unable to parse HTTP response: URL protocol is not HTTP"); return null; 479 | } 480 | 481 | this._httpResponse = null; 482 | 483 | // Find where the HTTP headers stop 484 | int end = this._searchForCRLFCRLF(this._payload); 485 | 486 | if (end == -1) { 487 | LOG.error("Unable to parse HTTP response: End of HTTP headers not found"); return null; 488 | } 489 | 490 | // Parse the HTTP status line and headers 491 | DefaultHttpResponseParser parser = 492 | new DefaultHttpResponseParser( 493 | new ByteArraySessionInputBuffer(this._payload, 0, end), 494 | new BasicLineParser(), 495 | new DefaultHttpResponseFactory(), 496 | new BasicHttpParams() 497 | ); 498 | 499 | this._httpResponse = parser.parse(); 500 | 501 | if (this._httpResponse == null) { 502 | LOG.error("Unable to parse HTTP response"); return null; 503 | } 504 | 505 | // Set the reset of the payload as the HTTP entity. Use an InputStreamEntity 506 | // to avoid a memory copy. 507 | InputStreamEntity entity = new InputStreamEntity(new ByteArrayInputStream(this._payload, end, this._payload.length - end), this._payload.length - end); 508 | entity.setContentType(this._httpResponse.getFirstHeader("Content-Type")); 509 | entity.setContentEncoding(this._httpResponse.getFirstHeader("Content-Encoding")); 510 | this._httpResponse.setEntity(entity); 511 | 512 | return this._httpResponse; 513 | } 514 | 515 | /** 516 | *Returns a Jsoup HTML document, parsed using the Charset in the 517 | * "Content-Type" header. If the document charset cannot be found, parse is 518 | * attempted using
519 | * 520 | * @return A Jsoup parsed HTML document from the HTTP response content. 521 | */ 522 | public Document getParsedHTML() 523 | throws IOException { 524 | 525 | if (this._url == null) { 526 | LOG.error("Unable to parse HTML: URL from ARC header has not been set"); 527 | return null; 528 | } 529 | 530 | // if response has not been parsed yet, this parses it 531 | try { 532 | this.getHttpResponse(); 533 | } 534 | catch (HttpException ex) { 535 | LOG.error("Unable to parse HTML: Exception during HTTP response parsing"); return null; 536 | } 537 | 538 | if (this._httpResponse == null) { 539 | LOG.error("Unable to parse HTML: Exception during HTTP response parsing"); return null; 540 | } 541 | 542 | if (this._httpResponse.getEntity() == null) { 543 | LOG.error("Unable to parse HTML: No HTTP response entity found"); return null; 544 | } 545 | 546 | if (!this._contentType.toLowerCase().contains("html")) { 547 | LOG.warn("Unable to parse HTML: Content is not HTML"); return null; 548 | } 549 | 550 | String charset = null; 551 | 552 | try { 553 | // Default value returned is "text/plain" with charset of ISO-8859-1. 554 | charset = ContentType.getOrDefault(this._httpResponse.getEntity()).getCharset().name(); 555 | } 556 | catch (Throwable ex) { 557 | 558 | } 559 | 560 | // if anything goes wrong, try ISO-8859-1 561 | if (charset == null) 562 | charset = "ISO-8859-1"; 563 | 564 | // parse the content using the derived charset and the URL from the ARC header 565 | return Jsoup.parse(this._httpResponse.getEntity().getContent(), charset, this._url); 566 | } 567 | } 568 | 569 | -------------------------------------------------------------------------------- /src/java/org/commoncrawl/hadoop/mapred/ArcRecordReader.java: -------------------------------------------------------------------------------- 1 | package org.commoncrawl.hadoop.mapred; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.EOFException; 5 | import java.io.IOException; 6 | import java.io.InputStream; 7 | import java.io.InputStreamReader; 8 | 9 | import java.lang.Math; 10 | import java.lang.StringBuffer; 11 | import java.util.Arrays; 12 | 13 | import org.apache.hadoop.conf.Configuration; 14 | import org.apache.hadoop.fs.FSDataInputStream; 15 | import org.apache.hadoop.fs.FileSystem; 16 | import org.apache.hadoop.fs.Path; 17 | import org.apache.hadoop.io.Text; 18 | import org.apache.hadoop.mapred.FileSplit; 19 | import org.apache.hadoop.mapred.RecordReader; 20 | 21 | import org.apache.log4j.Logger; 22 | 23 | import org.commoncrawl.compressors.gzip.GzipCompressorInputStream; 24 | 25 | /** 26 | * Reads ARC records. 27 | * 28 | * Set "io.file.buffer.size" to define the amount of data that should be 29 | * buffered from S3. 30 | */ 31 | public class ArcRecordReader 32 | implements RecordReaderRecordReader
for reading the arc file.
38 | *
39 | * @param split The InputSplit of the arc file to process.
40 | * @param job The job configuration.
41 | * @param reporter The progress reporter.
42 | */
43 | public RecordReaderThe ArcRecordReader
class provides a record reader which
41 | * reads records from arc files.
Arc files are essentially tars of gzips. Each record in an arc file is 44 | * a compressed gzip. Multiple records are concatenated together to form a 45 | * complete arc. For more information on the arc file format see 46 | * {@link http://www.archive.org/web/researcher/ArcFileFormat.php } .
47 | * 48 | *Arc files are used by the internet archive and grub projects.
49 | * 50 | * see {@link http://www.archive.org/ } 51 | * see {@link http://www.grub.org/ } 52 | */ 53 | public class ArcRecordReader 54 | implements RecordReaderReturns true if the byte array passed matches the gzip header magic 70 | * number.
71 | * 72 | * @param input The byte array to check. 73 | * 74 | * @return True if the byte array matches the gzip header magic number. 75 | */ 76 | public static boolean isMagic(byte[] input) { 77 | 78 | // check for null and incorrect length 79 | if (input == null || input.length != MAGIC.length) { 80 | return false; 81 | } 82 | 83 | // check byte by byte 84 | for (int i = 0; i < MAGIC.length; i++) { 85 | if (MAGIC[i] != input[i]) { 86 | return false; 87 | } 88 | } 89 | 90 | // must match 91 | return true; 92 | } 93 | 94 | /** 95 | * Constructor that sets the configuration and file split. 96 | * 97 | * @param conf The job configuration. 98 | * @param split The file split to read from. 99 | * 100 | * @throws IOException If an IO error occurs while initializing file split. 101 | */ 102 | public ArcRecordReader(Configuration conf, FileSplit split) 103 | throws IOException { 104 | 105 | Path path = split.getPath(); 106 | FileSystem fs = path.getFileSystem(conf); 107 | fileLen = fs.getFileStatus(split.getPath()).getLen(); 108 | this.conf = conf; 109 | this.in = fs.open(split.getPath()); 110 | this.splitStart = split.getStart(); 111 | this.splitEnd = splitStart + split.getLength(); 112 | this.splitLen = split.getLength(); 113 | in.seek(splitStart); 114 | } 115 | 116 | /** 117 | * Closes the record reader resources. 118 | */ 119 | public void close() 120 | throws IOException { 121 | this.in.close(); 122 | } 123 | 124 | /** 125 | * Creates a new instance of theText
object for the key.
126 | */
127 | public Text createKey() {
128 | return (Text)ReflectionUtils.newInstance(Text.class, conf);
129 | }
130 |
131 | /**
132 | * Creates a new instance of the BytesWritable
object for the key
133 | */
134 | public BytesWritable createValue() {
135 | return (BytesWritable)ReflectionUtils.newInstance(BytesWritable.class, conf);
136 | }
137 |
138 | /**
139 | * Returns the current position in the file.
140 | *
141 | * @return The long of the current position in the file.
142 | */
143 | public long getPos()
144 | throws IOException {
145 | return in.getPos();
146 | }
147 |
148 | /**
149 | * Returns the percentage of progress in processing the file. This will be
150 | * represented as a float from 0 to 1 with 1 being 100% completed.
151 | *
152 | * @return The percentage of progress as a float from 0 to 1.
153 | */
154 | public float getProgress()
155 | throws IOException {
156 |
157 | // if we haven't even started
158 | if (splitEnd == splitStart) {
159 | return 0.0f;
160 | }
161 | else {
162 | // the progress is current pos - where we started / length of the split
163 | return Math.min(1.0f, (getPos() - splitStart) / (float)splitLen);
164 | }
165 | }
166 |
167 | /**
168 | * Returns true if the next record in the split is read into the key and 169 | * value pair. The key will be the arc record header and the values will be 170 | * the raw content bytes of the arc record.
171 | * 172 | * @param key The record key 173 | * @param value The record value 174 | * 175 | * @return True if the next record is read. 176 | * 177 | * @throws IOException If an error occurs while reading the record value. 178 | */ 179 | public boolean next(Text key, BytesWritable value) 180 | throws IOException { 181 | 182 | try { 183 | 184 | // get the starting position on the input stream 185 | long startRead = in.getPos(); 186 | byte[] magicBuffer = null; 187 | 188 | // we need this loop to handle false positives in reading of gzip records 189 | while (true) { 190 | 191 | // while we haven't passed the end of the split 192 | if (startRead >= splitEnd) { 193 | return false; 194 | } 195 | 196 | // scanning for the gzip header 197 | boolean foundStart = false; 198 | while (!foundStart) { 199 | 200 | // start at the current file position and scan for 1K at time, break 201 | // if there is no more to read 202 | startRead = in.getPos(); 203 | magicBuffer = new byte[1024]; 204 | int read = in.read(magicBuffer); 205 | if (read < 0) { 206 | break; 207 | } 208 | 209 | // scan the byte array for the gzip header magic number. This happens 210 | // byte by byte 211 | for (int i = 0; i < read - 1; i++) { 212 | byte[] testMagic = new byte[2]; 213 | System.arraycopy(magicBuffer, i, testMagic, 0, 2); 214 | if (isMagic(testMagic)) { 215 | // set the next start to the current gzip header 216 | startRead += i; 217 | foundStart = true; 218 | break; 219 | } 220 | } 221 | } 222 | 223 | // seek to the start of the gzip header 224 | in.seek(startRead); 225 | ByteArrayOutputStream baos = null; 226 | int totalRead = 0; 227 | 228 | try { 229 | 230 | // read 4K of the gzip at a time putting into a byte array 231 | byte[] buffer = new byte[4096]; 232 | GZIPInputStream zin = new GZIPInputStream(in); 233 | int gzipRead = -1; 234 | baos = new ByteArrayOutputStream(); 235 | while ((gzipRead = zin.read(buffer, 0, buffer.length)) != -1) { 236 | baos.write(buffer, 0, gzipRead); 237 | totalRead += gzipRead; 238 | } 239 | } 240 | catch (Exception e) { 241 | 242 | // there are times we get false positives where the gzip header exists 243 | // but it is not an actual gzip record, so we ignore it and start 244 | // over seeking 245 | // LOG.debug("Ignoring position: " + (startRead)); 246 | if (startRead + 1 < fileLen) { 247 | in.seek(startRead + 1); 248 | } 249 | continue; 250 | } 251 | 252 | // change the output stream to a byte array 253 | byte[] content = baos.toByteArray(); 254 | 255 | // the first line of the raw content in arc files is the header 256 | int eol = 0; 257 | for (int i = 0; i < content.length; i++) { 258 | if (i > 0 && content[i] == '\n') { 259 | eol = i; 260 | break; 261 | } 262 | } 263 | 264 | // create the header and the raw content minus the header 265 | String header = new String(content, 0, eol).trim(); 266 | byte[] raw = new byte[(content.length - eol) - 1]; 267 | System.arraycopy(content, eol + 1, raw, 0, raw.length); 268 | 269 | // populate key and values with the header and raw content. 270 | Text keyText = (Text)key; 271 | keyText.set(header); 272 | BytesWritable valueBytes = (BytesWritable)value; 273 | valueBytes.set(raw, 0, raw.length); 274 | 275 | // TODO: It would be best to start at the end of the gzip read but 276 | // the bytes read in gzip don't match raw bytes in the file so we 277 | // overshoot the next header. With this current method you get 278 | // some false positives but don't miss records. 
279 | if (startRead + 1 < fileLen) { 280 | in.seek(startRead + 1); 281 | } 282 | 283 | // populated the record, now return 284 | return true; 285 | } 286 | } 287 | catch (Exception e) { 288 | LOG.equals(StringUtils.stringifyException(e)); 289 | } 290 | 291 | // couldn't populate the record or there is no next record to read 292 | return false; 293 | } 294 | } 295 | -------------------------------------------------------------------------------- /src/ruby/ExampleArcParseMap.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require 'rubygems' 4 | require 'open3' 5 | require 'uri' 6 | 7 | # Inline these classes so we don't have to copy a file while bootstrapping 8 | class ArcRecord 9 | attr_accessor :num, :url, :ip_address, :archive_date, :content_type, :content_length, :content 10 | end 11 | 12 | class ArcFile 13 | 14 | include Enumerable 15 | 16 | def initialize(input_stream) 17 | @handle=input_stream 18 | end 19 | 20 | def each 21 | return self.to_enum unless block_given? 22 | begin 23 | # See http://www.archive.org/web/researcher/ArcFileFormat.php 24 | # for information about the ARC format once it is decompressed 25 | file_header = @handle.readline.strip 26 | @handle.read(Integer(file_header.split.last)) 27 | i=1 28 | 29 | loop do 30 | begin 31 | fields = @handle.readline.strip.split(" ") 32 | raise "Invalid ARC record header found" if fields.length != 5 33 | warn("Invalid protocol in ARC record header") if not fields[0].to_s.start_with?("http://", "https://") 34 | 35 | record = ArcRecord.new 36 | record.num = i 37 | record.url = fields[0].to_s 38 | record.ip_address = fields[1].to_s 39 | record.archive_date = fields[2].to_s 40 | record.content_type = fields[3].to_s 41 | record.content_length = Integer(fields[4]) 42 | record.content = @handle.read(record.content_length) 43 | i = i+1 44 | 45 | yield record 46 | 47 | rescue EOFError 48 | break nil 49 | end 50 | end 51 | #rescue 52 | # raise "#{self.class}: Record ##{i} - Error - #{$!}" 53 | end 54 | end 55 | 56 | end 57 | 58 | CHUNKSIZE=1024*1024 59 | 60 | # All warnings will end up in the EMR stderr logs. 61 | warn("Starting up GZIP process, piping #{CHUNKSIZE/1024}KB chunks at a time") 62 | 63 | # Ruby GzipReader is unable to unzip these files, but unix gunzip can 64 | # Also means we don't need to eat much RAM, because everything is streaming. 65 | Open3.popen3('gunzip -c') {|sin,sout,serr,thr| 66 | 67 | # Create an ArcFile instance which will receive gunzip's stdout 68 | arcfile = ArcFile.new(sout) 69 | 70 | Thread.new do 71 | loop do 72 | begin 73 | chunk = STDIN.readpartial(CHUNKSIZE) 74 | sin.write(chunk) 75 | Thread.pass() 76 | rescue EOFError 77 | warn("End of input, flushing and closing stream to GZIP") 78 | sin.close() # which will send an EOF to the ArcFile 79 | break nil 80 | end 81 | end 82 | end 83 | 84 | # Now we have a lazy ArcFile that we can treat as an Enumerable. 85 | arcfile.each {|record| 86 | if record 87 | begin 88 | # work around Ruby URI library's lack of support for URLs with underscore 89 | uri = URI.parse(record.url.delete("_")) 90 | STDOUT.puts(uri.host.downcase()) 91 | rescue URI::InvalidURIError 92 | warn("ARC file contains invalid URL: "+record.url) 93 | next 94 | end 95 | end 96 | } 97 | } 98 | 99 | -------------------------------------------------------------------------------- /src/ruby/ExampleArcParseReduce.rb: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env ruby 2 | 3 | curr = nil 4 | sum = 0 5 | 6 | ARGF.each do |line| 7 | 8 | # the entire line is the key 9 | key = line.chomp 10 | 11 | # if the current key hasn't been set yet, set it 12 | if !curr 13 | 14 | curr = key 15 | sum = 0 16 | 17 | # if a new key is found, emit the current key ... 18 | elsif key != curr && sum > 0 19 | 20 | if sum > 2 21 | STDOUT.puts(curr + "\t" + sum.to_s()) 22 | end 23 | 24 | # ... then set up a new key 25 | curr = key 26 | sum = 0 27 | 28 | end 29 | 30 | # add to count for this current key 31 | sum += 1 32 | 33 | end 34 | -------------------------------------------------------------------------------- /src/ruby/README: -------------------------------------------------------------------------------- 1 | common_crawl_types 2 | 3 | Ben Nagy wrote the original code for this project, and posted it inline to the 4 | Common Crawl mailing list. I tidied it up and wrote a how-to guide: 5 | http://petewarden.typepad.com/searchbrowser/2012/03/twelve-steps-to-running-your-ruby-code-across-five-billion-web-pages.html 6 | 7 | Ben's original message is below. 8 | 9 | Pete Warden, pete@petewarden.com 10 | 11 | ------------------------------------------------------------------- 12 | 13 | Hi, 14 | 15 | So I found this a bit of a pain, so I thought I'd share. If you want 16 | to mess with the Common Crawl stuff but don't feel like learning Java, 17 | this might be for you. 18 | 19 | I'm sure that this could be easily adapted for other streaming 20 | languages, once you work out how to read requester-pays buckets. 21 | 22 | First up, see this: 23 | http://arfon.org/getting-started-with-elastic-mapreduce-and-hadoop-streaming 24 | 25 | Which has basic information and nice screenshots about EMR Streaming, 26 | setting up the job, bootstrapping and such. 27 | 28 | To install the AWS Ruby SDK on an EMR instance you'll need to 29 | bootstrap some stuff. Some of the packages might not be necessary, but 30 | it was a bit of a pain to trim down from a working set of basic 31 | packages. 32 | 33 | (see setup.sh) 34 | 35 | OK, now we're ready for the mapper. This example just collects 36 | mimetypes and URL extensions. The key bits are the ArcFile class and 37 | the monkeypatch to make requester-pays work. I'm not particularly 38 | proud of this monkeypatch, by the way, but the SDK code is a bit 39 | baffling, and it looked like too much work to patch it properly. 40 | 41 | This mapper expects a file manifest as input, one arc.gz url to read 42 | per line. By doing this you avoid the problem of weird splits, or 43 | having hadoop automatically trying to gunzip the file and failing. It 44 | should look like: 45 | 46 | s3://commoncrawl-crawl-002/2010/09/24/9/1285380159663_9.arc.gz 47 | s3://commoncrawl-crawl-002/2010/09/24/9/1285380179515_9.arc.gz 48 | s3://commoncrawl-crawl-002/2010/09/24/9/1285380199363_9.arc.gz 49 | 50 | You can get those names with the SDK, once you add the monkeypatch 51 | below, or with a patched version of s3cmd ls, the instructions for 52 | which have been posted here before. 
53 | 54 | (see extension_map.rb) 55 | 56 | And finally, a trivial reducer 57 | 58 | (see extension_reduce.rb) 59 | 60 | IMHO you only need one of these puppies, which you can achieve by 61 | adding '-D mapred.reduce.tasks=1' to your job args 62 | 63 | If it all worked you should get something like this in your output 64 | directory: 65 | 66 | text/html : 4365 67 | text/html .html : 4256 68 | text/xml : 43 69 | text/html .aspx : 16 70 | text/html .com : 2 71 | text/plain .txt : 1 72 | 73 | Except with more entries, that is just an example based on one file. 74 | 75 | For those interested in costs / timings, I finished 2010/9/24/9 (790 76 | files) in 5h57m, or 30 normalised instance hours of m1.small, with 1 77 | master and 4 core instances. The same job with 1 m1.small master and 78 | 2x cc1.4xlarge core was done in 1h31m, for 66 normalised instance 79 | hours. I'll let you do your individual maths and avoid drawing any 80 | conclusions. If anyone has additional (solid) performance data 81 | comparing various cluster configs for identical workloads then that 82 | might be useful. As an aside, my map tasks took from 9 minutes to 45 83 | minutes to complete, but the average was probably ~33 (eyeball). 84 | 85 | Anyway, hope this helps someone. 86 | 87 | Cheers, 88 | 89 | ben 90 | -------------------------------------------------------------------------------- /test/java/org/commoncrawl/hadoop/mapred/TestArcRecordCC.java: -------------------------------------------------------------------------------- 1 | package org.commoncrawl.hadoop.mapred; 2 | 3 | import java.io.ByteArrayInputStream; 4 | import java.io.InputStream; 5 | import java.io.IOException; 6 | import java.lang.StringBuilder; 7 | 8 | import junit.framework.TestCase; 9 | import static junit.framework.Assert.*; 10 | 11 | /** 12 | * Unit Tests for jUnit 3.8 13 | */ 14 | public class TestArcRecordCC extends TestCase { 15 | 16 | ArcRecordCC r; 17 | 18 | /* 19 | public static junit.framework.Test suite() { 20 | return new junit.framework.JUnit4TestAdapter(TestArcRecordCC.class); 21 | } 22 | */ 23 | 24 | public InputStream getPayload1() 25 | throws Exception { 26 | 27 | StringBuilder s = new StringBuilder(); 28 | 29 | s.setLength(0); 30 | s.append("\n"); 31 | s.append(" \n"); 32 | s.append("