├── .gitignore ├── README-Amazon-AMI ├── VERSION ├── bin ├── ccCopyToHDFS ├── ccListInvalidSegments └── ccRunExample ├── conf └── mapred.xml ├── lib ├── gson-2.2.1.jar ├── guava-12.0.jar ├── httpcore-4.2.1.jar └── jsoup-1.6.3.jar ├── src ├── java │ └── org │ │ └── commoncrawl │ │ ├── compressors │ │ ├── CompressorInputStream.java │ │ └── gzip │ │ │ └── GzipCompressorInputStream.java │ │ ├── examples │ │ ├── ExampleArcMicroformat.java │ │ ├── ExampleMetadataDomainPageCount.java │ │ ├── ExampleMetadataStats.java │ │ └── ExampleTextWordCount.java │ │ ├── hadoop │ │ └── mapred │ │ │ ├── ArcInputFormat.java │ │ │ ├── ArcRecord.java │ │ │ └── ArcRecordReader.java │ │ └── nutch │ │ └── tools │ │ └── arc │ │ ├── ArcInputFormat.java │ │ └── ArcRecordReader.java └── ruby │ ├── ExampleArcParseMap.rb │ ├── ExampleArcParseReduce.rb │ └── README └── test └── java └── org └── commoncrawl └── hadoop └── mapred └── TestArcRecordCC.java /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/commoncrawl-examples/HEAD/.gitignore -------------------------------------------------------------------------------- /README-Amazon-AMI: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/commoncrawl-examples/HEAD/README-Amazon-AMI -------------------------------------------------------------------------------- /VERSION: -------------------------------------------------------------------------------- 1 | 1.0.1 2 | -------------------------------------------------------------------------------- /bin/ccCopyToHDFS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/commoncrawl-examples/HEAD/bin/ccCopyToHDFS -------------------------------------------------------------------------------- /bin/ccListInvalidSegments: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/commoncrawl-examples/HEAD/bin/ccListInvalidSegments -------------------------------------------------------------------------------- /bin/ccRunExample: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/commoncrawl-examples/HEAD/bin/ccRunExample -------------------------------------------------------------------------------- /conf/mapred.xml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/commoncrawl-examples/HEAD/conf/mapred.xml -------------------------------------------------------------------------------- /lib/gson-2.2.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/commoncrawl-examples/HEAD/lib/gson-2.2.1.jar -------------------------------------------------------------------------------- /lib/guava-12.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/commoncrawl-examples/HEAD/lib/guava-12.0.jar -------------------------------------------------------------------------------- /lib/httpcore-4.2.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/commoncrawl-examples/HEAD/lib/httpcore-4.2.1.jar -------------------------------------------------------------------------------- /lib/jsoup-1.6.3.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/commoncrawl-examples/HEAD/lib/jsoup-1.6.3.jar -------------------------------------------------------------------------------- /src/java/org/commoncrawl/compressors/CompressorInputStream.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/commoncrawl-examples/HEAD/src/java/org/commoncrawl/compressors/CompressorInputStream.java -------------------------------------------------------------------------------- /src/java/org/commoncrawl/compressors/gzip/GzipCompressorInputStream.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/commoncrawl-examples/HEAD/src/java/org/commoncrawl/compressors/gzip/GzipCompressorInputStream.java -------------------------------------------------------------------------------- /src/java/org/commoncrawl/examples/ExampleArcMicroformat.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/commoncrawl-examples/HEAD/src/java/org/commoncrawl/examples/ExampleArcMicroformat.java -------------------------------------------------------------------------------- /src/java/org/commoncrawl/examples/ExampleMetadataDomainPageCount.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/commoncrawl-examples/HEAD/src/java/org/commoncrawl/examples/ExampleMetadataDomainPageCount.java -------------------------------------------------------------------------------- /src/java/org/commoncrawl/examples/ExampleMetadataStats.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/commoncrawl-examples/HEAD/src/java/org/commoncrawl/examples/ExampleMetadataStats.java -------------------------------------------------------------------------------- /src/java/org/commoncrawl/examples/ExampleTextWordCount.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/commoncrawl-examples/HEAD/src/java/org/commoncrawl/examples/ExampleTextWordCount.java -------------------------------------------------------------------------------- /src/java/org/commoncrawl/hadoop/mapred/ArcInputFormat.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/commoncrawl-examples/HEAD/src/java/org/commoncrawl/hadoop/mapred/ArcInputFormat.java -------------------------------------------------------------------------------- /src/java/org/commoncrawl/hadoop/mapred/ArcRecord.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/commoncrawl-examples/HEAD/src/java/org/commoncrawl/hadoop/mapred/ArcRecord.java -------------------------------------------------------------------------------- /src/java/org/commoncrawl/hadoop/mapred/ArcRecordReader.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/commoncrawl-examples/HEAD/src/java/org/commoncrawl/hadoop/mapred/ArcRecordReader.java -------------------------------------------------------------------------------- /src/java/org/commoncrawl/nutch/tools/arc/ArcInputFormat.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/commoncrawl-examples/HEAD/src/java/org/commoncrawl/nutch/tools/arc/ArcInputFormat.java -------------------------------------------------------------------------------- /src/java/org/commoncrawl/nutch/tools/arc/ArcRecordReader.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/commoncrawl-examples/HEAD/src/java/org/commoncrawl/nutch/tools/arc/ArcRecordReader.java -------------------------------------------------------------------------------- /src/ruby/ExampleArcParseMap.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/commoncrawl-examples/HEAD/src/ruby/ExampleArcParseMap.rb -------------------------------------------------------------------------------- /src/ruby/ExampleArcParseReduce.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/commoncrawl-examples/HEAD/src/ruby/ExampleArcParseReduce.rb -------------------------------------------------------------------------------- /src/ruby/README: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/commoncrawl-examples/HEAD/src/ruby/README -------------------------------------------------------------------------------- /test/java/org/commoncrawl/hadoop/mapred/TestArcRecordCC.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/commoncrawl-examples/HEAD/test/java/org/commoncrawl/hadoop/mapred/TestArcRecordCC.java --------------------------------------------------------------------------------