├── .gitignore ├── LICENSE ├── README.md ├── pom.xml └── src └── org └── commoncrawl ├── examples ├── S3ReaderTest.java ├── WARCReaderTest.java └── mapreduce │ ├── ServerTypeMap.java │ ├── TagCounterMap.java │ ├── WARCTagCounter.java │ ├── WATSampleOutLinks.java │ ├── WATServerType.java │ ├── WETWordCount.java │ └── WordCounterMap.java └── warc ├── WARCFileInputFormat.java └── WARCFileRecordReader.java /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-warc-examples/HEAD/.gitignore -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-warc-examples/HEAD/LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-warc-examples/HEAD/README.md -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-warc-examples/HEAD/pom.xml -------------------------------------------------------------------------------- /src/org/commoncrawl/examples/S3ReaderTest.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-warc-examples/HEAD/src/org/commoncrawl/examples/S3ReaderTest.java -------------------------------------------------------------------------------- /src/org/commoncrawl/examples/WARCReaderTest.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-warc-examples/HEAD/src/org/commoncrawl/examples/WARCReaderTest.java -------------------------------------------------------------------------------- /src/org/commoncrawl/examples/mapreduce/ServerTypeMap.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-warc-examples/HEAD/src/org/commoncrawl/examples/mapreduce/ServerTypeMap.java -------------------------------------------------------------------------------- /src/org/commoncrawl/examples/mapreduce/TagCounterMap.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-warc-examples/HEAD/src/org/commoncrawl/examples/mapreduce/TagCounterMap.java -------------------------------------------------------------------------------- /src/org/commoncrawl/examples/mapreduce/WARCTagCounter.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-warc-examples/HEAD/src/org/commoncrawl/examples/mapreduce/WARCTagCounter.java -------------------------------------------------------------------------------- /src/org/commoncrawl/examples/mapreduce/WATSampleOutLinks.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-warc-examples/HEAD/src/org/commoncrawl/examples/mapreduce/WATSampleOutLinks.java -------------------------------------------------------------------------------- /src/org/commoncrawl/examples/mapreduce/WATServerType.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-warc-examples/HEAD/src/org/commoncrawl/examples/mapreduce/WATServerType.java -------------------------------------------------------------------------------- /src/org/commoncrawl/examples/mapreduce/WETWordCount.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-warc-examples/HEAD/src/org/commoncrawl/examples/mapreduce/WETWordCount.java -------------------------------------------------------------------------------- /src/org/commoncrawl/examples/mapreduce/WordCounterMap.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-warc-examples/HEAD/src/org/commoncrawl/examples/mapreduce/WordCounterMap.java -------------------------------------------------------------------------------- /src/org/commoncrawl/warc/WARCFileInputFormat.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-warc-examples/HEAD/src/org/commoncrawl/warc/WARCFileInputFormat.java -------------------------------------------------------------------------------- /src/org/commoncrawl/warc/WARCFileRecordReader.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-warc-examples/HEAD/src/org/commoncrawl/warc/WARCFileRecordReader.java --------------------------------------------------------------------------------