├── .gitignore ├── CommonCrawlContest ├── .gitignore ├── README-Amazon-AMI ├── VERSION ├── bin │ ├── ccCopyToHDFS │ ├── ccListInvalidSegments │ └── ccRunExample ├── build.properties ├── build.xml ├── conf │ └── mapred.xml ├── src │ └── java │ │ └── org │ │ └── commoncrawl │ │ ├── compressors │ │ ├── CompressorInputStream.java │ │ └── gzip │ │ │ └── GzipCompressorInputStream.java │ │ ├── examples │ │ ├── ExampleWikiLinkCount.java │ │ └── ExampleWikiLinkCount.java~ │ │ ├── hadoop │ │ └── mapred │ │ │ ├── ArcInputFormat.java │ │ │ ├── ArcRecord.java │ │ │ └── ArcRecordReader.java │ │ └── nutch │ │ └── tools │ │ └── arc │ │ ├── ArcInputFormat.java │ │ └── ArcRecordReader.java └── test │ └── java │ └── org │ └── commoncrawl │ └── hadoop │ └── mapred │ └── TestArcRecordCC.java └── README.md /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaoganghan/wikientities/HEAD/.gitignore -------------------------------------------------------------------------------- /CommonCrawlContest/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaoganghan/wikientities/HEAD/CommonCrawlContest/.gitignore -------------------------------------------------------------------------------- /CommonCrawlContest/README-Amazon-AMI: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaoganghan/wikientities/HEAD/CommonCrawlContest/README-Amazon-AMI -------------------------------------------------------------------------------- /CommonCrawlContest/VERSION: -------------------------------------------------------------------------------- 1 | 1.0.1 2 | -------------------------------------------------------------------------------- /CommonCrawlContest/bin/ccCopyToHDFS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaoganghan/wikientities/HEAD/CommonCrawlContest/bin/ccCopyToHDFS -------------------------------------------------------------------------------- /CommonCrawlContest/bin/ccListInvalidSegments: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaoganghan/wikientities/HEAD/CommonCrawlContest/bin/ccListInvalidSegments -------------------------------------------------------------------------------- /CommonCrawlContest/bin/ccRunExample: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaoganghan/wikientities/HEAD/CommonCrawlContest/bin/ccRunExample -------------------------------------------------------------------------------- /CommonCrawlContest/build.properties: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaoganghan/wikientities/HEAD/CommonCrawlContest/build.properties -------------------------------------------------------------------------------- /CommonCrawlContest/build.xml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaoganghan/wikientities/HEAD/CommonCrawlContest/build.xml -------------------------------------------------------------------------------- /CommonCrawlContest/conf/mapred.xml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaoganghan/wikientities/HEAD/CommonCrawlContest/conf/mapred.xml -------------------------------------------------------------------------------- /CommonCrawlContest/src/java/org/commoncrawl/compressors/CompressorInputStream.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaoganghan/wikientities/HEAD/CommonCrawlContest/src/java/org/commoncrawl/compressors/CompressorInputStream.java -------------------------------------------------------------------------------- /CommonCrawlContest/src/java/org/commoncrawl/compressors/gzip/GzipCompressorInputStream.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaoganghan/wikientities/HEAD/CommonCrawlContest/src/java/org/commoncrawl/compressors/gzip/GzipCompressorInputStream.java -------------------------------------------------------------------------------- /CommonCrawlContest/src/java/org/commoncrawl/examples/ExampleWikiLinkCount.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaoganghan/wikientities/HEAD/CommonCrawlContest/src/java/org/commoncrawl/examples/ExampleWikiLinkCount.java -------------------------------------------------------------------------------- /CommonCrawlContest/src/java/org/commoncrawl/examples/ExampleWikiLinkCount.java~: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaoganghan/wikientities/HEAD/CommonCrawlContest/src/java/org/commoncrawl/examples/ExampleWikiLinkCount.java~ -------------------------------------------------------------------------------- /CommonCrawlContest/src/java/org/commoncrawl/hadoop/mapred/ArcInputFormat.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaoganghan/wikientities/HEAD/CommonCrawlContest/src/java/org/commoncrawl/hadoop/mapred/ArcInputFormat.java -------------------------------------------------------------------------------- /CommonCrawlContest/src/java/org/commoncrawl/hadoop/mapred/ArcRecord.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaoganghan/wikientities/HEAD/CommonCrawlContest/src/java/org/commoncrawl/hadoop/mapred/ArcRecord.java -------------------------------------------------------------------------------- /CommonCrawlContest/src/java/org/commoncrawl/hadoop/mapred/ArcRecordReader.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaoganghan/wikientities/HEAD/CommonCrawlContest/src/java/org/commoncrawl/hadoop/mapred/ArcRecordReader.java -------------------------------------------------------------------------------- /CommonCrawlContest/src/java/org/commoncrawl/nutch/tools/arc/ArcInputFormat.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaoganghan/wikientities/HEAD/CommonCrawlContest/src/java/org/commoncrawl/nutch/tools/arc/ArcInputFormat.java -------------------------------------------------------------------------------- /CommonCrawlContest/src/java/org/commoncrawl/nutch/tools/arc/ArcRecordReader.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaoganghan/wikientities/HEAD/CommonCrawlContest/src/java/org/commoncrawl/nutch/tools/arc/ArcRecordReader.java -------------------------------------------------------------------------------- /CommonCrawlContest/test/java/org/commoncrawl/hadoop/mapred/TestArcRecordCC.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaoganghan/wikientities/HEAD/CommonCrawlContest/test/java/org/commoncrawl/hadoop/mapred/TestArcRecordCC.java -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaoganghan/wikientities/HEAD/README.md --------------------------------------------------------------------------------