├── .github └── workflows │ └── build.yml ├── .gitignore ├── Dockerfile ├── LICENSE ├── README.md ├── pom.xml └── src ├── main ├── java │ └── org │ │ └── commoncrawl │ │ ├── net │ │ ├── HostName.java │ │ └── WarcUri.java │ │ └── spark │ │ ├── CCIndex2Table.java │ │ ├── EOTIndexTable.java │ │ ├── IndexTable.java │ │ ├── JobStatsListener.java │ │ ├── examples │ │ ├── CCIndexExport.java │ │ └── CCIndexWarcExport.java │ │ └── util │ │ ├── CCWarcFilenameParser.java │ │ ├── NullOutputCommitter.java │ │ ├── WarcFileOutputFormat.java │ │ └── WarcRecordWriter.java └── resources │ ├── schema │ ├── cc-index-schema-flat.json │ ├── cc-index-schema-nested.json │ ├── eot-index-schema.json │ ├── index-schema-simple-nested.json │ └── index-schema-simple.json │ └── simplelogger.properties ├── script └── convert_url_index.sh ├── sql ├── athena │ ├── cc-index-create-table-flat.sql │ └── cc-index-create-table-nested.sql └── examples │ └── cc-index │ ├── average-warc-record-length-by-mime-type.sql │ ├── compare-mime-type-http-vs-detected.sql │ ├── correlation-language-charset.sql │ ├── count-by-partition.sql │ ├── count-by-tld-page-host-domain.sql │ ├── count-domains-alexa-top-1m.sql │ ├── count-domains-of-tld.sql │ ├── count-fetch-status.sql │ ├── count-hostname-elements.sql │ ├── count-idna.sql │ ├── count-language-tld.sql │ ├── count-robotstxt-url-paths.sql │ ├── count-url-path-elements.sql │ ├── get-home-pages-languages.sql │ ├── get-language-translations-url-path.sql │ ├── get-records-for-language.sql │ ├── get-records-home-pages.sql │ ├── get-records-of-domain.sql │ ├── get-records-of-mime-type.sql │ ├── get-records-robotstxt.sql │ ├── host-length-distrib.sql │ ├── loglikelihood-language-tld.sql │ ├── random-sample-extract.sql │ ├── random-sample-urls.sql │ ├── similar-domains.sql │ └── site-discovery-by-language.sql └── test └── java └── org └── commoncrawl ├── net └── TestURL.java └── spark ├── TestCCIndex2Table.java ├── TestCCWarcFilenameParser.java ├── TestEOTIndexTable.java ├── TestIndexTable.java └── TestIndexTableBase.java /.github/workflows/build.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-index-table/HEAD/.github/workflows/build.yml -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-index-table/HEAD/.gitignore -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-index-table/HEAD/Dockerfile -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-index-table/HEAD/LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-index-table/HEAD/README.md -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-index-table/HEAD/pom.xml -------------------------------------------------------------------------------- /src/main/java/org/commoncrawl/net/HostName.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-index-table/HEAD/src/main/java/org/commoncrawl/net/HostName.java -------------------------------------------------------------------------------- /src/main/java/org/commoncrawl/net/WarcUri.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-index-table/HEAD/src/main/java/org/commoncrawl/net/WarcUri.java -------------------------------------------------------------------------------- /src/main/java/org/commoncrawl/spark/CCIndex2Table.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-index-table/HEAD/src/main/java/org/commoncrawl/spark/CCIndex2Table.java -------------------------------------------------------------------------------- /src/main/java/org/commoncrawl/spark/EOTIndexTable.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-index-table/HEAD/src/main/java/org/commoncrawl/spark/EOTIndexTable.java -------------------------------------------------------------------------------- /src/main/java/org/commoncrawl/spark/IndexTable.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-index-table/HEAD/src/main/java/org/commoncrawl/spark/IndexTable.java -------------------------------------------------------------------------------- /src/main/java/org/commoncrawl/spark/JobStatsListener.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-index-table/HEAD/src/main/java/org/commoncrawl/spark/JobStatsListener.java -------------------------------------------------------------------------------- /src/main/java/org/commoncrawl/spark/examples/CCIndexExport.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-index-table/HEAD/src/main/java/org/commoncrawl/spark/examples/CCIndexExport.java -------------------------------------------------------------------------------- /src/main/java/org/commoncrawl/spark/examples/CCIndexWarcExport.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-index-table/HEAD/src/main/java/org/commoncrawl/spark/examples/CCIndexWarcExport.java -------------------------------------------------------------------------------- /src/main/java/org/commoncrawl/spark/util/CCWarcFilenameParser.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-index-table/HEAD/src/main/java/org/commoncrawl/spark/util/CCWarcFilenameParser.java -------------------------------------------------------------------------------- /src/main/java/org/commoncrawl/spark/util/NullOutputCommitter.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-index-table/HEAD/src/main/java/org/commoncrawl/spark/util/NullOutputCommitter.java -------------------------------------------------------------------------------- /src/main/java/org/commoncrawl/spark/util/WarcFileOutputFormat.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-index-table/HEAD/src/main/java/org/commoncrawl/spark/util/WarcFileOutputFormat.java -------------------------------------------------------------------------------- /src/main/java/org/commoncrawl/spark/util/WarcRecordWriter.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-index-table/HEAD/src/main/java/org/commoncrawl/spark/util/WarcRecordWriter.java -------------------------------------------------------------------------------- /src/main/resources/schema/cc-index-schema-flat.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-index-table/HEAD/src/main/resources/schema/cc-index-schema-flat.json -------------------------------------------------------------------------------- /src/main/resources/schema/cc-index-schema-nested.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-index-table/HEAD/src/main/resources/schema/cc-index-schema-nested.json -------------------------------------------------------------------------------- /src/main/resources/schema/eot-index-schema.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-index-table/HEAD/src/main/resources/schema/eot-index-schema.json -------------------------------------------------------------------------------- /src/main/resources/schema/index-schema-simple-nested.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-index-table/HEAD/src/main/resources/schema/index-schema-simple-nested.json -------------------------------------------------------------------------------- /src/main/resources/schema/index-schema-simple.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-index-table/HEAD/src/main/resources/schema/index-schema-simple.json -------------------------------------------------------------------------------- /src/main/resources/simplelogger.properties: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-index-table/HEAD/src/main/resources/simplelogger.properties -------------------------------------------------------------------------------- /src/script/convert_url_index.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-index-table/HEAD/src/script/convert_url_index.sh -------------------------------------------------------------------------------- /src/sql/athena/cc-index-create-table-flat.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-index-table/HEAD/src/sql/athena/cc-index-create-table-flat.sql -------------------------------------------------------------------------------- /src/sql/athena/cc-index-create-table-nested.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-index-table/HEAD/src/sql/athena/cc-index-create-table-nested.sql -------------------------------------------------------------------------------- /src/sql/examples/cc-index/average-warc-record-length-by-mime-type.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-index-table/HEAD/src/sql/examples/cc-index/average-warc-record-length-by-mime-type.sql -------------------------------------------------------------------------------- /src/sql/examples/cc-index/compare-mime-type-http-vs-detected.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-index-table/HEAD/src/sql/examples/cc-index/compare-mime-type-http-vs-detected.sql -------------------------------------------------------------------------------- /src/sql/examples/cc-index/correlation-language-charset.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-index-table/HEAD/src/sql/examples/cc-index/correlation-language-charset.sql -------------------------------------------------------------------------------- /src/sql/examples/cc-index/count-by-partition.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-index-table/HEAD/src/sql/examples/cc-index/count-by-partition.sql -------------------------------------------------------------------------------- /src/sql/examples/cc-index/count-by-tld-page-host-domain.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-index-table/HEAD/src/sql/examples/cc-index/count-by-tld-page-host-domain.sql -------------------------------------------------------------------------------- /src/sql/examples/cc-index/count-domains-alexa-top-1m.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-index-table/HEAD/src/sql/examples/cc-index/count-domains-alexa-top-1m.sql -------------------------------------------------------------------------------- /src/sql/examples/cc-index/count-domains-of-tld.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-index-table/HEAD/src/sql/examples/cc-index/count-domains-of-tld.sql -------------------------------------------------------------------------------- /src/sql/examples/cc-index/count-fetch-status.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-index-table/HEAD/src/sql/examples/cc-index/count-fetch-status.sql -------------------------------------------------------------------------------- /src/sql/examples/cc-index/count-hostname-elements.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-index-table/HEAD/src/sql/examples/cc-index/count-hostname-elements.sql -------------------------------------------------------------------------------- /src/sql/examples/cc-index/count-idna.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-index-table/HEAD/src/sql/examples/cc-index/count-idna.sql -------------------------------------------------------------------------------- /src/sql/examples/cc-index/count-language-tld.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-index-table/HEAD/src/sql/examples/cc-index/count-language-tld.sql -------------------------------------------------------------------------------- /src/sql/examples/cc-index/count-robotstxt-url-paths.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-index-table/HEAD/src/sql/examples/cc-index/count-robotstxt-url-paths.sql -------------------------------------------------------------------------------- /src/sql/examples/cc-index/count-url-path-elements.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-index-table/HEAD/src/sql/examples/cc-index/count-url-path-elements.sql -------------------------------------------------------------------------------- /src/sql/examples/cc-index/get-home-pages-languages.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-index-table/HEAD/src/sql/examples/cc-index/get-home-pages-languages.sql -------------------------------------------------------------------------------- /src/sql/examples/cc-index/get-language-translations-url-path.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-index-table/HEAD/src/sql/examples/cc-index/get-language-translations-url-path.sql -------------------------------------------------------------------------------- /src/sql/examples/cc-index/get-records-for-language.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-index-table/HEAD/src/sql/examples/cc-index/get-records-for-language.sql -------------------------------------------------------------------------------- /src/sql/examples/cc-index/get-records-home-pages.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-index-table/HEAD/src/sql/examples/cc-index/get-records-home-pages.sql -------------------------------------------------------------------------------- /src/sql/examples/cc-index/get-records-of-domain.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-index-table/HEAD/src/sql/examples/cc-index/get-records-of-domain.sql -------------------------------------------------------------------------------- /src/sql/examples/cc-index/get-records-of-mime-type.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-index-table/HEAD/src/sql/examples/cc-index/get-records-of-mime-type.sql -------------------------------------------------------------------------------- /src/sql/examples/cc-index/get-records-robotstxt.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-index-table/HEAD/src/sql/examples/cc-index/get-records-robotstxt.sql -------------------------------------------------------------------------------- /src/sql/examples/cc-index/host-length-distrib.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-index-table/HEAD/src/sql/examples/cc-index/host-length-distrib.sql -------------------------------------------------------------------------------- /src/sql/examples/cc-index/loglikelihood-language-tld.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-index-table/HEAD/src/sql/examples/cc-index/loglikelihood-language-tld.sql -------------------------------------------------------------------------------- /src/sql/examples/cc-index/random-sample-extract.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-index-table/HEAD/src/sql/examples/cc-index/random-sample-extract.sql -------------------------------------------------------------------------------- /src/sql/examples/cc-index/random-sample-urls.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-index-table/HEAD/src/sql/examples/cc-index/random-sample-urls.sql -------------------------------------------------------------------------------- /src/sql/examples/cc-index/similar-domains.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-index-table/HEAD/src/sql/examples/cc-index/similar-domains.sql -------------------------------------------------------------------------------- /src/sql/examples/cc-index/site-discovery-by-language.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-index-table/HEAD/src/sql/examples/cc-index/site-discovery-by-language.sql -------------------------------------------------------------------------------- /src/test/java/org/commoncrawl/net/TestURL.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-index-table/HEAD/src/test/java/org/commoncrawl/net/TestURL.java -------------------------------------------------------------------------------- /src/test/java/org/commoncrawl/spark/TestCCIndex2Table.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-index-table/HEAD/src/test/java/org/commoncrawl/spark/TestCCIndex2Table.java -------------------------------------------------------------------------------- /src/test/java/org/commoncrawl/spark/TestCCWarcFilenameParser.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-index-table/HEAD/src/test/java/org/commoncrawl/spark/TestCCWarcFilenameParser.java -------------------------------------------------------------------------------- /src/test/java/org/commoncrawl/spark/TestEOTIndexTable.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-index-table/HEAD/src/test/java/org/commoncrawl/spark/TestEOTIndexTable.java -------------------------------------------------------------------------------- /src/test/java/org/commoncrawl/spark/TestIndexTable.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-index-table/HEAD/src/test/java/org/commoncrawl/spark/TestIndexTable.java -------------------------------------------------------------------------------- /src/test/java/org/commoncrawl/spark/TestIndexTableBase.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-index-table/HEAD/src/test/java/org/commoncrawl/spark/TestIndexTableBase.java --------------------------------------------------------------------------------