├── .gitignore ├── LICENSE ├── README.md ├── cc-emr-notebook ├── aws_emr_notebook.ipynb ├── bootstrapscript.sh ├── cluster_setup.md └── dataframe_analysis.ipynb ├── cc-index-table ├── bulk-url-lookups-by-table-joins.ipynb ├── cc-main-2013-2019-metrics.ipynb ├── correlation-language-charset.ipynb ├── data │ ├── cc-main-2013-2019-data-set-counts.csv │ ├── cc-main-2013-2019-edu-tld-uniq-urls-hll-by-year-merged.csv │ ├── cc-main-2013-2019-edu-tld-uniq-urls-hll-by-year.csv │ ├── cc-main-2013-2019-edu-tld-uniq-urls-hll.csv │ ├── cc-main-2020-05-language-charset-correlation.csv │ ├── net-block-ir-cc-main-2019-47-languages.csv │ └── net-block-ir-cc-main-2019-47-tlds.csv └── net-blocking-iran-cc-main-2019-47.ipynb ├── cc-webgraph-statistics ├── README.md ├── comparison_domain_ranks.ipynb ├── icctld.py ├── interactive_webgraph.md └── topology_stats.ipynb ├── warc-truncation ├── README.md ├── cc-main-2018-43-single-warc-file.ipynb ├── cc-main-2019-35-100-warc-files.ipynb ├── cc-main-2019-47-truncation-by-mime-type.ipynb ├── cc-main-2025-truncation-stats.ipynb └── data │ ├── CC-MAIN-2019-35-warc.paths │ ├── truncated-records-CC-MAIN-20181015080248-20181015101748-00033.csv │ ├── truncated-records-CC-MAIN-2019-35-warc-100.csv │ ├── warc-record-size-truncation-by-mime-type-CC-MAIN-2019-47.csv │ ├── warc-record-size-truncation-by-mime-type-CC-MAIN-2025-05.csv │ ├── warc-record-size-truncation-by-mime-type-CC-MAIN-2025-08.csv │ ├── warc-record-size-truncation-by-mime-type-CC-MAIN-2025-13.csv │ ├── warc-truncation-domains-CC-MAIN-2025-05.csv │ └── warc-truncation-domains-detailed-CC-MAIN-2025-05.csv └── webgraph_config.sh /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-notebooks/HEAD/.gitignore -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-notebooks/HEAD/LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-notebooks/HEAD/README.md -------------------------------------------------------------------------------- /cc-emr-notebook/aws_emr_notebook.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-notebooks/HEAD/cc-emr-notebook/aws_emr_notebook.ipynb -------------------------------------------------------------------------------- /cc-emr-notebook/bootstrapscript.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-notebooks/HEAD/cc-emr-notebook/bootstrapscript.sh -------------------------------------------------------------------------------- /cc-emr-notebook/cluster_setup.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-notebooks/HEAD/cc-emr-notebook/cluster_setup.md -------------------------------------------------------------------------------- /cc-emr-notebook/dataframe_analysis.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-notebooks/HEAD/cc-emr-notebook/dataframe_analysis.ipynb -------------------------------------------------------------------------------- /cc-index-table/bulk-url-lookups-by-table-joins.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-notebooks/HEAD/cc-index-table/bulk-url-lookups-by-table-joins.ipynb -------------------------------------------------------------------------------- /cc-index-table/cc-main-2013-2019-metrics.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-notebooks/HEAD/cc-index-table/cc-main-2013-2019-metrics.ipynb -------------------------------------------------------------------------------- /cc-index-table/correlation-language-charset.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-notebooks/HEAD/cc-index-table/correlation-language-charset.ipynb -------------------------------------------------------------------------------- /cc-index-table/data/cc-main-2013-2019-data-set-counts.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-notebooks/HEAD/cc-index-table/data/cc-main-2013-2019-data-set-counts.csv -------------------------------------------------------------------------------- /cc-index-table/data/cc-main-2013-2019-edu-tld-uniq-urls-hll-by-year-merged.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-notebooks/HEAD/cc-index-table/data/cc-main-2013-2019-edu-tld-uniq-urls-hll-by-year-merged.csv -------------------------------------------------------------------------------- /cc-index-table/data/cc-main-2013-2019-edu-tld-uniq-urls-hll-by-year.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-notebooks/HEAD/cc-index-table/data/cc-main-2013-2019-edu-tld-uniq-urls-hll-by-year.csv -------------------------------------------------------------------------------- /cc-index-table/data/cc-main-2013-2019-edu-tld-uniq-urls-hll.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-notebooks/HEAD/cc-index-table/data/cc-main-2013-2019-edu-tld-uniq-urls-hll.csv -------------------------------------------------------------------------------- /cc-index-table/data/cc-main-2020-05-language-charset-correlation.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-notebooks/HEAD/cc-index-table/data/cc-main-2020-05-language-charset-correlation.csv -------------------------------------------------------------------------------- /cc-index-table/data/net-block-ir-cc-main-2019-47-languages.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-notebooks/HEAD/cc-index-table/data/net-block-ir-cc-main-2019-47-languages.csv -------------------------------------------------------------------------------- /cc-index-table/data/net-block-ir-cc-main-2019-47-tlds.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-notebooks/HEAD/cc-index-table/data/net-block-ir-cc-main-2019-47-tlds.csv -------------------------------------------------------------------------------- /cc-index-table/net-blocking-iran-cc-main-2019-47.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-notebooks/HEAD/cc-index-table/net-blocking-iran-cc-main-2019-47.ipynb -------------------------------------------------------------------------------- /cc-webgraph-statistics/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-notebooks/HEAD/cc-webgraph-statistics/README.md -------------------------------------------------------------------------------- /cc-webgraph-statistics/comparison_domain_ranks.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-notebooks/HEAD/cc-webgraph-statistics/comparison_domain_ranks.ipynb -------------------------------------------------------------------------------- /cc-webgraph-statistics/icctld.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-notebooks/HEAD/cc-webgraph-statistics/icctld.py -------------------------------------------------------------------------------- /cc-webgraph-statistics/interactive_webgraph.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-notebooks/HEAD/cc-webgraph-statistics/interactive_webgraph.md -------------------------------------------------------------------------------- /cc-webgraph-statistics/topology_stats.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-notebooks/HEAD/cc-webgraph-statistics/topology_stats.ipynb -------------------------------------------------------------------------------- /warc-truncation/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-notebooks/HEAD/warc-truncation/README.md -------------------------------------------------------------------------------- /warc-truncation/cc-main-2018-43-single-warc-file.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-notebooks/HEAD/warc-truncation/cc-main-2018-43-single-warc-file.ipynb -------------------------------------------------------------------------------- /warc-truncation/cc-main-2019-35-100-warc-files.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-notebooks/HEAD/warc-truncation/cc-main-2019-35-100-warc-files.ipynb -------------------------------------------------------------------------------- /warc-truncation/cc-main-2019-47-truncation-by-mime-type.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-notebooks/HEAD/warc-truncation/cc-main-2019-47-truncation-by-mime-type.ipynb -------------------------------------------------------------------------------- /warc-truncation/cc-main-2025-truncation-stats.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-notebooks/HEAD/warc-truncation/cc-main-2025-truncation-stats.ipynb -------------------------------------------------------------------------------- /warc-truncation/data/CC-MAIN-2019-35-warc.paths: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-notebooks/HEAD/warc-truncation/data/CC-MAIN-2019-35-warc.paths -------------------------------------------------------------------------------- /warc-truncation/data/truncated-records-CC-MAIN-20181015080248-20181015101748-00033.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-notebooks/HEAD/warc-truncation/data/truncated-records-CC-MAIN-20181015080248-20181015101748-00033.csv -------------------------------------------------------------------------------- /warc-truncation/data/truncated-records-CC-MAIN-2019-35-warc-100.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-notebooks/HEAD/warc-truncation/data/truncated-records-CC-MAIN-2019-35-warc-100.csv -------------------------------------------------------------------------------- /warc-truncation/data/warc-record-size-truncation-by-mime-type-CC-MAIN-2019-47.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-notebooks/HEAD/warc-truncation/data/warc-record-size-truncation-by-mime-type-CC-MAIN-2019-47.csv -------------------------------------------------------------------------------- /warc-truncation/data/warc-record-size-truncation-by-mime-type-CC-MAIN-2025-05.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-notebooks/HEAD/warc-truncation/data/warc-record-size-truncation-by-mime-type-CC-MAIN-2025-05.csv -------------------------------------------------------------------------------- /warc-truncation/data/warc-record-size-truncation-by-mime-type-CC-MAIN-2025-08.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-notebooks/HEAD/warc-truncation/data/warc-record-size-truncation-by-mime-type-CC-MAIN-2025-08.csv -------------------------------------------------------------------------------- /warc-truncation/data/warc-record-size-truncation-by-mime-type-CC-MAIN-2025-13.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-notebooks/HEAD/warc-truncation/data/warc-record-size-truncation-by-mime-type-CC-MAIN-2025-13.csv -------------------------------------------------------------------------------- /warc-truncation/data/warc-truncation-domains-CC-MAIN-2025-05.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-notebooks/HEAD/warc-truncation/data/warc-truncation-domains-CC-MAIN-2025-05.csv -------------------------------------------------------------------------------- /warc-truncation/data/warc-truncation-domains-detailed-CC-MAIN-2025-05.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-notebooks/HEAD/warc-truncation/data/warc-truncation-domains-detailed-CC-MAIN-2025-05.csv -------------------------------------------------------------------------------- /webgraph_config.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/cc-notebooks/HEAD/webgraph_config.sh --------------------------------------------------------------------------------