├── .gitattributes ├── .gitignore ├── .readthedocs.yml ├── LICENSE ├── README.md ├── alembic.ini.template ├── alembic ├── README ├── env.py └── script.py.mako ├── cleaning ├── __init__.py ├── dedupe_from_indexes.py ├── filter_from_reddit_scores.py ├── generate_minhashes.py ├── minhash_lsh_batching.py └── minhash_lsh_dedupe.py ├── data_analysis ├── __init__.py └── final_stats.py ├── mkdocs ├── docs │ ├── background.md │ ├── css │ │ └── extra.css │ ├── index.md │ ├── licence.md │ └── replication.md └── mkdocs.yml ├── pushshift ├── __init__.py ├── download_pushshift_dumps.py ├── generate_urls.py ├── models.py ├── process_dump_files_sqlite.py └── pushshift_to_sqlite.py ├── requirements.txt ├── scraping ├── __init__.py ├── filter.py ├── scrape_urls.py └── scrapers.py └── utils ├── __init__.py ├── archive_stream_readers.py ├── archiver.py ├── logger.py └── utils.py /.gitattributes: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/openwebtext2/HEAD/.gitattributes -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/openwebtext2/HEAD/.gitignore -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/openwebtext2/HEAD/.readthedocs.yml -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/openwebtext2/HEAD/LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/openwebtext2/HEAD/README.md -------------------------------------------------------------------------------- /alembic.ini.template: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/openwebtext2/HEAD/alembic.ini.template -------------------------------------------------------------------------------- /alembic/README: -------------------------------------------------------------------------------- 1 | Generic single-database configuration. -------------------------------------------------------------------------------- /alembic/env.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/openwebtext2/HEAD/alembic/env.py -------------------------------------------------------------------------------- /alembic/script.py.mako: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/openwebtext2/HEAD/alembic/script.py.mako -------------------------------------------------------------------------------- /cleaning/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /cleaning/dedupe_from_indexes.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/openwebtext2/HEAD/cleaning/dedupe_from_indexes.py -------------------------------------------------------------------------------- /cleaning/filter_from_reddit_scores.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/openwebtext2/HEAD/cleaning/filter_from_reddit_scores.py -------------------------------------------------------------------------------- /cleaning/generate_minhashes.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/openwebtext2/HEAD/cleaning/generate_minhashes.py -------------------------------------------------------------------------------- /cleaning/minhash_lsh_batching.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/openwebtext2/HEAD/cleaning/minhash_lsh_batching.py -------------------------------------------------------------------------------- /cleaning/minhash_lsh_dedupe.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/openwebtext2/HEAD/cleaning/minhash_lsh_dedupe.py -------------------------------------------------------------------------------- /data_analysis/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data_analysis/final_stats.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/openwebtext2/HEAD/data_analysis/final_stats.py -------------------------------------------------------------------------------- /mkdocs/docs/background.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/openwebtext2/HEAD/mkdocs/docs/background.md -------------------------------------------------------------------------------- /mkdocs/docs/css/extra.css: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/openwebtext2/HEAD/mkdocs/docs/css/extra.css -------------------------------------------------------------------------------- /mkdocs/docs/index.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/openwebtext2/HEAD/mkdocs/docs/index.md -------------------------------------------------------------------------------- /mkdocs/docs/licence.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/openwebtext2/HEAD/mkdocs/docs/licence.md -------------------------------------------------------------------------------- /mkdocs/docs/replication.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/openwebtext2/HEAD/mkdocs/docs/replication.md -------------------------------------------------------------------------------- /mkdocs/mkdocs.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/openwebtext2/HEAD/mkdocs/mkdocs.yml -------------------------------------------------------------------------------- /pushshift/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pushshift/download_pushshift_dumps.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/openwebtext2/HEAD/pushshift/download_pushshift_dumps.py -------------------------------------------------------------------------------- /pushshift/generate_urls.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/openwebtext2/HEAD/pushshift/generate_urls.py -------------------------------------------------------------------------------- /pushshift/models.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/openwebtext2/HEAD/pushshift/models.py -------------------------------------------------------------------------------- /pushshift/process_dump_files_sqlite.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/openwebtext2/HEAD/pushshift/process_dump_files_sqlite.py -------------------------------------------------------------------------------- /pushshift/pushshift_to_sqlite.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/openwebtext2/HEAD/pushshift/pushshift_to_sqlite.py -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/openwebtext2/HEAD/requirements.txt -------------------------------------------------------------------------------- /scraping/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scraping/filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/openwebtext2/HEAD/scraping/filter.py -------------------------------------------------------------------------------- /scraping/scrape_urls.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/openwebtext2/HEAD/scraping/scrape_urls.py -------------------------------------------------------------------------------- /scraping/scrapers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/openwebtext2/HEAD/scraping/scrapers.py -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /utils/archive_stream_readers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/openwebtext2/HEAD/utils/archive_stream_readers.py -------------------------------------------------------------------------------- /utils/archiver.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/openwebtext2/HEAD/utils/archiver.py -------------------------------------------------------------------------------- /utils/logger.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/openwebtext2/HEAD/utils/logger.py -------------------------------------------------------------------------------- /utils/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/openwebtext2/HEAD/utils/utils.py --------------------------------------------------------------------------------