├── .gitignore ├── .gitlab-ci.yml ├── CONTRIBUTORS ├── Dockerfile.sandcrawler-pytest ├── README.md ├── TODO ├── extra ├── RUNBOOK.md ├── blobs │ ├── README.md │ ├── minio │ │ ├── README.md │ │ └── minio.conf │ ├── seaweedfs │ │ └── README.md │ └── tasks.md ├── docker │ ├── README.md │ └── docker-compose.yml ├── hbase │ ├── howto.md │ ├── notes.txt │ └── schema_design.md └── nginx │ ├── README.md │ ├── fatcat-blobs │ ├── sandcrawler-db │ └── sandcrawler-minio ├── fetch_hadoop.sh ├── kafka ├── debugging_issues.txt ├── grobid_kafka_notes.txt ├── howto_rebalance.md ├── monitoring_commands.md └── topics.md ├── match_test_data ├── NOTES.txt ├── RESULTS.txt ├── crossref_sample.bibjson ├── grobid_sample.bibjson └── math_universe_releases.json ├── notes ├── backfill_scalding_rewrite.txt ├── crawl_cdx_merge.md ├── dryad_datasets.md ├── examples │ ├── 2021-11-12_broken_grobid_xml.md │ ├── dataset_examples.txt │ ├── html_test_journals.txt │ └── random_datasets.md ├── fuzzy_match_notes.md ├── grobid_munging.txt ├── hadoop_job_log.md ├── hbase_table_sizes.txt ├── html_ingest_notes.md ├── ingest │ ├── .gitignore │ ├── 2019-10-23_testing.md │ ├── 2020-01-14_bulk.md │ ├── 2020-02-04_ingest_backfills.md │ ├── 2020-02-18_ingest_backfills.md │ ├── 2020-02-21_ingest_backfills.md │ ├── 2020-02-22_fixed_domain.txt │ ├── 2020-02_unpaywall.md │ ├── 2020-03-02_ingests.txt │ ├── 2020-03-oa_but_not_marked.md │ ├── 2020-03_mag.md │ ├── 2020-03_s2.md │ ├── 2020-04-13_covid19.md │ ├── 2020-04_datacite.md │ ├── 2020-04_unpaywall.md │ ├── 2020-05_oai_pmh.md │ ├── 2020-05_pubmed.md │ ├── 2020-07_mag.md │ ├── 2020-08_daily_improvements.md │ ├── 2020-09_oa_doi.md │ ├── 2020-09_reingest.md │ ├── 2020-09_scielo.md │ ├── 2020-10_daily.md │ ├── 2020-10_unpaywall.md │ ├── 2020-11-04_arxiv.md │ ├── 2020-11_doaj.md │ ├── 2020-12-08_patch_crawl_notes.md │ ├── 2021-04_unpaywall.md │ ├── 2021-05_daily_improvements.md │ ├── 2021-07_unpaywall.md │ ├── 2021-08_mag.md │ ├── 2021-09-02_oai_pmh_patch.md │ ├── 2021-09-03_daily_improvements.md │ ├── 2021-09-03_patch_crawl.md │ ├── 2021-12-13_datasets.md │ ├── 2022-01-06_patch_crawl.md │ ├── 2022-01-13_doi_crawl.md │ ├── 2022-03_doaj.md │ ├── 2022-03_oaipmh.md │ ├── 2022-04_targeted.md │ ├── 2022-04_unpaywall.md │ ├── 2022-07-15_ingest_fixes.md │ ├── 2022-07-19_dblp.md │ ├── 2022-07_doaj.md │ ├── 2022-07_targeted.md │ ├── 2022-09_oaipmh.md │ ├── 2023-06_oaipmh.md │ ├── 2023-10_dimensions.md │ ├── 2023-12_oaipmh.md │ ├── NEXT.md │ └── es_csv_to_json.py ├── ingest_domains.txt ├── library_shopping.txt ├── match_filter_enrich.txt ├── old_extract_results.txt ├── petabox_ia_metadata.txt ├── possible_ingest_targets.txt ├── sandcrawler_worker_failures.md ├── tasks │ ├── 2020-01-06_heuristic_cdx.txt │ ├── 2020-01-27_cleanup_cdx.md │ ├── 2020-01-27_grobid_backfill.md │ ├── 2020-02-14_pdftrio.md │ ├── 2020-07-22_processing_holes.md │ ├── 2020-08-20_file_meta.md │ ├── 2020-10-21_pdfextract_holes.md │ ├── 2021-09-09_pdf_url_lists.md │ ├── 2021-10-29_crossref_refs_backfill.md │ ├── 2021-12-06_regrobid.md │ ├── 2022-01-07_grobid_platform_pdfs.md │ ├── 2022-03-07_ukraine_firedrill.md │ ├── 2022-04-27_pdf_url_lists.md │ └── 2022-11-21_andrzejklimczuk_cleanup.md ├── url_pattern_heuristic_backfill.txt └── url_pattern_heuristic_verification.txt ├── pig ├── .gitignore ├── Pipfile ├── Pipfile.lock ├── README.md ├── filter-cdx-join-urls.pig ├── filter-cdx-paper-pdfs.pig ├── filter-cdx-pdfs.pig ├── filter-cdx-ps.pig ├── filter-cdx-source-code-crude.pig ├── filter-cdx-tarball.pig ├── hbase-count-rows.pig ├── join-cdx-sha1.pig ├── pytest.ini └── tests │ ├── files │ ├── example.cdx │ ├── example.sha1b32 │ ├── papers_domain_words.cdx │ ├── papers_edu_tilde.cdx │ ├── papers_url_doi.cdx │ ├── papers_url_words.cdx │ ├── sourcecode.cdx │ └── tarballs.cdx │ ├── log4j.properties │ ├── pig.properties │ ├── pighelper.py │ ├── test_filter_cdx.py │ ├── test_filter_cdx_paper_pdfs.py │ ├── test_filter_software.py │ └── test_join_cdx.py ├── please ├── proposals ├── 2018_original_sandcrawler_rfc.md ├── 2019_ingest.md ├── 2019_pdftotext_pdfinfo.md ├── 20200129_pdf_ingest.md ├── 20200207_pdftrio.md ├── 20200211_nsq.md ├── 20201012_no_capture.md ├── 20201026_html_ingest.md ├── 20201103_xml_ingest.md ├── 2020_pdf_meta_thumbnails.md ├── 2020_seaweed_s3.md ├── 2021-04-22_crossref_db.md ├── 2021-09-09_component_ingest.md ├── 2021-09-09_fileset_ingest.md ├── 2021-09-13_src_ingest.md ├── 2021-09-21_spn_accounts.md ├── 2021-10-28_grobid_refs.md ├── 2021-12-09_trawling.md ├── brainstorm │ ├── 2021-debug_web_interface.md │ └── 2022-04-18_automated_heritrix_crawling.md └── schema_changes.sql ├── python ├── .coveragerc ├── .flake8 ├── .gitignore ├── .pylintrc ├── Makefile ├── Pipfile ├── Pipfile.lock ├── README.md ├── example.env ├── grobid_tool.py ├── ia_pdf_match.py ├── ingest_tool.py ├── pdfextract_tool.py ├── pdftrio_tool.py ├── persist_tool.py ├── pyproject.toml ├── pytest.ini ├── sandcrawler │ ├── __init__.py │ ├── db.py │ ├── fileset_platforms.py │ ├── fileset_strategies.py │ ├── fileset_types.py │ ├── grobid.py │ ├── html.py │ ├── html_metadata.py │ ├── ia.py │ ├── ingest_file.py │ ├── ingest_fileset.py │ ├── ingest_html.py │ ├── minio.py │ ├── misc.py │ ├── pdfextract.py │ ├── pdftrio.py │ ├── persist.py │ ├── workers.py │ └── xml.py ├── sandcrawler_worker.py ├── scripts │ ├── arabesque2ingestrequest.py │ ├── archiveorg_fileset.py │ ├── cdx_collection.py │ ├── covid2ingestrequest.py │ ├── deliver_dumpgrobid_to_s3.py │ ├── deliver_gwb_to_disk.py │ ├── deliver_gwb_to_s3.py │ ├── doaj2ingestrequest.py │ ├── enrich_scored_matches.py │ ├── fetch_cdx_sha1hex.py │ ├── filter_grobid_metadata.py │ ├── filter_groupworks.py │ ├── filter_scored_matches.py │ ├── grobid_affiliations.py │ ├── import_grobid_metadata.py │ ├── ingestrequest_row2json.py │ ├── manifest_converter.py │ ├── oai2ingestrequest.py │ ├── pdf_thumbnail.py │ └── unpaywall2ingestrequest.py ├── tests │ ├── files │ │ ├── 23b29ea36382680716be08fc71aa81bd226e8a85.xml │ │ ├── crossref_api_work_978-3-030-64953-1_4.json │ │ ├── crossref_api_work_s1047951103000064.json │ │ ├── dlib_05vanhyning.html │ │ ├── dummy.pdf │ │ ├── dummy_zip.zip │ │ ├── elife_article.html │ │ ├── example.cdx │ │ ├── example_grobid_metadata.json │ │ ├── first_monday_ojs3_fulltext.html │ │ ├── first_monday_ojs3_landingpage.html │ │ ├── genders_g58_fairlie.html │ │ ├── grobid_refs_978-3-030-64953-1_4.tei.xml │ │ ├── grobid_refs_s1047951103000064.tei.xml │ │ ├── nature_article.html │ │ ├── peerj_oa_article.html │ │ ├── plos_one_article.html │ │ ├── plos_one_article_no_icon_href.html │ │ ├── scielo_article.jats.xml │ │ ├── small.json │ │ └── small.xml │ ├── test_grobid.py │ ├── test_grobid2json.py │ ├── test_html.py │ ├── test_html_ingest.py │ ├── test_html_metadata.py │ ├── test_ingest.py │ ├── test_ingest_html.py │ ├── test_live_wayback.py │ ├── test_misc.py │ ├── test_pdfextract.py │ ├── test_pushers.py │ ├── test_savepagenow.py │ ├── test_wayback.py │ └── test_xml.py └── title_slug_denylist.txt ├── python_hadoop ├── Pipfile ├── Pipfile.lock ├── README.md ├── backfill_hbase_from_cdx.py ├── common.py ├── extraction_cdx_grobid.py ├── extraction_ungrobided.py ├── grobid2json.py ├── kafka_grobid_hbase.py ├── mrjob.conf └── tests │ ├── files │ ├── 23b29ea36382680716be08fc71aa81bd226e8a85.xml │ ├── example.cdx │ ├── example_grobid_metadata.json │ ├── example_ungrobided.tsv │ ├── small.json │ └── small.xml │ ├── test_backfill_hbase_from_cdx.py │ ├── test_common.py │ ├── test_extraction_cdx_grobid.py │ ├── test_extraction_ungrobided.py │ └── test_grobid2json.py ├── scalding ├── .gitignore ├── README.md ├── build.sbt ├── ia_cluster.conf ├── project │ ├── Dependencies.scala │ ├── build.properties │ └── plugins.sbt ├── scalastyle-config.xml ├── scalding-background.md ├── scalding-debugging.md └── src │ ├── main │ ├── resources │ │ └── slug-denylist.txt │ └── scala │ │ ├── example │ │ ├── SimpleHBaseSourceExample.scala │ │ └── WordCountJob.scala │ │ └── sandcrawler │ │ ├── BibjsonScorable.scala │ │ ├── CdxBackfillJob.scala │ │ ├── CrossrefScorable.scala │ │ ├── DumpFileMetaJob.scala │ │ ├── DumpGrobidMetaInsertableJob.scala │ │ ├── DumpGrobidStatusCodeJob.scala │ │ ├── DumpGrobidXmlJob.scala │ │ ├── DumpUnGrobidedJob.scala │ │ ├── FatcatScorable.scala │ │ ├── GrobidScorable.scala │ │ ├── GrobidScorableDumpJob.scala │ │ ├── GroupFatcatWorksJob.scala │ │ ├── GroupFatcatWorksSubsetJob.scala │ │ ├── HBaseBuilder.scala │ │ ├── HBaseColCountJob.scala │ │ ├── HBaseCountJob.scala │ │ ├── HBaseMimeCountJob.scala │ │ ├── HBaseRowCountJob.scala │ │ ├── HBaseStatusCodeCountJob.scala │ │ ├── HBaseStatusCountJob.scala │ │ ├── MatchBenchmarkJob.scala │ │ ├── MissingColumnDumpJob.scala │ │ ├── Scorable.scala │ │ ├── ScorableFeatures.scala │ │ ├── ScoreInsertable.scala │ │ ├── ScoreJob.scala │ │ └── StringUtilities.scala │ └── test │ └── scala │ ├── example │ ├── SimpleHBaseSourceExampleTest.scala │ └── WordCountTest.scala │ └── sandcrawler │ ├── CdxBackfillJob.scala │ ├── CrossrefScorableTest.scala │ ├── DumpUnGrobidedJobTest.scala │ ├── FatcatScorableTest.scala │ ├── GrobidScorableDumpJobTest.scala │ ├── GrobidScorableTest.scala │ ├── HBaseBuilderTest.scala │ ├── HBaseMimeCountTest.scala │ ├── HBaseRowCountTest.scala │ ├── HBaseStatusCodeCountTest.scala │ ├── HBaseStatusCountTest.scala │ ├── ScorableFeaturesTest.scala │ ├── ScorableTest.scala │ ├── ScoreInsertableJobTest.scala │ ├── ScoreJobTest.scala │ └── StringUtilitiesTest.scala └── sql ├── Makefile ├── README.md ├── backfill ├── backfill.md ├── backfill_cdx.py ├── backfill_file_meta.py ├── backfill_grobid.py ├── backfill_grobid_unpaywall.py ├── filter_transform_cdx.py └── petabox_transform.py ├── dump_file_meta.sql ├── dump_regrobid_pdf.sql ├── dump_regrobid_pdf_petabox.sql ├── dump_reingest_bulk.sql ├── dump_reingest_old.sql ├── dump_reingest_quarterly.sql ├── dump_reingest_spn.sql ├── dump_reingest_terminalstatus.sql ├── dump_reingest_weekly.sql ├── dump_unextracted_pdf.sql ├── dump_unextracted_pdf_petabox.sql ├── dump_ungrobid_pdf.sql ├── dump_ungrobid_pdf_petabox.sql ├── dump_unmatched_glutton_pdf.sql ├── example.env ├── ingest_again.md ├── ingest_stats ├── 2020-11-16_weekly_ingest_doi_prefix.txt └── 2020-11-16_weekly_ingest_terminal_domain.txt ├── migrations ├── 00000000000000_diesel_initial_setup │ ├── down.sql │ └── up.sql └── 2019-12-19-060141_init │ ├── down.sql │ └── up.sql ├── monitoring_queries.md ├── pdftrio_queries.md ├── random_queries.md ├── reingest_bulk.sh ├── reingest_old.sh ├── reingest_quarterly.sh ├── reingest_spn.sh ├── reingest_terminalstatus_forcerecrawl.sh ├── reingest_weekly.sh ├── sandcrawler_schema.sql ├── stats ├── 2020-01-13_stats.txt ├── 2020-01-31_supplement.txt ├── 2020-02-24_stats.txt ├── 2020-05-03_stats.txt ├── 2020-07-23_stats.txt ├── 2020-09-14_stats.txt ├── 2021-04-07_stats.txt ├── 2021-04-08_table_sizes.txt ├── 2021-04-12_ingest_domain_summary_30d.txt ├── 2021-11-01_table_sizes.txt ├── 2021-11-26_stats.txt ├── 2021-12-02_table_sizes.txt ├── 2022-04-26_stats.txt ├── 2022-04-27_crawl_changelog.txt ├── 2022-05-11_crawl_changelog.txt ├── 2022-09-06_stats.txt ├── 2022-11-23_table_sizes.txt └── README.md └── table_sizes.md /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/.gitignore -------------------------------------------------------------------------------- /.gitlab-ci.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/.gitlab-ci.yml -------------------------------------------------------------------------------- /CONTRIBUTORS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/CONTRIBUTORS -------------------------------------------------------------------------------- /Dockerfile.sandcrawler-pytest: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/Dockerfile.sandcrawler-pytest -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/README.md -------------------------------------------------------------------------------- /TODO: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/TODO -------------------------------------------------------------------------------- /extra/RUNBOOK.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/extra/RUNBOOK.md -------------------------------------------------------------------------------- /extra/blobs/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/extra/blobs/README.md -------------------------------------------------------------------------------- /extra/blobs/minio/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/extra/blobs/minio/README.md -------------------------------------------------------------------------------- /extra/blobs/minio/minio.conf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/extra/blobs/minio/minio.conf -------------------------------------------------------------------------------- /extra/blobs/seaweedfs/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/extra/blobs/seaweedfs/README.md -------------------------------------------------------------------------------- /extra/blobs/tasks.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/extra/blobs/tasks.md -------------------------------------------------------------------------------- /extra/docker/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/extra/docker/README.md -------------------------------------------------------------------------------- /extra/docker/docker-compose.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/extra/docker/docker-compose.yml -------------------------------------------------------------------------------- /extra/hbase/howto.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/extra/hbase/howto.md -------------------------------------------------------------------------------- /extra/hbase/notes.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/extra/hbase/notes.txt -------------------------------------------------------------------------------- /extra/hbase/schema_design.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/extra/hbase/schema_design.md -------------------------------------------------------------------------------- /extra/nginx/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/extra/nginx/README.md -------------------------------------------------------------------------------- /extra/nginx/fatcat-blobs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/extra/nginx/fatcat-blobs -------------------------------------------------------------------------------- /extra/nginx/sandcrawler-db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/extra/nginx/sandcrawler-db -------------------------------------------------------------------------------- /extra/nginx/sandcrawler-minio: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/extra/nginx/sandcrawler-minio -------------------------------------------------------------------------------- /fetch_hadoop.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/fetch_hadoop.sh -------------------------------------------------------------------------------- /kafka/debugging_issues.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/kafka/debugging_issues.txt -------------------------------------------------------------------------------- /kafka/grobid_kafka_notes.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/kafka/grobid_kafka_notes.txt -------------------------------------------------------------------------------- /kafka/howto_rebalance.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/kafka/howto_rebalance.md -------------------------------------------------------------------------------- /kafka/monitoring_commands.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/kafka/monitoring_commands.md -------------------------------------------------------------------------------- /kafka/topics.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/kafka/topics.md -------------------------------------------------------------------------------- /match_test_data/NOTES.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/match_test_data/NOTES.txt -------------------------------------------------------------------------------- /match_test_data/RESULTS.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/match_test_data/RESULTS.txt -------------------------------------------------------------------------------- /match_test_data/crossref_sample.bibjson: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/match_test_data/crossref_sample.bibjson -------------------------------------------------------------------------------- /match_test_data/grobid_sample.bibjson: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/match_test_data/grobid_sample.bibjson -------------------------------------------------------------------------------- /match_test_data/math_universe_releases.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/match_test_data/math_universe_releases.json -------------------------------------------------------------------------------- /notes/backfill_scalding_rewrite.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/backfill_scalding_rewrite.txt -------------------------------------------------------------------------------- /notes/crawl_cdx_merge.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/crawl_cdx_merge.md -------------------------------------------------------------------------------- /notes/dryad_datasets.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/dryad_datasets.md -------------------------------------------------------------------------------- /notes/examples/2021-11-12_broken_grobid_xml.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/examples/2021-11-12_broken_grobid_xml.md -------------------------------------------------------------------------------- /notes/examples/dataset_examples.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/examples/dataset_examples.txt -------------------------------------------------------------------------------- /notes/examples/html_test_journals.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/examples/html_test_journals.txt -------------------------------------------------------------------------------- /notes/examples/random_datasets.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/examples/random_datasets.md -------------------------------------------------------------------------------- /notes/fuzzy_match_notes.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/fuzzy_match_notes.md -------------------------------------------------------------------------------- /notes/grobid_munging.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/grobid_munging.txt -------------------------------------------------------------------------------- /notes/hadoop_job_log.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/hadoop_job_log.md -------------------------------------------------------------------------------- /notes/hbase_table_sizes.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/hbase_table_sizes.txt -------------------------------------------------------------------------------- /notes/html_ingest_notes.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/html_ingest_notes.md -------------------------------------------------------------------------------- /notes/ingest/.gitignore: -------------------------------------------------------------------------------- 1 | *.csv 2 | *.json 3 | -------------------------------------------------------------------------------- /notes/ingest/2019-10-23_testing.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/ingest/2019-10-23_testing.md -------------------------------------------------------------------------------- /notes/ingest/2020-01-14_bulk.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/ingest/2020-01-14_bulk.md -------------------------------------------------------------------------------- /notes/ingest/2020-02-04_ingest_backfills.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/ingest/2020-02-04_ingest_backfills.md -------------------------------------------------------------------------------- /notes/ingest/2020-02-18_ingest_backfills.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/ingest/2020-02-18_ingest_backfills.md -------------------------------------------------------------------------------- /notes/ingest/2020-02-21_ingest_backfills.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/ingest/2020-02-21_ingest_backfills.md -------------------------------------------------------------------------------- /notes/ingest/2020-02-22_fixed_domain.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/ingest/2020-02-22_fixed_domain.txt -------------------------------------------------------------------------------- /notes/ingest/2020-02_unpaywall.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/ingest/2020-02_unpaywall.md -------------------------------------------------------------------------------- /notes/ingest/2020-03-02_ingests.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/ingest/2020-03-02_ingests.txt -------------------------------------------------------------------------------- /notes/ingest/2020-03-oa_but_not_marked.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/ingest/2020-03-oa_but_not_marked.md -------------------------------------------------------------------------------- /notes/ingest/2020-03_mag.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/ingest/2020-03_mag.md -------------------------------------------------------------------------------- /notes/ingest/2020-03_s2.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/ingest/2020-03_s2.md -------------------------------------------------------------------------------- /notes/ingest/2020-04-13_covid19.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/ingest/2020-04-13_covid19.md -------------------------------------------------------------------------------- /notes/ingest/2020-04_datacite.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/ingest/2020-04_datacite.md -------------------------------------------------------------------------------- /notes/ingest/2020-04_unpaywall.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/ingest/2020-04_unpaywall.md -------------------------------------------------------------------------------- /notes/ingest/2020-05_oai_pmh.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/ingest/2020-05_oai_pmh.md -------------------------------------------------------------------------------- /notes/ingest/2020-05_pubmed.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/ingest/2020-05_pubmed.md -------------------------------------------------------------------------------- /notes/ingest/2020-07_mag.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/ingest/2020-07_mag.md -------------------------------------------------------------------------------- /notes/ingest/2020-08_daily_improvements.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/ingest/2020-08_daily_improvements.md -------------------------------------------------------------------------------- /notes/ingest/2020-09_oa_doi.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/ingest/2020-09_oa_doi.md -------------------------------------------------------------------------------- /notes/ingest/2020-09_reingest.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/ingest/2020-09_reingest.md -------------------------------------------------------------------------------- /notes/ingest/2020-09_scielo.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/ingest/2020-09_scielo.md -------------------------------------------------------------------------------- /notes/ingest/2020-10_daily.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/ingest/2020-10_daily.md -------------------------------------------------------------------------------- /notes/ingest/2020-10_unpaywall.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/ingest/2020-10_unpaywall.md -------------------------------------------------------------------------------- /notes/ingest/2020-11-04_arxiv.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/ingest/2020-11-04_arxiv.md -------------------------------------------------------------------------------- /notes/ingest/2020-11_doaj.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/ingest/2020-11_doaj.md -------------------------------------------------------------------------------- /notes/ingest/2020-12-08_patch_crawl_notes.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/ingest/2020-12-08_patch_crawl_notes.md -------------------------------------------------------------------------------- /notes/ingest/2021-04_unpaywall.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/ingest/2021-04_unpaywall.md -------------------------------------------------------------------------------- /notes/ingest/2021-05_daily_improvements.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/ingest/2021-05_daily_improvements.md -------------------------------------------------------------------------------- /notes/ingest/2021-07_unpaywall.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/ingest/2021-07_unpaywall.md -------------------------------------------------------------------------------- /notes/ingest/2021-08_mag.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/ingest/2021-08_mag.md -------------------------------------------------------------------------------- /notes/ingest/2021-09-02_oai_pmh_patch.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/ingest/2021-09-02_oai_pmh_patch.md -------------------------------------------------------------------------------- /notes/ingest/2021-09-03_daily_improvements.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/ingest/2021-09-03_daily_improvements.md -------------------------------------------------------------------------------- /notes/ingest/2021-09-03_patch_crawl.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/ingest/2021-09-03_patch_crawl.md -------------------------------------------------------------------------------- /notes/ingest/2021-12-13_datasets.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/ingest/2021-12-13_datasets.md -------------------------------------------------------------------------------- /notes/ingest/2022-01-06_patch_crawl.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/ingest/2022-01-06_patch_crawl.md -------------------------------------------------------------------------------- /notes/ingest/2022-01-13_doi_crawl.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/ingest/2022-01-13_doi_crawl.md -------------------------------------------------------------------------------- /notes/ingest/2022-03_doaj.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/ingest/2022-03_doaj.md -------------------------------------------------------------------------------- /notes/ingest/2022-03_oaipmh.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/ingest/2022-03_oaipmh.md -------------------------------------------------------------------------------- /notes/ingest/2022-04_targeted.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/ingest/2022-04_targeted.md -------------------------------------------------------------------------------- /notes/ingest/2022-04_unpaywall.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/ingest/2022-04_unpaywall.md -------------------------------------------------------------------------------- /notes/ingest/2022-07-15_ingest_fixes.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/ingest/2022-07-15_ingest_fixes.md -------------------------------------------------------------------------------- /notes/ingest/2022-07-19_dblp.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/ingest/2022-07-19_dblp.md -------------------------------------------------------------------------------- /notes/ingest/2022-07_doaj.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/ingest/2022-07_doaj.md -------------------------------------------------------------------------------- /notes/ingest/2022-07_targeted.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/ingest/2022-07_targeted.md -------------------------------------------------------------------------------- /notes/ingest/2022-09_oaipmh.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/ingest/2022-09_oaipmh.md -------------------------------------------------------------------------------- /notes/ingest/2023-06_oaipmh.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/ingest/2023-06_oaipmh.md -------------------------------------------------------------------------------- /notes/ingest/2023-10_dimensions.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/ingest/2023-10_dimensions.md -------------------------------------------------------------------------------- /notes/ingest/2023-12_oaipmh.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/ingest/2023-12_oaipmh.md -------------------------------------------------------------------------------- /notes/ingest/NEXT.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/ingest/NEXT.md -------------------------------------------------------------------------------- /notes/ingest/es_csv_to_json.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/ingest/es_csv_to_json.py -------------------------------------------------------------------------------- /notes/ingest_domains.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/ingest_domains.txt -------------------------------------------------------------------------------- /notes/library_shopping.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/library_shopping.txt -------------------------------------------------------------------------------- /notes/match_filter_enrich.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/match_filter_enrich.txt -------------------------------------------------------------------------------- /notes/old_extract_results.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/old_extract_results.txt -------------------------------------------------------------------------------- /notes/petabox_ia_metadata.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/petabox_ia_metadata.txt -------------------------------------------------------------------------------- /notes/possible_ingest_targets.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/possible_ingest_targets.txt -------------------------------------------------------------------------------- /notes/sandcrawler_worker_failures.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/sandcrawler_worker_failures.md -------------------------------------------------------------------------------- /notes/tasks/2020-01-06_heuristic_cdx.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/tasks/2020-01-06_heuristic_cdx.txt -------------------------------------------------------------------------------- /notes/tasks/2020-01-27_cleanup_cdx.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/tasks/2020-01-27_cleanup_cdx.md -------------------------------------------------------------------------------- /notes/tasks/2020-01-27_grobid_backfill.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/tasks/2020-01-27_grobid_backfill.md -------------------------------------------------------------------------------- /notes/tasks/2020-02-14_pdftrio.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/tasks/2020-02-14_pdftrio.md -------------------------------------------------------------------------------- /notes/tasks/2020-07-22_processing_holes.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/tasks/2020-07-22_processing_holes.md -------------------------------------------------------------------------------- /notes/tasks/2020-08-20_file_meta.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/tasks/2020-08-20_file_meta.md -------------------------------------------------------------------------------- /notes/tasks/2020-10-21_pdfextract_holes.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/tasks/2020-10-21_pdfextract_holes.md -------------------------------------------------------------------------------- /notes/tasks/2021-09-09_pdf_url_lists.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/tasks/2021-09-09_pdf_url_lists.md -------------------------------------------------------------------------------- /notes/tasks/2021-10-29_crossref_refs_backfill.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/tasks/2021-10-29_crossref_refs_backfill.md -------------------------------------------------------------------------------- /notes/tasks/2021-12-06_regrobid.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/tasks/2021-12-06_regrobid.md -------------------------------------------------------------------------------- /notes/tasks/2022-01-07_grobid_platform_pdfs.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/tasks/2022-01-07_grobid_platform_pdfs.md -------------------------------------------------------------------------------- /notes/tasks/2022-03-07_ukraine_firedrill.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/tasks/2022-03-07_ukraine_firedrill.md -------------------------------------------------------------------------------- /notes/tasks/2022-04-27_pdf_url_lists.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/tasks/2022-04-27_pdf_url_lists.md -------------------------------------------------------------------------------- /notes/tasks/2022-11-21_andrzejklimczuk_cleanup.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/tasks/2022-11-21_andrzejklimczuk_cleanup.md -------------------------------------------------------------------------------- /notes/url_pattern_heuristic_backfill.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/url_pattern_heuristic_backfill.txt -------------------------------------------------------------------------------- /notes/url_pattern_heuristic_verification.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/notes/url_pattern_heuristic_verification.txt -------------------------------------------------------------------------------- /pig/.gitignore: -------------------------------------------------------------------------------- 1 | deps 2 | *.log 3 | -------------------------------------------------------------------------------- /pig/Pipfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/pig/Pipfile -------------------------------------------------------------------------------- /pig/Pipfile.lock: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/pig/Pipfile.lock -------------------------------------------------------------------------------- /pig/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/pig/README.md -------------------------------------------------------------------------------- /pig/filter-cdx-join-urls.pig: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/pig/filter-cdx-join-urls.pig -------------------------------------------------------------------------------- /pig/filter-cdx-paper-pdfs.pig: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/pig/filter-cdx-paper-pdfs.pig -------------------------------------------------------------------------------- /pig/filter-cdx-pdfs.pig: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/pig/filter-cdx-pdfs.pig -------------------------------------------------------------------------------- /pig/filter-cdx-ps.pig: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/pig/filter-cdx-ps.pig -------------------------------------------------------------------------------- /pig/filter-cdx-source-code-crude.pig: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/pig/filter-cdx-source-code-crude.pig -------------------------------------------------------------------------------- /pig/filter-cdx-tarball.pig: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/pig/filter-cdx-tarball.pig -------------------------------------------------------------------------------- /pig/hbase-count-rows.pig: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/pig/hbase-count-rows.pig -------------------------------------------------------------------------------- /pig/join-cdx-sha1.pig: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/pig/join-cdx-sha1.pig -------------------------------------------------------------------------------- /pig/pytest.ini: -------------------------------------------------------------------------------- 1 | 2 | [pytest] 3 | norecursedirs = deps 4 | -------------------------------------------------------------------------------- /pig/tests/files/example.cdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/pig/tests/files/example.cdx -------------------------------------------------------------------------------- /pig/tests/files/example.sha1b32: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/pig/tests/files/example.sha1b32 -------------------------------------------------------------------------------- /pig/tests/files/papers_domain_words.cdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/pig/tests/files/papers_domain_words.cdx -------------------------------------------------------------------------------- /pig/tests/files/papers_edu_tilde.cdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/pig/tests/files/papers_edu_tilde.cdx -------------------------------------------------------------------------------- /pig/tests/files/papers_url_doi.cdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/pig/tests/files/papers_url_doi.cdx -------------------------------------------------------------------------------- /pig/tests/files/papers_url_words.cdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/pig/tests/files/papers_url_words.cdx -------------------------------------------------------------------------------- /pig/tests/files/sourcecode.cdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/pig/tests/files/sourcecode.cdx -------------------------------------------------------------------------------- /pig/tests/files/tarballs.cdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/pig/tests/files/tarballs.cdx -------------------------------------------------------------------------------- /pig/tests/log4j.properties: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/pig/tests/log4j.properties -------------------------------------------------------------------------------- /pig/tests/pig.properties: -------------------------------------------------------------------------------- 1 | log4jconf=./tests/log4j.properties 2 | stop.on.failure=true 3 | -------------------------------------------------------------------------------- /pig/tests/pighelper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/pig/tests/pighelper.py -------------------------------------------------------------------------------- /pig/tests/test_filter_cdx.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/pig/tests/test_filter_cdx.py -------------------------------------------------------------------------------- /pig/tests/test_filter_cdx_paper_pdfs.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/pig/tests/test_filter_cdx_paper_pdfs.py -------------------------------------------------------------------------------- /pig/tests/test_filter_software.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/pig/tests/test_filter_software.py -------------------------------------------------------------------------------- /pig/tests/test_join_cdx.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/pig/tests/test_join_cdx.py -------------------------------------------------------------------------------- /please: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/please -------------------------------------------------------------------------------- /proposals/2018_original_sandcrawler_rfc.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/proposals/2018_original_sandcrawler_rfc.md -------------------------------------------------------------------------------- /proposals/2019_ingest.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/proposals/2019_ingest.md -------------------------------------------------------------------------------- /proposals/2019_pdftotext_pdfinfo.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/proposals/2019_pdftotext_pdfinfo.md -------------------------------------------------------------------------------- /proposals/20200129_pdf_ingest.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/proposals/20200129_pdf_ingest.md -------------------------------------------------------------------------------- /proposals/20200207_pdftrio.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/proposals/20200207_pdftrio.md -------------------------------------------------------------------------------- /proposals/20200211_nsq.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/proposals/20200211_nsq.md -------------------------------------------------------------------------------- /proposals/20201012_no_capture.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/proposals/20201012_no_capture.md -------------------------------------------------------------------------------- /proposals/20201026_html_ingest.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/proposals/20201026_html_ingest.md -------------------------------------------------------------------------------- /proposals/20201103_xml_ingest.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/proposals/20201103_xml_ingest.md -------------------------------------------------------------------------------- /proposals/2020_pdf_meta_thumbnails.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/proposals/2020_pdf_meta_thumbnails.md -------------------------------------------------------------------------------- /proposals/2020_seaweed_s3.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/proposals/2020_seaweed_s3.md -------------------------------------------------------------------------------- /proposals/2021-04-22_crossref_db.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/proposals/2021-04-22_crossref_db.md -------------------------------------------------------------------------------- /proposals/2021-09-09_component_ingest.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/proposals/2021-09-09_component_ingest.md -------------------------------------------------------------------------------- /proposals/2021-09-09_fileset_ingest.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/proposals/2021-09-09_fileset_ingest.md -------------------------------------------------------------------------------- /proposals/2021-09-13_src_ingest.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/proposals/2021-09-13_src_ingest.md -------------------------------------------------------------------------------- /proposals/2021-09-21_spn_accounts.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/proposals/2021-09-21_spn_accounts.md -------------------------------------------------------------------------------- /proposals/2021-10-28_grobid_refs.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/proposals/2021-10-28_grobid_refs.md -------------------------------------------------------------------------------- /proposals/2021-12-09_trawling.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/proposals/2021-12-09_trawling.md -------------------------------------------------------------------------------- /proposals/brainstorm/2021-debug_web_interface.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/proposals/brainstorm/2021-debug_web_interface.md -------------------------------------------------------------------------------- /proposals/brainstorm/2022-04-18_automated_heritrix_crawling.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/proposals/brainstorm/2022-04-18_automated_heritrix_crawling.md -------------------------------------------------------------------------------- /proposals/schema_changes.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/proposals/schema_changes.sql -------------------------------------------------------------------------------- /python/.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | omit = tests/* 3 | source = 4 | sandcrawler 5 | -------------------------------------------------------------------------------- /python/.flake8: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/.flake8 -------------------------------------------------------------------------------- /python/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/.gitignore -------------------------------------------------------------------------------- /python/.pylintrc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/.pylintrc -------------------------------------------------------------------------------- /python/Makefile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/Makefile -------------------------------------------------------------------------------- /python/Pipfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/Pipfile -------------------------------------------------------------------------------- /python/Pipfile.lock: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/Pipfile.lock -------------------------------------------------------------------------------- /python/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/README.md -------------------------------------------------------------------------------- /python/example.env: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/example.env -------------------------------------------------------------------------------- /python/grobid_tool.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/grobid_tool.py -------------------------------------------------------------------------------- /python/ia_pdf_match.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/ia_pdf_match.py -------------------------------------------------------------------------------- /python/ingest_tool.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/ingest_tool.py -------------------------------------------------------------------------------- /python/pdfextract_tool.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/pdfextract_tool.py -------------------------------------------------------------------------------- /python/pdftrio_tool.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/pdftrio_tool.py -------------------------------------------------------------------------------- /python/persist_tool.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/persist_tool.py -------------------------------------------------------------------------------- /python/pyproject.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/pyproject.toml -------------------------------------------------------------------------------- /python/pytest.ini: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/pytest.ini -------------------------------------------------------------------------------- /python/sandcrawler/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/sandcrawler/__init__.py -------------------------------------------------------------------------------- /python/sandcrawler/db.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/sandcrawler/db.py -------------------------------------------------------------------------------- /python/sandcrawler/fileset_platforms.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/sandcrawler/fileset_platforms.py -------------------------------------------------------------------------------- /python/sandcrawler/fileset_strategies.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/sandcrawler/fileset_strategies.py -------------------------------------------------------------------------------- /python/sandcrawler/fileset_types.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/sandcrawler/fileset_types.py -------------------------------------------------------------------------------- /python/sandcrawler/grobid.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/sandcrawler/grobid.py -------------------------------------------------------------------------------- /python/sandcrawler/html.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/sandcrawler/html.py -------------------------------------------------------------------------------- /python/sandcrawler/html_metadata.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/sandcrawler/html_metadata.py -------------------------------------------------------------------------------- /python/sandcrawler/ia.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/sandcrawler/ia.py -------------------------------------------------------------------------------- /python/sandcrawler/ingest_file.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/sandcrawler/ingest_file.py -------------------------------------------------------------------------------- /python/sandcrawler/ingest_fileset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/sandcrawler/ingest_fileset.py -------------------------------------------------------------------------------- /python/sandcrawler/ingest_html.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/sandcrawler/ingest_html.py -------------------------------------------------------------------------------- /python/sandcrawler/minio.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/sandcrawler/minio.py -------------------------------------------------------------------------------- /python/sandcrawler/misc.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/sandcrawler/misc.py -------------------------------------------------------------------------------- /python/sandcrawler/pdfextract.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/sandcrawler/pdfextract.py -------------------------------------------------------------------------------- /python/sandcrawler/pdftrio.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/sandcrawler/pdftrio.py -------------------------------------------------------------------------------- /python/sandcrawler/persist.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/sandcrawler/persist.py -------------------------------------------------------------------------------- /python/sandcrawler/workers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/sandcrawler/workers.py -------------------------------------------------------------------------------- /python/sandcrawler/xml.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/sandcrawler/xml.py -------------------------------------------------------------------------------- /python/sandcrawler_worker.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/sandcrawler_worker.py -------------------------------------------------------------------------------- /python/scripts/arabesque2ingestrequest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/scripts/arabesque2ingestrequest.py -------------------------------------------------------------------------------- /python/scripts/archiveorg_fileset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/scripts/archiveorg_fileset.py -------------------------------------------------------------------------------- /python/scripts/cdx_collection.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/scripts/cdx_collection.py -------------------------------------------------------------------------------- /python/scripts/covid2ingestrequest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/scripts/covid2ingestrequest.py -------------------------------------------------------------------------------- /python/scripts/deliver_dumpgrobid_to_s3.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/scripts/deliver_dumpgrobid_to_s3.py -------------------------------------------------------------------------------- /python/scripts/deliver_gwb_to_disk.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/scripts/deliver_gwb_to_disk.py -------------------------------------------------------------------------------- /python/scripts/deliver_gwb_to_s3.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/scripts/deliver_gwb_to_s3.py -------------------------------------------------------------------------------- /python/scripts/doaj2ingestrequest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/scripts/doaj2ingestrequest.py -------------------------------------------------------------------------------- /python/scripts/enrich_scored_matches.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/scripts/enrich_scored_matches.py -------------------------------------------------------------------------------- /python/scripts/fetch_cdx_sha1hex.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/scripts/fetch_cdx_sha1hex.py -------------------------------------------------------------------------------- /python/scripts/filter_grobid_metadata.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/scripts/filter_grobid_metadata.py -------------------------------------------------------------------------------- /python/scripts/filter_groupworks.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/scripts/filter_groupworks.py -------------------------------------------------------------------------------- /python/scripts/filter_scored_matches.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/scripts/filter_scored_matches.py -------------------------------------------------------------------------------- /python/scripts/grobid_affiliations.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/scripts/grobid_affiliations.py -------------------------------------------------------------------------------- /python/scripts/import_grobid_metadata.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/scripts/import_grobid_metadata.py -------------------------------------------------------------------------------- /python/scripts/ingestrequest_row2json.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/scripts/ingestrequest_row2json.py -------------------------------------------------------------------------------- /python/scripts/manifest_converter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/scripts/manifest_converter.py -------------------------------------------------------------------------------- /python/scripts/oai2ingestrequest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/scripts/oai2ingestrequest.py -------------------------------------------------------------------------------- /python/scripts/pdf_thumbnail.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/scripts/pdf_thumbnail.py -------------------------------------------------------------------------------- /python/scripts/unpaywall2ingestrequest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/scripts/unpaywall2ingestrequest.py -------------------------------------------------------------------------------- /python/tests/files/23b29ea36382680716be08fc71aa81bd226e8a85.xml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/tests/files/23b29ea36382680716be08fc71aa81bd226e8a85.xml -------------------------------------------------------------------------------- /python/tests/files/crossref_api_work_978-3-030-64953-1_4.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/tests/files/crossref_api_work_978-3-030-64953-1_4.json -------------------------------------------------------------------------------- /python/tests/files/crossref_api_work_s1047951103000064.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/tests/files/crossref_api_work_s1047951103000064.json -------------------------------------------------------------------------------- /python/tests/files/dlib_05vanhyning.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/tests/files/dlib_05vanhyning.html -------------------------------------------------------------------------------- /python/tests/files/dummy.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/tests/files/dummy.pdf -------------------------------------------------------------------------------- /python/tests/files/dummy_zip.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/tests/files/dummy_zip.zip -------------------------------------------------------------------------------- /python/tests/files/elife_article.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/tests/files/elife_article.html -------------------------------------------------------------------------------- /python/tests/files/example.cdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/tests/files/example.cdx -------------------------------------------------------------------------------- /python/tests/files/example_grobid_metadata.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/tests/files/example_grobid_metadata.json -------------------------------------------------------------------------------- /python/tests/files/first_monday_ojs3_fulltext.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/tests/files/first_monday_ojs3_fulltext.html -------------------------------------------------------------------------------- /python/tests/files/first_monday_ojs3_landingpage.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/tests/files/first_monday_ojs3_landingpage.html -------------------------------------------------------------------------------- /python/tests/files/genders_g58_fairlie.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/tests/files/genders_g58_fairlie.html -------------------------------------------------------------------------------- /python/tests/files/grobid_refs_978-3-030-64953-1_4.tei.xml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/tests/files/grobid_refs_978-3-030-64953-1_4.tei.xml -------------------------------------------------------------------------------- /python/tests/files/grobid_refs_s1047951103000064.tei.xml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/tests/files/grobid_refs_s1047951103000064.tei.xml -------------------------------------------------------------------------------- /python/tests/files/nature_article.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/tests/files/nature_article.html -------------------------------------------------------------------------------- /python/tests/files/peerj_oa_article.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/tests/files/peerj_oa_article.html -------------------------------------------------------------------------------- /python/tests/files/plos_one_article.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/tests/files/plos_one_article.html -------------------------------------------------------------------------------- /python/tests/files/plos_one_article_no_icon_href.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/tests/files/plos_one_article_no_icon_href.html -------------------------------------------------------------------------------- /python/tests/files/scielo_article.jats.xml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/tests/files/scielo_article.jats.xml -------------------------------------------------------------------------------- /python/tests/files/small.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/tests/files/small.json -------------------------------------------------------------------------------- /python/tests/files/small.xml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/tests/files/small.xml -------------------------------------------------------------------------------- /python/tests/test_grobid.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/tests/test_grobid.py -------------------------------------------------------------------------------- /python/tests/test_grobid2json.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/tests/test_grobid2json.py -------------------------------------------------------------------------------- /python/tests/test_html.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/tests/test_html.py -------------------------------------------------------------------------------- /python/tests/test_html_ingest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/tests/test_html_ingest.py -------------------------------------------------------------------------------- /python/tests/test_html_metadata.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/tests/test_html_metadata.py -------------------------------------------------------------------------------- /python/tests/test_ingest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/tests/test_ingest.py -------------------------------------------------------------------------------- /python/tests/test_ingest_html.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/tests/test_ingest_html.py -------------------------------------------------------------------------------- /python/tests/test_live_wayback.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/tests/test_live_wayback.py -------------------------------------------------------------------------------- /python/tests/test_misc.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/tests/test_misc.py -------------------------------------------------------------------------------- /python/tests/test_pdfextract.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/tests/test_pdfextract.py -------------------------------------------------------------------------------- /python/tests/test_pushers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/tests/test_pushers.py -------------------------------------------------------------------------------- /python/tests/test_savepagenow.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/tests/test_savepagenow.py -------------------------------------------------------------------------------- /python/tests/test_wayback.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/tests/test_wayback.py -------------------------------------------------------------------------------- /python/tests/test_xml.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/tests/test_xml.py -------------------------------------------------------------------------------- /python/title_slug_denylist.txt: -------------------------------------------------------------------------------- 1 | ../scalding/src/main/resources/slug-denylist.txt -------------------------------------------------------------------------------- /python_hadoop/Pipfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python_hadoop/Pipfile -------------------------------------------------------------------------------- /python_hadoop/Pipfile.lock: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python_hadoop/Pipfile.lock -------------------------------------------------------------------------------- /python_hadoop/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python_hadoop/README.md -------------------------------------------------------------------------------- /python_hadoop/backfill_hbase_from_cdx.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python_hadoop/backfill_hbase_from_cdx.py -------------------------------------------------------------------------------- /python_hadoop/common.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python_hadoop/common.py -------------------------------------------------------------------------------- /python_hadoop/extraction_cdx_grobid.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python_hadoop/extraction_cdx_grobid.py -------------------------------------------------------------------------------- /python_hadoop/extraction_ungrobided.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python_hadoop/extraction_ungrobided.py -------------------------------------------------------------------------------- /python_hadoop/grobid2json.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python_hadoop/grobid2json.py -------------------------------------------------------------------------------- /python_hadoop/kafka_grobid_hbase.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python_hadoop/kafka_grobid_hbase.py -------------------------------------------------------------------------------- /python_hadoop/mrjob.conf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python_hadoop/mrjob.conf -------------------------------------------------------------------------------- /python_hadoop/tests/files/23b29ea36382680716be08fc71aa81bd226e8a85.xml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python_hadoop/tests/files/23b29ea36382680716be08fc71aa81bd226e8a85.xml -------------------------------------------------------------------------------- /python_hadoop/tests/files/example.cdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python_hadoop/tests/files/example.cdx -------------------------------------------------------------------------------- /python_hadoop/tests/files/example_grobid_metadata.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python_hadoop/tests/files/example_grobid_metadata.json -------------------------------------------------------------------------------- /python_hadoop/tests/files/example_ungrobided.tsv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python_hadoop/tests/files/example_ungrobided.tsv -------------------------------------------------------------------------------- /python_hadoop/tests/files/small.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python_hadoop/tests/files/small.json -------------------------------------------------------------------------------- /python_hadoop/tests/files/small.xml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python_hadoop/tests/files/small.xml -------------------------------------------------------------------------------- /python_hadoop/tests/test_backfill_hbase_from_cdx.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python_hadoop/tests/test_backfill_hbase_from_cdx.py -------------------------------------------------------------------------------- /python_hadoop/tests/test_common.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python_hadoop/tests/test_common.py -------------------------------------------------------------------------------- /python_hadoop/tests/test_extraction_cdx_grobid.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python_hadoop/tests/test_extraction_cdx_grobid.py -------------------------------------------------------------------------------- /python_hadoop/tests/test_extraction_ungrobided.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python_hadoop/tests/test_extraction_ungrobided.py -------------------------------------------------------------------------------- /python_hadoop/tests/test_grobid2json.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python_hadoop/tests/test_grobid2json.py -------------------------------------------------------------------------------- /scalding/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/scalding/.gitignore -------------------------------------------------------------------------------- /scalding/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/scalding/README.md -------------------------------------------------------------------------------- /scalding/build.sbt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/scalding/build.sbt -------------------------------------------------------------------------------- /scalding/ia_cluster.conf: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scalding/project/Dependencies.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/scalding/project/Dependencies.scala -------------------------------------------------------------------------------- /scalding/project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=1.1.1 2 | -------------------------------------------------------------------------------- /scalding/project/plugins.sbt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/scalding/project/plugins.sbt -------------------------------------------------------------------------------- /scalding/scalastyle-config.xml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/scalding/scalastyle-config.xml -------------------------------------------------------------------------------- /scalding/scalding-background.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/scalding/scalding-background.md -------------------------------------------------------------------------------- /scalding/scalding-debugging.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/scalding/scalding-debugging.md -------------------------------------------------------------------------------- /scalding/src/main/resources/slug-denylist.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/scalding/src/main/resources/slug-denylist.txt -------------------------------------------------------------------------------- /scalding/src/main/scala/example/SimpleHBaseSourceExample.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/scalding/src/main/scala/example/SimpleHBaseSourceExample.scala -------------------------------------------------------------------------------- /scalding/src/main/scala/example/WordCountJob.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/scalding/src/main/scala/example/WordCountJob.scala -------------------------------------------------------------------------------- /scalding/src/main/scala/sandcrawler/BibjsonScorable.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/scalding/src/main/scala/sandcrawler/BibjsonScorable.scala -------------------------------------------------------------------------------- /scalding/src/main/scala/sandcrawler/CdxBackfillJob.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/scalding/src/main/scala/sandcrawler/CdxBackfillJob.scala -------------------------------------------------------------------------------- /scalding/src/main/scala/sandcrawler/CrossrefScorable.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala -------------------------------------------------------------------------------- /scalding/src/main/scala/sandcrawler/DumpFileMetaJob.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/scalding/src/main/scala/sandcrawler/DumpFileMetaJob.scala -------------------------------------------------------------------------------- /scalding/src/main/scala/sandcrawler/DumpGrobidMetaInsertableJob.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/scalding/src/main/scala/sandcrawler/DumpGrobidMetaInsertableJob.scala -------------------------------------------------------------------------------- /scalding/src/main/scala/sandcrawler/DumpGrobidStatusCodeJob.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/scalding/src/main/scala/sandcrawler/DumpGrobidStatusCodeJob.scala -------------------------------------------------------------------------------- /scalding/src/main/scala/sandcrawler/DumpGrobidXmlJob.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/scalding/src/main/scala/sandcrawler/DumpGrobidXmlJob.scala -------------------------------------------------------------------------------- /scalding/src/main/scala/sandcrawler/DumpUnGrobidedJob.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/scalding/src/main/scala/sandcrawler/DumpUnGrobidedJob.scala -------------------------------------------------------------------------------- /scalding/src/main/scala/sandcrawler/FatcatScorable.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/scalding/src/main/scala/sandcrawler/FatcatScorable.scala -------------------------------------------------------------------------------- /scalding/src/main/scala/sandcrawler/GrobidScorable.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/scalding/src/main/scala/sandcrawler/GrobidScorable.scala -------------------------------------------------------------------------------- /scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala -------------------------------------------------------------------------------- /scalding/src/main/scala/sandcrawler/GroupFatcatWorksJob.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/scalding/src/main/scala/sandcrawler/GroupFatcatWorksJob.scala -------------------------------------------------------------------------------- /scalding/src/main/scala/sandcrawler/GroupFatcatWorksSubsetJob.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/scalding/src/main/scala/sandcrawler/GroupFatcatWorksSubsetJob.scala -------------------------------------------------------------------------------- /scalding/src/main/scala/sandcrawler/HBaseBuilder.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/scalding/src/main/scala/sandcrawler/HBaseBuilder.scala -------------------------------------------------------------------------------- /scalding/src/main/scala/sandcrawler/HBaseColCountJob.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/scalding/src/main/scala/sandcrawler/HBaseColCountJob.scala -------------------------------------------------------------------------------- /scalding/src/main/scala/sandcrawler/HBaseCountJob.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/scalding/src/main/scala/sandcrawler/HBaseCountJob.scala -------------------------------------------------------------------------------- /scalding/src/main/scala/sandcrawler/HBaseMimeCountJob.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/scalding/src/main/scala/sandcrawler/HBaseMimeCountJob.scala -------------------------------------------------------------------------------- /scalding/src/main/scala/sandcrawler/HBaseRowCountJob.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/scalding/src/main/scala/sandcrawler/HBaseRowCountJob.scala -------------------------------------------------------------------------------- /scalding/src/main/scala/sandcrawler/HBaseStatusCodeCountJob.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/scalding/src/main/scala/sandcrawler/HBaseStatusCodeCountJob.scala -------------------------------------------------------------------------------- /scalding/src/main/scala/sandcrawler/HBaseStatusCountJob.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/scalding/src/main/scala/sandcrawler/HBaseStatusCountJob.scala -------------------------------------------------------------------------------- /scalding/src/main/scala/sandcrawler/MatchBenchmarkJob.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/scalding/src/main/scala/sandcrawler/MatchBenchmarkJob.scala -------------------------------------------------------------------------------- /scalding/src/main/scala/sandcrawler/MissingColumnDumpJob.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/scalding/src/main/scala/sandcrawler/MissingColumnDumpJob.scala -------------------------------------------------------------------------------- /scalding/src/main/scala/sandcrawler/Scorable.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/scalding/src/main/scala/sandcrawler/Scorable.scala -------------------------------------------------------------------------------- /scalding/src/main/scala/sandcrawler/ScorableFeatures.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala -------------------------------------------------------------------------------- /scalding/src/main/scala/sandcrawler/ScoreInsertable.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/scalding/src/main/scala/sandcrawler/ScoreInsertable.scala -------------------------------------------------------------------------------- /scalding/src/main/scala/sandcrawler/ScoreJob.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/scalding/src/main/scala/sandcrawler/ScoreJob.scala -------------------------------------------------------------------------------- /scalding/src/main/scala/sandcrawler/StringUtilities.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/scalding/src/main/scala/sandcrawler/StringUtilities.scala -------------------------------------------------------------------------------- /scalding/src/test/scala/example/SimpleHBaseSourceExampleTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/scalding/src/test/scala/example/SimpleHBaseSourceExampleTest.scala -------------------------------------------------------------------------------- /scalding/src/test/scala/example/WordCountTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/scalding/src/test/scala/example/WordCountTest.scala -------------------------------------------------------------------------------- /scalding/src/test/scala/sandcrawler/CdxBackfillJob.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/scalding/src/test/scala/sandcrawler/CdxBackfillJob.scala -------------------------------------------------------------------------------- /scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala -------------------------------------------------------------------------------- /scalding/src/test/scala/sandcrawler/DumpUnGrobidedJobTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/scalding/src/test/scala/sandcrawler/DumpUnGrobidedJobTest.scala -------------------------------------------------------------------------------- /scalding/src/test/scala/sandcrawler/FatcatScorableTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/scalding/src/test/scala/sandcrawler/FatcatScorableTest.scala -------------------------------------------------------------------------------- /scalding/src/test/scala/sandcrawler/GrobidScorableDumpJobTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/scalding/src/test/scala/sandcrawler/GrobidScorableDumpJobTest.scala -------------------------------------------------------------------------------- /scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala -------------------------------------------------------------------------------- /scalding/src/test/scala/sandcrawler/HBaseBuilderTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/scalding/src/test/scala/sandcrawler/HBaseBuilderTest.scala -------------------------------------------------------------------------------- /scalding/src/test/scala/sandcrawler/HBaseMimeCountTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/scalding/src/test/scala/sandcrawler/HBaseMimeCountTest.scala -------------------------------------------------------------------------------- /scalding/src/test/scala/sandcrawler/HBaseRowCountTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/scalding/src/test/scala/sandcrawler/HBaseRowCountTest.scala -------------------------------------------------------------------------------- /scalding/src/test/scala/sandcrawler/HBaseStatusCodeCountTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/scalding/src/test/scala/sandcrawler/HBaseStatusCodeCountTest.scala -------------------------------------------------------------------------------- /scalding/src/test/scala/sandcrawler/HBaseStatusCountTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/scalding/src/test/scala/sandcrawler/HBaseStatusCountTest.scala -------------------------------------------------------------------------------- /scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala -------------------------------------------------------------------------------- /scalding/src/test/scala/sandcrawler/ScorableTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/scalding/src/test/scala/sandcrawler/ScorableTest.scala -------------------------------------------------------------------------------- /scalding/src/test/scala/sandcrawler/ScoreInsertableJobTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/scalding/src/test/scala/sandcrawler/ScoreInsertableJobTest.scala -------------------------------------------------------------------------------- /scalding/src/test/scala/sandcrawler/ScoreJobTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala -------------------------------------------------------------------------------- /scalding/src/test/scala/sandcrawler/StringUtilitiesTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/scalding/src/test/scala/sandcrawler/StringUtilitiesTest.scala -------------------------------------------------------------------------------- /sql/Makefile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/sql/Makefile -------------------------------------------------------------------------------- /sql/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/sql/README.md -------------------------------------------------------------------------------- /sql/backfill/backfill.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/sql/backfill/backfill.md -------------------------------------------------------------------------------- /sql/backfill/backfill_cdx.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/sql/backfill/backfill_cdx.py -------------------------------------------------------------------------------- /sql/backfill/backfill_file_meta.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/sql/backfill/backfill_file_meta.py -------------------------------------------------------------------------------- /sql/backfill/backfill_grobid.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/sql/backfill/backfill_grobid.py -------------------------------------------------------------------------------- /sql/backfill/backfill_grobid_unpaywall.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/sql/backfill/backfill_grobid_unpaywall.py -------------------------------------------------------------------------------- /sql/backfill/filter_transform_cdx.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/sql/backfill/filter_transform_cdx.py -------------------------------------------------------------------------------- /sql/backfill/petabox_transform.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/sql/backfill/petabox_transform.py -------------------------------------------------------------------------------- /sql/dump_file_meta.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/sql/dump_file_meta.sql -------------------------------------------------------------------------------- /sql/dump_regrobid_pdf.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/sql/dump_regrobid_pdf.sql -------------------------------------------------------------------------------- /sql/dump_regrobid_pdf_petabox.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/sql/dump_regrobid_pdf_petabox.sql -------------------------------------------------------------------------------- /sql/dump_reingest_bulk.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/sql/dump_reingest_bulk.sql -------------------------------------------------------------------------------- /sql/dump_reingest_old.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/sql/dump_reingest_old.sql -------------------------------------------------------------------------------- /sql/dump_reingest_quarterly.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/sql/dump_reingest_quarterly.sql -------------------------------------------------------------------------------- /sql/dump_reingest_spn.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/sql/dump_reingest_spn.sql -------------------------------------------------------------------------------- /sql/dump_reingest_terminalstatus.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/sql/dump_reingest_terminalstatus.sql -------------------------------------------------------------------------------- /sql/dump_reingest_weekly.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/sql/dump_reingest_weekly.sql -------------------------------------------------------------------------------- /sql/dump_unextracted_pdf.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/sql/dump_unextracted_pdf.sql -------------------------------------------------------------------------------- /sql/dump_unextracted_pdf_petabox.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/sql/dump_unextracted_pdf_petabox.sql -------------------------------------------------------------------------------- /sql/dump_ungrobid_pdf.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/sql/dump_ungrobid_pdf.sql -------------------------------------------------------------------------------- /sql/dump_ungrobid_pdf_petabox.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/sql/dump_ungrobid_pdf_petabox.sql -------------------------------------------------------------------------------- /sql/dump_unmatched_glutton_pdf.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/sql/dump_unmatched_glutton_pdf.sql -------------------------------------------------------------------------------- /sql/example.env: -------------------------------------------------------------------------------- 1 | DATABASE_URL="postgres://fatcat:tactaf@localhost/sandcrawler" 2 | -------------------------------------------------------------------------------- /sql/ingest_again.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/sql/ingest_again.md -------------------------------------------------------------------------------- /sql/ingest_stats/2020-11-16_weekly_ingest_doi_prefix.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/sql/ingest_stats/2020-11-16_weekly_ingest_doi_prefix.txt -------------------------------------------------------------------------------- /sql/ingest_stats/2020-11-16_weekly_ingest_terminal_domain.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/sql/ingest_stats/2020-11-16_weekly_ingest_terminal_domain.txt -------------------------------------------------------------------------------- /sql/migrations/00000000000000_diesel_initial_setup/down.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/sql/migrations/00000000000000_diesel_initial_setup/down.sql -------------------------------------------------------------------------------- /sql/migrations/00000000000000_diesel_initial_setup/up.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/sql/migrations/00000000000000_diesel_initial_setup/up.sql -------------------------------------------------------------------------------- /sql/migrations/2019-12-19-060141_init/down.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/sql/migrations/2019-12-19-060141_init/down.sql -------------------------------------------------------------------------------- /sql/migrations/2019-12-19-060141_init/up.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/sql/migrations/2019-12-19-060141_init/up.sql -------------------------------------------------------------------------------- /sql/monitoring_queries.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/sql/monitoring_queries.md -------------------------------------------------------------------------------- /sql/pdftrio_queries.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/sql/pdftrio_queries.md -------------------------------------------------------------------------------- /sql/random_queries.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/sql/random_queries.md -------------------------------------------------------------------------------- /sql/reingest_bulk.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/sql/reingest_bulk.sh -------------------------------------------------------------------------------- /sql/reingest_old.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/sql/reingest_old.sh -------------------------------------------------------------------------------- /sql/reingest_quarterly.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/sql/reingest_quarterly.sh -------------------------------------------------------------------------------- /sql/reingest_spn.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/sql/reingest_spn.sh -------------------------------------------------------------------------------- /sql/reingest_terminalstatus_forcerecrawl.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/sql/reingest_terminalstatus_forcerecrawl.sh -------------------------------------------------------------------------------- /sql/reingest_weekly.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/sql/reingest_weekly.sh -------------------------------------------------------------------------------- /sql/sandcrawler_schema.sql: -------------------------------------------------------------------------------- 1 | migrations/2019-12-19-060141_init/up.sql -------------------------------------------------------------------------------- /sql/stats/2020-01-13_stats.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/sql/stats/2020-01-13_stats.txt -------------------------------------------------------------------------------- /sql/stats/2020-01-31_supplement.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/sql/stats/2020-01-31_supplement.txt -------------------------------------------------------------------------------- /sql/stats/2020-02-24_stats.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/sql/stats/2020-02-24_stats.txt -------------------------------------------------------------------------------- /sql/stats/2020-05-03_stats.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/sql/stats/2020-05-03_stats.txt -------------------------------------------------------------------------------- /sql/stats/2020-07-23_stats.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/sql/stats/2020-07-23_stats.txt -------------------------------------------------------------------------------- /sql/stats/2020-09-14_stats.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/sql/stats/2020-09-14_stats.txt -------------------------------------------------------------------------------- /sql/stats/2021-04-07_stats.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/sql/stats/2021-04-07_stats.txt -------------------------------------------------------------------------------- /sql/stats/2021-04-08_table_sizes.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/sql/stats/2021-04-08_table_sizes.txt -------------------------------------------------------------------------------- /sql/stats/2021-04-12_ingest_domain_summary_30d.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/sql/stats/2021-04-12_ingest_domain_summary_30d.txt -------------------------------------------------------------------------------- /sql/stats/2021-11-01_table_sizes.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/sql/stats/2021-11-01_table_sizes.txt -------------------------------------------------------------------------------- /sql/stats/2021-11-26_stats.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/sql/stats/2021-11-26_stats.txt -------------------------------------------------------------------------------- /sql/stats/2021-12-02_table_sizes.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/sql/stats/2021-12-02_table_sizes.txt -------------------------------------------------------------------------------- /sql/stats/2022-04-26_stats.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/sql/stats/2022-04-26_stats.txt -------------------------------------------------------------------------------- /sql/stats/2022-04-27_crawl_changelog.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/sql/stats/2022-04-27_crawl_changelog.txt -------------------------------------------------------------------------------- /sql/stats/2022-05-11_crawl_changelog.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/sql/stats/2022-05-11_crawl_changelog.txt -------------------------------------------------------------------------------- /sql/stats/2022-09-06_stats.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/sql/stats/2022-09-06_stats.txt -------------------------------------------------------------------------------- /sql/stats/2022-11-23_table_sizes.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/sql/stats/2022-11-23_table_sizes.txt -------------------------------------------------------------------------------- /sql/stats/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/sql/stats/README.md -------------------------------------------------------------------------------- /sql/table_sizes.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/sql/table_sizes.md --------------------------------------------------------------------------------