├── scalding ├── ia_cluster.conf ├── project │ ├── build.properties │ ├── Dependencies.scala │ └── plugins.sbt ├── .gitignore ├── src │ ├── main │ │ └── scala │ │ │ ├── sandcrawler │ │ │ ├── HBaseMimeCountJob.scala │ │ │ ├── HBaseRowCountJob.scala │ │ │ ├── HBaseColCountJob.scala │ │ │ ├── HBaseStatusCountJob.scala │ │ │ ├── HBaseStatusCodeCountJob.scala │ │ │ ├── HBaseCountJob.scala │ │ │ ├── MatchBenchmarkJob.scala │ │ │ ├── DumpGrobidStatusCodeJob.scala │ │ │ ├── DumpFileMetaJob.scala │ │ │ ├── GroupFatcatWorksJob.scala │ │ │ ├── BibjsonScorable.scala │ │ │ ├── DumpGrobidXmlJob.scala │ │ │ ├── ScoreJob.scala │ │ │ ├── DumpGrobidMetaInsertableJob.scala │ │ │ ├── GroupFatcatWorksSubsetJob.scala │ │ │ ├── MissingColumnDumpJob.scala │ │ │ ├── DumpUnGrobidedJob.scala │ │ │ └── GrobidScorableDumpJob.scala │ │ │ └── example │ │ │ ├── WordCountJob.scala │ │ │ └── SimpleHBaseSourceExample.scala │ └── test │ │ └── scala │ │ ├── example │ │ ├── WordCountTest.scala │ │ └── SimpleHBaseSourceExampleTest.scala │ │ └── sandcrawler │ │ ├── HBaseBuilderTest.scala │ │ ├── HBaseRowCountTest.scala │ │ └── ScorableFeaturesTest.scala └── README.md ├── pig ├── .gitignore ├── pytest.ini ├── tests │ ├── pig.properties │ ├── files │ │ ├── example.sha1b32 │ │ ├── papers_url_doi.cdx │ │ ├── tarballs.cdx │ │ ├── sourcecode.cdx │ │ ├── papers_domain_words.cdx │ │ └── papers_edu_tilde.cdx │ ├── test_filter_cdx.py │ ├── log4j.properties │ ├── test_filter_software.py │ ├── test_filter_cdx_paper_pdfs.py │ └── test_join_cdx.py ├── Pipfile ├── hbase-count-rows.pig ├── filter-cdx-pdfs.pig ├── filter-cdx-ps.pig ├── README.md ├── filter-cdx-tarball.pig ├── filter-cdx-source-code-crude.pig ├── filter-cdx-join-urls.pig ├── join-cdx-sha1.pig └── filter-cdx-paper-pdfs.pig ├── notes ├── ingest │ ├── .gitignore │ ├── 2019-10-23_testing.md │ ├── 2020-05_pubmed.md │ ├── 2020-11-04_arxiv.md │ ├── 2020-09_scielo.md │ ├── es_csv_to_json.py │ ├── 2020-03-oa_but_not_marked.md │ ├── 2023-10_dimensions.md │ ├── 2020-01-14_bulk.md │ ├── 2022-03_oaipmh.md │ ├── NEXT.md │ ├── 2020-03_s2.md │ ├── 2020-02-18_ingest_backfills.md │ └── 2022-07-19_dblp.md ├── hbase_table_sizes.txt ├── library_shopping.txt ├── dryad_datasets.md ├── examples │ ├── random_datasets.md │ └── dataset_examples.txt ├── backfill_scalding_rewrite.txt ├── crawl_cdx_merge.md ├── possible_ingest_targets.txt ├── tasks │ ├── 2022-01-07_grobid_platform_pdfs.md │ ├── 2020-01-27_cleanup_cdx.md │ ├── 2020-01-06_heuristic_cdx.txt │ ├── 2021-09-09_pdf_url_lists.md │ └── 2020-08-20_file_meta.md ├── match_filter_enrich.txt ├── old_extract_results.txt └── petabox_ia_metadata.txt ├── sql ├── sandcrawler_schema.sql ├── example.env ├── dump_file_meta.sql ├── migrations │ ├── 2019-12-19-060141_init │ │ └── down.sql │ └── 00000000000000_diesel_initial_setup │ │ ├── down.sql │ │ └── up.sql ├── dump_regrobid_pdf.sql ├── dump_unmatched_glutton_pdf.sql ├── dump_unextracted_pdf_petabox.sql ├── dump_regrobid_pdf_petabox.sql ├── reingest_spn.sh ├── backfill │ ├── petabox_transform.py │ ├── backfill_file_meta.py │ └── backfill_grobid_unpaywall.py ├── dump_ungrobid_pdf_petabox.sql ├── dump_ungrobid_pdf.sql ├── reingest_old.sh ├── reingest_bulk.sh ├── reingest_terminalstatus_forcerecrawl.sh ├── dump_unextracted_pdf.sql ├── table_sizes.md ├── reingest_weekly.sh ├── reingest_quarterly.sh ├── stats │ ├── 2021-11-01_table_sizes.txt │ ├── 2021-12-02_table_sizes.txt │ ├── 2020-01-31_supplement.txt │ ├── 2022-11-23_table_sizes.txt │ └── 2021-04-08_table_sizes.txt ├── Makefile ├── dump_reingest_bulk.sql ├── 
dump_reingest_terminalstatus.sql ├── dump_reingest_spn.sql ├── dump_reingest_old.sql ├── dump_reingest_weekly.sql └── pdftrio_queries.md ├── python ├── title_slug_denylist.txt ├── .coveragerc ├── tests │ ├── files │ │ ├── dummy.pdf │ │ ├── dummy_zip.zip │ │ ├── scielo_article.jats.xml │ │ ├── genders_g58_fairlie.html │ │ └── small.json │ ├── test_html.py │ ├── test_html_ingest.py │ ├── test_ingest_html.py │ ├── test_xml.py │ ├── test_grobid2json.py │ ├── test_pushers.py │ └── test_pdfextract.py ├── pyproject.toml ├── .gitignore ├── sandcrawler │ ├── xml.py │ ├── __init__.py │ └── fileset_types.py ├── example.env ├── .pylintrc ├── .flake8 ├── Makefile ├── pytest.ini ├── scripts │ ├── pdf_thumbnail.py │ ├── enrich_scored_matches.py │ ├── ingestrequest_row2json.py │ ├── manifest_converter.py │ ├── grobid_affiliations.py │ └── covid2ingestrequest.py ├── Pipfile └── README.md ├── CONTRIBUTORS ├── extra ├── blobs │ ├── seaweedfs │ │ └── README.md │ ├── minio │ │ └── minio.conf │ └── tasks.md ├── docker │ ├── README.md │ └── docker-compose.yml ├── nginx │ ├── README.md │ ├── fatcat-blobs │ └── sandcrawler-minio ├── hbase │ └── howto.md └── RUNBOOK.md ├── kafka ├── monitoring_commands.md ├── howto_rebalance.md └── debugging_issues.txt ├── proposals ├── brainstorm │ ├── 2021-debug_web_interface.md │ └── 2022-04-18_automated_heritrix_crawling.md ├── 2021-09-21_spn_accounts.md ├── 2021-09-13_src_ingest.md ├── 20201012_no_capture.md └── schema_changes.sql ├── .gitignore ├── python_hadoop ├── mrjob.conf ├── tests │ ├── test_grobid2json.py │ └── files │ │ └── small.json └── Pipfile ├── match_test_data ├── NOTES.txt └── RESULTS.txt ├── Dockerfile.sandcrawler-pytest ├── fetch_hadoop.sh ├── .gitlab-ci.yml └── TODO /scalding/ia_cluster.conf: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pig/.gitignore: -------------------------------------------------------------------------------- 1 | deps 2 | *.log 3 | -------------------------------------------------------------------------------- /notes/ingest/.gitignore: -------------------------------------------------------------------------------- 1 | *.csv 2 | *.json 3 | -------------------------------------------------------------------------------- /pig/pytest.ini: -------------------------------------------------------------------------------- 1 | 2 | [pytest] 3 | norecursedirs = deps 4 | -------------------------------------------------------------------------------- /scalding/project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=1.1.1 2 | -------------------------------------------------------------------------------- /sql/sandcrawler_schema.sql: -------------------------------------------------------------------------------- 1 | migrations/2019-12-19-060141_init/up.sql -------------------------------------------------------------------------------- /scalding/.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | project/project/ 3 | project/targer/ 4 | -------------------------------------------------------------------------------- /python/title_slug_denylist.txt: -------------------------------------------------------------------------------- 1 | ../scalding/src/main/resources/slug-denylist.txt -------------------------------------------------------------------------------- /python/.coveragerc: 
-------------------------------------------------------------------------------- 1 | [run] 2 | omit = tests/* 3 | source = 4 | sandcrawler 5 | -------------------------------------------------------------------------------- /sql/example.env: -------------------------------------------------------------------------------- 1 | DATABASE_URL="postgres://fatcat:tactaf@localhost/sandcrawler" 2 | -------------------------------------------------------------------------------- /pig/tests/pig.properties: -------------------------------------------------------------------------------- 1 | log4jconf=./tests/log4j.properties 2 | stop.on.failure=true 3 | -------------------------------------------------------------------------------- /python/tests/files/dummy.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/tests/files/dummy.pdf -------------------------------------------------------------------------------- /python/tests/files/dummy_zip.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/tests/files/dummy_zip.zip -------------------------------------------------------------------------------- /python/tests/files/scielo_article.jats.xml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/tests/files/scielo_article.jats.xml -------------------------------------------------------------------------------- /python/tests/files/genders_g58_fairlie.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/tests/files/genders_g58_fairlie.html -------------------------------------------------------------------------------- /scalding/project/Dependencies.scala: -------------------------------------------------------------------------------- 1 | import sbt._ 2 | 3 | object Dependencies { 4 | lazy val scalaTest = "org.scalatest" %% "scalatest" % "3.0.5" 5 | } 6 | -------------------------------------------------------------------------------- /CONTRIBUTORS: -------------------------------------------------------------------------------- 1 | Bryan Newbold 2 | 3 | Ellen Spertus transfers copyright of all of her contributions to the 4 | repository in exchange for one Internet Archive Sticker, received. 
-------------------------------------------------------------------------------- /pig/tests/files/example.sha1b32: -------------------------------------------------------------------------------- 1 | EJWYVOPONJRARK7SGG6COFRN7CSTHROY 2 | V32E3CCO7NMI2M4OHLKG73DXD72LR4B2 3 | 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ 4 | E3WSNQ7JAFOW7N3ZJ6GLV27T52T25JDK 5 | -------------------------------------------------------------------------------- /python/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools", "wheel"] 3 | build-backend = "setuptools.build_meta:__legacy__" 4 | 5 | [tool.isort] 6 | profile = "black" 7 | line_length = 96 8 | -------------------------------------------------------------------------------- /python/.gitignore: -------------------------------------------------------------------------------- 1 | *part-000* 2 | *.tar.gz 3 | *.gz 4 | htmlcov/ 5 | samples/ 6 | *.json 7 | TODO* 8 | *.tsv 9 | 10 | !.flake8 11 | !.gitlab-ci.yml 12 | !.pylintrc 13 | !.coveragerc 14 | !.gitignore 15 | -------------------------------------------------------------------------------- /python/tests/test_html.py: -------------------------------------------------------------------------------- 1 | from sandcrawler.html import extract_fulltext_url 2 | 3 | 4 | def test_extract_fulltext_url(): 5 | 6 | resp = extract_fulltext_url("asdf", b"asdf") 7 | assert resp == {} 8 | -------------------------------------------------------------------------------- /scalding/src/main/scala/sandcrawler/HBaseMimeCountJob.scala: -------------------------------------------------------------------------------- 1 | package sandcrawler 2 | 3 | import com.twitter.scalding.Args 4 | 5 | class HBaseMimeCountJob(args: Args) extends HBaseCountJob(args, "file:mime") {} 6 | 7 | -------------------------------------------------------------------------------- /extra/blobs/seaweedfs/README.md: -------------------------------------------------------------------------------- 1 | 2 | ## HOWTO: Create new bucket in SeaweedFS 3 | 4 | Log in to the seaweedfs VM. 
5 | 6 | Run `weed shell` to start a shell, then: 7 | 8 | bucket.create -name 9 | 10 | -------------------------------------------------------------------------------- /scalding/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.6") 2 | addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.9.0") 3 | addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "1.0.0") 4 | -------------------------------------------------------------------------------- /python/sandcrawler/xml.py: -------------------------------------------------------------------------------- 1 | import xml.etree.ElementTree as ET 2 | 3 | 4 | def xml_reserialize(raw: bytes) -> str: 5 | root = ET.fromstring(raw) 6 | return '\n' + ET.tostring(root, encoding="unicode") 7 | -------------------------------------------------------------------------------- /notes/hbase_table_sizes.txt: -------------------------------------------------------------------------------- 1 | 2 | As of 2018-05-29: 3 | - qa rows: 1,246,013 4 | - prod rows: 8,974,188 5 | 6 | As of 2018-06-16: 7 | - qa: 1,246,013 8 | - prod: 18,308,086 9 | 10 | As of 2018-08-01: 11 | - qa: 1,246,013 12 | - prod: 18,308,141 13 | -------------------------------------------------------------------------------- /pig/Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | 3 | url = "https://pypi.python.org/simple" 4 | verify_ssl = true 5 | name = "pypi" 6 | 7 | 8 | [dev-packages] 9 | 10 | 11 | 12 | [packages] 13 | 14 | pytest = "*" 15 | 16 | 17 | [requires] 18 | 19 | python_version = "3.5" 20 | -------------------------------------------------------------------------------- /pig/tests/test_filter_cdx.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | import unittest 4 | from pighelper import PigTestHelper 5 | 6 | class TestFilterCDX(PigTestHelper): 7 | 8 | def test_thing(self): 9 | self.run_pig("filter-cdx-ps.pig", "tests/files/example.cdx") 10 | -------------------------------------------------------------------------------- /notes/ingest/2019-10-23_testing.md: -------------------------------------------------------------------------------- 1 | 2 | exported not-archived DOIs for elife, as well as general list. 
3 | 4 | wc -l recent\ missing\ oa\ releases.csv 5 | 161828 recent missing oa releases.csv 6 | 7 | wc -l missing\ elife\ DOIs.csv 8 | 1779 missing elife DOIs.csv 9 | -------------------------------------------------------------------------------- /python/example.env: -------------------------------------------------------------------------------- 1 | SANDCRAWLER_BLOB_ACCESS_KEY="minioadmin" 2 | SANDCRAWLER_BLOB_SECRET_KEY="minioadmin" 3 | IA_ACCESS_KEY="dummy" 4 | IA_SECRET_KEY="dummy" 5 | CDX_AUTH_TOKEN="dummy" 6 | PETABOX_WEBDATA_SECRET="dummy" 7 | SENTRY_DSN="" 8 | SANDCRAWLER_WORKING_DIR="/tmp/sandcrawler/" 9 | -------------------------------------------------------------------------------- /sql/dump_file_meta.sql: -------------------------------------------------------------------------------- 1 | 2 | BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; 3 | 4 | COPY ( 5 | SELECT sha1hex, row_to_json(file_meta) 6 | FROM file_meta 7 | ORDER BY sha1hex ASC 8 | ) 9 | TO '/srv/sandcrawler/tasks/file_meta_dump.tsv' 10 | WITH NULL ''; 11 | 12 | ROLLBACK; 13 | -------------------------------------------------------------------------------- /sql/migrations/2019-12-19-060141_init/down.sql: -------------------------------------------------------------------------------- 1 | 2 | DROP TABLE IF NOT EXISTS cdx; 3 | DROP TABLE IF NOT EXISTS file_meta; 4 | DROP TABLE IF NOT EXISTS fatcat_file; 5 | DROP TABLE IF NOT EXISTS petabox; 6 | DROP TABLE IF NOT EXISTS grobid; 7 | DROP TABLE IF NOT EXISTS ingest_request; 8 | DROP TABLE IF NOT EXISTS shadow; 9 | -------------------------------------------------------------------------------- /python/tests/test_html_ingest.py: -------------------------------------------------------------------------------- 1 | from sandcrawler.ingest_html import * 2 | 3 | 4 | def test_html_extract_ojs3() -> None: 5 | 6 | with open("tests/files/first_monday_ojs3_fulltext.html", "rb") as f: 7 | ojs3_html = f.read() 8 | 9 | fulltext = html_extract_body_teixml(ojs3_html) 10 | assert fulltext["status"] == "success" 11 | -------------------------------------------------------------------------------- /kafka/monitoring_commands.md: -------------------------------------------------------------------------------- 1 | 2 | kafkacat -C -b wbgrp-svc284.us.archive.org:9092 -t sandcrawler-prod.ingest-file-results -o end | jq '[.status, .base_url]' -c 3 | 4 | kafkacat -C -b wbgrp-svc284.us.archive.org:9092 -t sandcrawler-prod.ingest-file-results -o end | jq '[.request.ingest_request_source, .status, .request.base_url, .terminal.terminal_url]' -c 5 | -------------------------------------------------------------------------------- /sql/migrations/00000000000000_diesel_initial_setup/down.sql: -------------------------------------------------------------------------------- 1 | -- This file was automatically created by Diesel to setup helper functions 2 | -- and other internal bookkeeping. This file is safe to edit, any future 3 | -- changes will be added to existing projects as new migrations. 
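-- diesel_manage_updated_at(_tbl) and diesel_set_updated_at() are the standard
-- `updated_at`-trigger helpers created by the matching up.sql, so dropping them
-- here fully reverses the initial setup.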
4 | 5 | DROP FUNCTION IF EXISTS diesel_manage_updated_at(_tbl regclass); 6 | DROP FUNCTION IF EXISTS diesel_set_updated_at(); 7 | -------------------------------------------------------------------------------- /notes/library_shopping.txt: -------------------------------------------------------------------------------- 1 | 2 | potential helpers: 3 | - https://github.com/martinblech/xmltodict 4 | - https://github.com/trananhkma/fucking-awesome-python#text-processing 5 | - https://github.com/blaze/blaze (for catalog/analytics) 6 | - validation: https://github.com/pyeve/cerberus 7 | - testing (to replace nose): 8 | - https://github.com/CleanCut/green 9 | - pytest 10 | - mamba ("behavior driven") 11 | -------------------------------------------------------------------------------- /proposals/brainstorm/2021-debug_web_interface.md: -------------------------------------------------------------------------------- 1 | 2 | status: brainstorm idea 3 | 4 | Simple internal-only web interface to help debug ingest issues. 5 | 6 | - paste a hash, URL, or identifier and get a display of "everything we know" about it 7 | - enter a URL/SURT prefix and get aggregate stats (?) 8 | - enter a domain/host/prefix and get recent attempts/results 9 | - pre-computed periodic reports on ingest pipeline (?) 10 | -------------------------------------------------------------------------------- /python/tests/test_ingest_html.py: -------------------------------------------------------------------------------- 1 | from sandcrawler.ingest_html import html_guess_platform 2 | 3 | from selectolax.parser import HTMLParser 4 | 5 | def test_html_guess_platform_no_icon_href() -> None: 6 | with open("tests/files/plos_one_article_no_icon_href.html", "r") as f: 7 | plos_html = f.read() 8 | parsed = HTMLParser(plos_html) 9 | result = html_guess_platform("", parsed) 10 | assert result == None 11 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *venv.zip 2 | mapreduce-*.tar.gz 3 | *,cover 4 | htmlcov/ 5 | python/venv-current.tar.gz 6 | *.test 7 | 8 | *.o 9 | *.a 10 | *.pyc 11 | #*# 12 | *~ 13 | *.swp 14 | .* 15 | *.tmp 16 | *.old 17 | *.profile 18 | *.bkp 19 | *.bak 20 | [Tt]humbs.db 21 | *.DS_Store 22 | build/ 23 | _build/ 24 | src/build/ 25 | *.log 26 | 27 | !.coveragerc 28 | !.gitlab-ci.yml 29 | !.pylintrc 30 | 31 | # Don't ignore this file itself 32 | !.gitignore 33 | -------------------------------------------------------------------------------- /scalding/src/main/scala/example/WordCountJob.scala: -------------------------------------------------------------------------------- 1 | package example 2 | 3 | import com.twitter.scalding._ 4 | 5 | class WordCountJob(args: Args) extends Job(args) { 6 | TypedPipe.from(TextLine(args("input"))) 7 | .flatMap { line => line.split("\\s+") } 8 | .map { word => (word, 1L) } 9 | .sumByKey 10 | // The compiler will enforce the type coming out of the sumByKey is the same as the type we have for our sink 11 | .write(TypedTsv[(String, Long)](args("output"))) 12 | } 13 | -------------------------------------------------------------------------------- /pig/tests/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=WARN, stdout 2 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 3 | log4j.appender.stdout.Target=System.out 4 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 5 | 
log4j.appender.stdout.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n 6 | 7 | # With these enabled, get "log4j:ERROR Attempted to append to closed appender named [stdout]" 8 | #log4j.logger.org.apache.pig=WARN, stdout 9 | #log4j.logger.org.apache.hadoop = WARN, stdout 10 | -------------------------------------------------------------------------------- /notes/ingest/2020-05_pubmed.md: -------------------------------------------------------------------------------- 1 | 2 | From ARXIV-PUBMEDCENTRAL-CRAWL-2020-04, on fatcat-prod1. 3 | 4 | Test small batch: 5 | 6 | zcat ingest_file_pmcid_20200424.json.gz | head -n200 | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 7 | 8 | Run the whole batch: 9 | 10 | zcat ingest_file_pmcid_20200424.json.gz | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 11 | -------------------------------------------------------------------------------- /python/tests/test_xml.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from sandcrawler.xml import xml_reserialize 4 | 5 | 6 | def test_xml_reserialize() -> None: 7 | 8 | with open("tests/files/scielo_article.jats.xml", "rb") as f: 9 | raw_xml = f.read() 10 | 11 | assert b'encoding="ISO-8859-1"' in raw_xml 12 | raw_xml.decode("ISO-8859-1") 13 | with pytest.raises(UnicodeDecodeError): 14 | raw_xml.decode("utf-8") 15 | 16 | str_xml = xml_reserialize(raw_xml) 17 | assert 'encoding="UTF-8"' in str_xml 18 | -------------------------------------------------------------------------------- /pig/tests/files/papers_url_doi.cdx: -------------------------------------------------------------------------------- 1 | #http://journals.ametsoc.org/doi/pdf/10.1175/2008BAMS2370.1 2 | #http://www.nejm.org:80/doi/pdf/10.1056/NEJMoa1013607 3 | 4 | # should match 2: 5 | 6 | org,ametsoc,journals)/doi/pdf/10.1175/2008BAMS2370.1 20170706005950 http://mit.edu/file.pdf application/pdf 200 4QHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz 7 | org,nejm,www)/doi/pdf/10.1056/NEJMoa1013607 20170706005950 http://mit.edu/file.pdf application/pdf 200 3QHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz 8 | -------------------------------------------------------------------------------- /pig/tests/test_filter_software.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | import unittest 4 | from pighelper import PigTestHelper, count_lines 5 | 6 | 7 | class TestFilterCDXSoftware(PigTestHelper): 8 | 9 | def test_tarballs(self): 10 | r = self.run_pig("filter-cdx-tarball.pig", "tests/files/tarballs.cdx") 11 | assert count_lines(r) == 2 12 | 13 | def test_source_code(self): 14 | r = self.run_pig("filter-cdx-source-code-crude.pig", "tests/files/sourcecode.cdx") 15 | assert count_lines(r) == 1 16 | 17 | -------------------------------------------------------------------------------- /sql/dump_regrobid_pdf.sql: -------------------------------------------------------------------------------- 1 | 2 | -- Run like: 3 | -- psql sandcrawler < dump_regrobid_pdf.sql | sort -S 4G | uniq -w 40 | cut -f2 > dump_regrobid_pdf.2019-11-12.json 4 | 5 | BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; 6 | 7 | COPY ( 8 | SELECT cdx.sha1hex, row_to_json(cdx) FROM cdx 9 | WHERE cdx.mimetype = 'application/pdf' 10 | AND EXISTS (SELECT grobid.sha1hex FROM grobid WHERE cdx.sha1hex = 
grobid.sha1hex AND grobid.grobid_version IS NULL) 11 | ) 12 | TO STDOUT 13 | WITH NULL ''; 14 | 15 | ROLLBACK; 16 | -------------------------------------------------------------------------------- /python_hadoop/mrjob.conf: -------------------------------------------------------------------------------- 1 | runners: 2 | local: 3 | upload_files: 4 | - common.py 5 | - grobid2json.py 6 | setup: 7 | - export PYTHONPATH=$PYTHONPATH:venv/lib/python3.5/site-packages/ 8 | hadoop: 9 | no_output: true 10 | upload_files: 11 | - common.py 12 | - grobid2json.py 13 | setup: 14 | - export PYTHONPATH=$PYTHONPATH:venv/lib/python3.5/site-packages/ 15 | cmdenv: 16 | SENTRY_DSN: https://6ab6ad080d034280b863f294e07cc5c6:414ebf0b68634f669d2dc00d7c935699@books-sentry.us.archive.org/9 17 | -------------------------------------------------------------------------------- /extra/docker/README.md: -------------------------------------------------------------------------------- 1 | 2 | The docker-compose script in this directory may be helpful for local 3 | development. It starts Kafka, postgrest, and zookeeper. 4 | 5 | PostgreSQL is assumed to be running natively on localhost, not under docker. It 6 | should be possible to add postgresql to the docker-compose file, but some 7 | developers (bnewbold) prefer to run it separately to make things like attaching 8 | with `psql` easier. 9 | 10 | There is no current motivation or plan to deploy sandcrawler services using 11 | docker, so there is no Dockerfile for the system itself. 12 | -------------------------------------------------------------------------------- /sql/dump_unmatched_glutton_pdf.sql: -------------------------------------------------------------------------------- 1 | 2 | -- Run like: 3 | -- psql sandcrawler < THING.sql > THING.2019-09-23.json 4 | 5 | BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; 6 | 7 | COPY ( 8 | SELECT row_to_json(grobid) 9 | FROM grobid 10 | LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex 11 | WHERE fatcat_file.sha1hex IS NULL 12 | AND grobid.fatcat_release IS NOT NULL 13 | LIMIT 1000 14 | ) 15 | TO '/srv/sandcrawler/tasks/dump_unmatched_glutton_pdf.2020-06-30.json'; 16 | --TO STDOUT 17 | --WITH NULL ''; 18 | 19 | ROLLBACK; 20 | -------------------------------------------------------------------------------- /extra/nginx/README.md: -------------------------------------------------------------------------------- 1 | 2 | This folder contains nginx configs for partner access to sandcrawler DB 3 | (postgrest) and GROBID XML blobs (minio). 4 | 5 | `fatcat-blobs` is part of the fatcat.wiki ansible config, but included here to 6 | show how it works. 7 | 8 | ## Let's Encrypt 9 | 10 | As... bnewbold? 11 | 12 | sudo certbot certonly \ 13 | --non-interactive \ 14 | --agree-tos \ 15 | --email bnewbold@archive.org \ 16 | --webroot -w /var/www/letsencrypt \ 17 | -d sandcrawler-minio.fatcat.wiki \ 18 | -d sandcrawler-db.fatcat.wiki 19 | -------------------------------------------------------------------------------- /notes/ingest/2020-11-04_arxiv.md: -------------------------------------------------------------------------------- 1 | 2 | Ran a bulk dump using fatcat ingest tool several months ago, and had Martin run 3 | a crawl. 4 | 5 | Crawl is now done, so going to ingest, hoping to get the majority of the 6 | millions of remaining arxiv.org PDFs. 
7 | 8 | zcat /grande/snapshots/fatcat_missing_arxiv_ingest_request.2020-08-21.json.gz | wc -l 9 | => 1,288,559 10 | 11 | zcat /grande/snapshots/fatcat_missing_arxiv_ingest_request.2020-08-21.json.gz | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 12 | 13 | -------------------------------------------------------------------------------- /sql/dump_unextracted_pdf_petabox.sql: -------------------------------------------------------------------------------- 1 | 2 | -- Run like: 3 | -- psql sandcrawler < dump_unextracted_pdf_petabox.sql 4 | 5 | BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; 6 | 7 | COPY ( 8 | SELECT DISTINCT ON (petabox.sha1hex) row_to_json(petabox) 9 | FROM grobid 10 | LEFT JOIN petabox ON grobid.sha1hex = petabox.sha1hex 11 | LEFT JOIN pdf_meta ON grobid.sha1hex = pdf_meta.sha1hex 12 | WHERE petabox.sha1hex IS NOT NULL 13 | AND pdf_meta.sha1hex IS NULL 14 | ) 15 | TO '/srv/sandcrawler/tasks/dump_unextracted_pdf_petabox.2020-07-22.json' 16 | WITH NULL ''; 17 | 18 | ROLLBACK; 19 | -------------------------------------------------------------------------------- /python/.pylintrc: -------------------------------------------------------------------------------- 1 | [MESSAGES CONTROL] 2 | # TODO: should re-enable some of these 3 | disable=C0323,W0142,C0301,C0103,C0111,E0213,C0302,C0203,W0703,R0201,W0223,bad-continuation,arguments-differ,unidiomatic-typecheck,unused-wildcard-import,no-member,cyclic-import,too-few-public-methods,wildcard-import,too-many-locals,too-many-ancestors,unused-import 4 | 5 | [REPORTS] 6 | output-format=colorized 7 | include-ids=yes 8 | 9 | [MISCELLANEOUS] 10 | # List of note tags to take in consideration, separated by a comma. 11 | notes=FIXME,XXX,DELETEME 12 | 13 | [TYPECHECK] 14 | extension-pkg-whitelist=selectolax,pydantic,responses 15 | -------------------------------------------------------------------------------- /python_hadoop/tests/test_grobid2json.py: -------------------------------------------------------------------------------- 1 | 2 | import xml 3 | import json 4 | import pytest 5 | from grobid2json import * 6 | 7 | 8 | def test_small_xml(): 9 | 10 | with open('tests/files/small.xml', 'r') as f: 11 | tei_xml = f.read() 12 | with open('tests/files/small.json', 'r') as f: 13 | json_form = json.loads(f.read()) 14 | 15 | assert teixml2json(tei_xml) == json_form 16 | 17 | def test_invalid_xml(): 18 | 19 | with pytest.raises(xml.etree.ElementTree.ParseError): 20 | teixml2json("this is not XML") 21 | with pytest.raises(ValueError): 22 | teixml2json("") 23 | -------------------------------------------------------------------------------- /extra/blobs/minio/minio.conf: -------------------------------------------------------------------------------- 1 | 2 | # Volume to be used for MinIO server. 3 | MINIO_VOLUMES="/sandcrawler-minio/data" 4 | # Use if you want to run MinIO on a custom port. 5 | MINIO_OPTS="--address :9000" 6 | # Access Key of the server. 7 | MINIO_ACCESS_KEY=REDACTED 8 | # Secret key of the server. 
9 | MINIO_SECRET_KEY=REDACTED 10 | 11 | # may need to set these manually using `mc admin config get`, edit the JSON, then `set` 12 | MINIO_COMPRESS="on" 13 | MINIO_COMPRESS_EXTENSIONS=".txt,.log,.csv,.json,.tar,.xml,.bin,.pdf,.tsv" 14 | MINIO_COMPRESS_MIME_TYPES="text/*,application/json,application/xml,application/pdf,application/octet-stream" 15 | -------------------------------------------------------------------------------- /sql/dump_regrobid_pdf_petabox.sql: -------------------------------------------------------------------------------- 1 | 2 | -- Run like: 3 | -- psql sandcrawler < dump_regrobid_pdf_petabox.sql 4 | -- cat dump_regrobid_pdf_petabox.2020-02-03.json | sort -S 4G | uniq -w 40 | cut -f2 > dump_regrobid_pdf_petabox.2020-02-03.uniq.json 5 | 6 | BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; 7 | 8 | COPY ( 9 | SELECT petabox.sha1hex, row_to_json(petabox) FROM petabox 10 | WHERE EXISTS (SELECT grobid.sha1hex FROM grobid WHERE petabox.sha1hex = grobid.sha1hex AND grobid.grobid_version IS NULL) 11 | ) 12 | TO '/srv/sandcrawler/tasks/dump_regrobid_pdf_petabox.2020-02-03.json' 13 | WITH NULL ''; 14 | 15 | ROLLBACK; 16 | -------------------------------------------------------------------------------- /sql/reingest_spn.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e # fail on error 4 | set -u # fail if variable not set in substitution 5 | set -o pipefail # fail if part of a '|' command fails 6 | 7 | sudo -u postgres psql sandcrawler < dump_reingest_spn.sql 8 | 9 | cd ../python 10 | sudo -u sandcrawler pipenv run \ 11 | ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_spn.rows.json \ 12 | > /srv/sandcrawler/tasks/reingest_spn.json 13 | 14 | cat /srv/sandcrawler/tasks/reingest_spn.json \ 15 | | shuf \ 16 | | head -n60000 \ 17 | | jq . 
-c \ 18 | | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1 19 | 20 | -------------------------------------------------------------------------------- /python_hadoop/Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | name = "ia" 3 | url = "https://devpi.archive.org/wb/prod" 4 | verify_ssl = true 5 | 6 | [[source]] 7 | name = "pypi" 8 | url = "https://pypi.python.org/simple" 9 | verify_ssl = true 10 | 11 | [dev-packages] 12 | ipython = "*" 13 | happybase-mock = "*" 14 | pytest = "*" 15 | pytest-pythonpath = "*" 16 | responses = "*" 17 | pytest-cov = "*" 18 | pylint = "*" 19 | 20 | [packages] 21 | globalwayback = {version=">=0.3", index="ia"} 22 | happybase = "*" 23 | mrjob = "*" 24 | requests = "*" 25 | wayback = {version=">=0.2.1.2", index="ia"} 26 | xmltodict = "*" 27 | raven = "*" 28 | pykafka = "*" 29 | python-snappy = "*" 30 | boto3 = "*" 31 | 32 | [requires] 33 | python_version = "3.5" 34 | -------------------------------------------------------------------------------- /match_test_data/NOTES.txt: -------------------------------------------------------------------------------- 1 | 2 | Converted older .tsv from pdf-extraction comparison work with: 3 | 4 | cat 1k_random_identified_combined.tsv | jq -c --slurp --raw-input --raw-output 'split("\n") | .[:-1] | map(split("\t")) | map({"doi": .[0], "title": .[1], "authors": ( .[2] | split(";") ), "year": .[3], "journal": .[4], "publisher": .[5], "subject": .[6], "type": .[7], "sha": .[8]}) | .[]' > crossref_sample.bibjson 5 | 6 | Note that neither bibjson file is a superset of the either: 7 | 8 | 944 unique SHA1 which exist in both lists 9 | 964 in crossref_sample.sha1 10 | 979 in grobid_sample.sha1 11 | 12 | So scoring should be on a basis of "out of 944 lines". If this is confusing we 13 | can trim the files down. 
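The overlap numbers above can be recomputed with a short pipeline; a minimal sketch, assuming both bibjson files expose the hash as a top-level .sha field (true for crossref_sample.bibjson per the jq conversion above, assumed for grobid_sample.bibjson):

    jq -r .sha crossref_sample.bibjson | sort -u > crossref_sample.sha1
    jq -r .sha grobid_sample.bibjson | sort -u > grobid_sample.sha1
    wc -l crossref_sample.sha1 grobid_sample.sha1               # unique SHA1 per list
    comm -12 crossref_sample.sha1 grobid_sample.sha1 | wc -l    # SHA1 present in both lists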
14 | -------------------------------------------------------------------------------- /sql/backfill/petabox_transform.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import json, sys, os 4 | 5 | for l in sys.stdin.readlines(): 6 | l = l.strip() 7 | if not l: 8 | continue 9 | r = json.loads(l) 10 | if not r['sha1']: 11 | continue 12 | sha1hex = r['sha1'] 13 | for url in r['urls']: 14 | u = url['url'] 15 | if not '//archive.org/' in u: 16 | continue 17 | u = u.split('/') 18 | if u[2] == 'web.archive.org': 19 | continue 20 | #print(u) 21 | assert u[2] == 'archive.org' and u[3] in ('download', 'serve') 22 | item = u[4] 23 | path = '/'.join(u[5:]) 24 | print("\t".join([item, path, sha1hex])) 25 | -------------------------------------------------------------------------------- /sql/dump_ungrobid_pdf_petabox.sql: -------------------------------------------------------------------------------- 1 | 2 | -- Run like: 3 | -- psql sandcrawler < dump_ungrobid_pdf_petabox.sql 4 | 5 | BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; 6 | 7 | COPY ( 8 | SELECT DISTINCT ON (petabox.sha1hex) row_to_json(petabox) 9 | FROM petabox 10 | WHERE NOT EXISTS (SELECT grobid.sha1hex FROM grobid WHERE petabox.sha1hex = grobid.sha1hex AND grobid.status IS NOT NULL) 11 | -- uncomment/comment this to control whether only fatcat files are included 12 | AND EXISTS (SELECT fatcat_file.sha1hex FROM fatcat_file WHERE petabox.sha1hex = fatcat_file.sha1hex) 13 | ) 14 | TO '/srv/sandcrawler/tasks/dump_ungrobided_pdf_petabox.2020-08-04.json' 15 | WITH NULL ''; 16 | 17 | ROLLBACK; 18 | -------------------------------------------------------------------------------- /sql/dump_ungrobid_pdf.sql: -------------------------------------------------------------------------------- 1 | 2 | -- Run like: 3 | -- psql sandcrawler < dump_ungrobid_pdf.sql 4 | 5 | BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; 6 | 7 | COPY ( 8 | SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx) 9 | FROM cdx 10 | WHERE cdx.mimetype = 'application/pdf' 11 | AND NOT EXISTS (SELECT grobid.sha1hex FROM grobid WHERE cdx.sha1hex = grobid.sha1hex AND grobid.status IS NOT NULL) 12 | -- uncomment/comment this to control whether only fatcat files are included 13 | --AND EXISTS (SELECT fatcat_file.sha1hex FROM fatcat_file WHERE cdx.sha1hex = fatcat_file.sha1hex) 14 | ) 15 | TO '/srv/sandcrawler/tasks/dump_ungrobided_pdf.fatcat.2020-08-04.json' 16 | WITH NULL ''; 17 | 18 | ROLLBACK; 19 | -------------------------------------------------------------------------------- /sql/reingest_old.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e # fail on error 4 | set -u # fail if variable not set in substitution 5 | set -o pipefail # fail if part of a '|' command fails 6 | 7 | sudo -u postgres psql sandcrawler < dump_reingest_old.sql 8 | 9 | cd ../python 10 | sudo -u sandcrawler pipenv run \ 11 | ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_old_current.rows.json \ 12 | > /srv/sandcrawler/tasks/reingest_old_current.json 13 | 14 | cat /srv/sandcrawler/tasks/reingest_old_current.json \ 15 | | shuf \ 16 | | head -n250000 \ 17 | | jq . 
-c \ 18 | | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1 19 | 20 | -------------------------------------------------------------------------------- /pig/hbase-count-rows.pig: -------------------------------------------------------------------------------- 1 | 2 | REGISTER /usr/lib/hbase/lib/hbase-client-0.98.6-cdh5.3.1.jar 3 | REGISTER /usr/lib/hbase/lib/hbase-common-0.98.6-cdh5.3.1.jar 4 | REGISTER /usr/lib/hbase/lib/hbase-hadoop2-compat-0.98.6-cdh5.3.1.jar 5 | REGISTER /usr/lib/hbase/lib/hbase-protocol-0.98.6-cdh5.3.1.jar 6 | 7 | set hbase.zookeeper.quorum 'mtrcs-zk1.us.archive.org,mtrcs-zk2.us.archive.org,mtrcs-zk3.us.archive.org' 8 | 9 | data = LOAD 'hbase://wbgrp-journal-extract-0-qa' 10 | USING org.apache.pig.backend.hadoop.hbase.HBaseStorage('grobid0:status_code', '-loadKey true') 11 | AS (key:CHARARRAY, status:CHARARRAY); 12 | 13 | data_group = GROUP data ALL; 14 | data_count = FOREACH data_group GENERATE COUNT(data); 15 | DUMP data_count; 16 | -------------------------------------------------------------------------------- /proposals/2021-09-21_spn_accounts.md: -------------------------------------------------------------------------------- 1 | 2 | Formalization of SPNv2 API requests from fatcat/sandcrawler 3 | 4 | Create two new system accounts, one for regular/daily ingest requests, one for 5 | priority requests (save-paper-now or as a flag with things like fatcat-ingest; 6 | "interactive"). These accounts should have @archive.org emails. Request the 7 | daily one to have the current rate limit as bnewbold@archive.org account; the 8 | priority queue can have less. 9 | 10 | Create new ingest kafka queues from scratch, one for priority and one for 11 | regular. Chose sizes carefully, probably keep 24x for the regular and do 6x or 12 | so (small) for priority queue. 13 | 14 | Deploy new priority workers; reconfigure/deploy broadly. 15 | -------------------------------------------------------------------------------- /sql/reingest_bulk.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e # fail on error 4 | set -u # fail if variable not set in substitution 5 | set -o pipefail # fail if part of a '|' command fails 6 | 7 | sudo -u postgres psql sandcrawler < dump_reingest_bulk.sql 8 | 9 | cd ../python 10 | sudo -u sandcrawler pipenv run \ 11 | ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_bulk_current.rows.json \ 12 | > /srv/sandcrawler/tasks/reingest_bulk_current.json 13 | 14 | cat /srv/sandcrawler/tasks/reingest_bulk_current.json \ 15 | | shuf \ 16 | | head -n1000000 \ 17 | | jq . -c \ 18 | | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 19 | 20 | -------------------------------------------------------------------------------- /notes/dryad_datasets.md: -------------------------------------------------------------------------------- 1 | 2 | api docs: https://datadryad.org/api/v2/docs 3 | 4 | current search queries return 38,000 hits (December 2020) 5 | 6 | exmaple with multiple versions: 7 | https://datadryad.org/stash/dataset/doi:10.5061/dryad.fbg79cnr0 8 | https://datadryad.org/api/v2/datasets/doi%3A10.5061%2Fdryad.fbg79cnr0 9 | https://datadryad.org/api/v2/datasets/doi%3A10.5061%2Fdryad.fbg79cnr0/versions 10 | 11 | 12 | how to handle versions? DOI doesn't get incremented. 
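the versions endpoint above can be queried directly to see what is recorded per dataset; a quick sketch using the example DOI from this note (response shape not verified here, so it just pretty-prints):

    curl -s 'https://datadryad.org/api/v2/datasets/doi%3A10.5061%2Fdryad.fbg79cnr0/versions' | jq .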
13 | 14 | on archive.org, could have separate item for each version, or sub-directories within item, one for each version 15 | 16 | in fatcat, could have a release for each version, but only one with 17 | the DOI; or could have a separate fileset for each version 18 | -------------------------------------------------------------------------------- /match_test_data/RESULTS.txt: -------------------------------------------------------------------------------- 1 | 2 | "Out of 944 lines"... 3 | 4 | ## Git 92584ec4201ecc27af423cbff7b4bc1573edf175 5 | 6 | 76.27% match. 7 | 8 | time ./please --qa match-benchmark match_test_data/crossref_sample.bibjson match_test_data/grobid_sample.bibjson out.test 9 | 10 | real 0m56.061s 11 | user 1m3.852s 12 | sys 0m3.924s 13 | 14 | 720 lines 15 | 720 uniq DOI 16 | 720 uniq SHA1 17 | 18 | ## Git aa2f905d65713a581c7630ef2f931045059200ef 19 | 20 | real 0m56.347s 21 | user 1m3.328s 22 | sys 0m4.000s 23 | 24 | bnewbold@orithena$ wc -l out.test 25 | 722 out.test 26 | bnewbold@orithena$ cut -f3 out.test | jq .doi -r | sort -u | wc -l 27 | 722 28 | bnewbold@orithena$ cut -f4 out.test | jq .sha1 -r | sort -u | wc -l 29 | 722 30 | 31 | -------------------------------------------------------------------------------- /sql/reingest_terminalstatus_forcerecrawl.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e # fail on error 4 | set -u # fail if variable not set in substitution 5 | set -o pipefail # fail if part of a '|' command fails 6 | 7 | sudo -u postgres psql sandcrawler < dump_reingest_terminalstatus.sql 8 | 9 | cd ../python 10 | sudo -u sandcrawler pipenv run \ 11 | ./scripts/ingestrequest_row2json.py --force-recrawl /srv/sandcrawler/tasks/reingest_terminalstatus_current.rows.json \ 12 | > /srv/sandcrawler/tasks/reingest_terminalstatus_current.json 13 | 14 | cat /srv/sandcrawler/tasks/reingest_terminalstatus_current.json \ 15 | | shuf \ 16 | | head -n100000 \ 17 | | jq . -c \ 18 | | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1 19 | 20 | -------------------------------------------------------------------------------- /notes/examples/random_datasets.md: -------------------------------------------------------------------------------- 1 | 2 | Possible external datasets to ingest (which are not entire platforms): 3 | 4 | - https://research.google/tools/datasets/ 5 | - https://openslr.org/index.html 6 | - https://www.kaggle.com/datasets?sort=votes&tasks=true 7 | - https://archive.ics.uci.edu/ml/datasets.php 8 | 9 | Existing archive.org datasets to ingest: 10 | 11 | - https://archive.org/details/allthemusicllc-datasets 12 | 13 | Papers on archive.org to ingest: 14 | 15 | - 16 | - 17 | - 18 | - 19 | - 20 | -------------------------------------------------------------------------------- /notes/backfill_scalding_rewrite.txt: -------------------------------------------------------------------------------- 1 | 2 | Background context needed: 3 | - CDX text file format 4 | - rough arch outline (what runs where) 5 | - basic hadoop+hbase overview 6 | - hbase schema 7 | - quick look at hadoop and hbase web interfaces 8 | - maybe quick re-profile? 9 | 10 | Plan/Steps: 11 | x together: get *any* JVM map/reduce thing to build and run on cluster 12 | x together: get something to build that talks to hbase 13 | x basic JVM test infra; HBase mockup. 
"shopping" 14 | => scalding and/or cascading 15 | x simple hbase scan report generation (counts/stats) 16 | x CDX parsing 17 | - complete backfill script 18 | 19 | Spec for CDX backfill script: 20 | - input is CDX, output to HBase table 21 | - filter input before anything ("defensive"; only PDF, HTTP 200, size limit) 22 | - reads HBase before insert; don't overwrite 23 | -------------------------------------------------------------------------------- /sql/dump_unextracted_pdf.sql: -------------------------------------------------------------------------------- 1 | 2 | -- Run like: 3 | -- psql sandcrawler < dump_unextracted_pdf.sql 4 | 5 | BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; 6 | 7 | COPY ( 8 | SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx) 9 | FROM grobid 10 | LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex 11 | --LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex 12 | LEFT JOIN ingest_file_result ON grobid.sha1hex = ingest_file_result.terminal_sha1hex 13 | LEFT JOIN pdf_meta ON grobid.sha1hex = pdf_meta.sha1hex 14 | WHERE cdx.sha1hex IS NOT NULL 15 | --AND fatcat_file.sha1hex IS NOT NULL 16 | AND ingest_file_result.terminal_sha1hex IS NOT NULL 17 | AND pdf_meta.sha1hex IS NULL 18 | ) 19 | TO '/srv/sandcrawler/tasks/dump_unextracted_pdf.ingest.2020-10-21.json' 20 | WITH NULL ''; 21 | 22 | ROLLBACK; 23 | -------------------------------------------------------------------------------- /python/tests/test_grobid2json.py: -------------------------------------------------------------------------------- 1 | import json 2 | import xml 3 | 4 | import pytest 5 | from grobid_tei_xml import parse_document_xml 6 | 7 | 8 | def test_small_xml(): 9 | """ 10 | This used to be a test of grobid2json; now it is a compatability test for 11 | the to_legacy_dict() feature of grobid_tei_xml. 12 | """ 13 | 14 | with open("tests/files/small.xml", "r") as f: 15 | tei_xml = f.read() 16 | with open("tests/files/small.json", "r") as f: 17 | json_form = json.loads(f.read()) 18 | 19 | tei_doc = parse_document_xml(tei_xml) 20 | assert tei_doc.to_legacy_dict() == json_form 21 | 22 | 23 | def test_invalid_xml(): 24 | 25 | with pytest.raises(xml.etree.ElementTree.ParseError): 26 | parse_document_xml("this is not XML") 27 | with pytest.raises(ValueError): 28 | parse_document_xml("") 29 | -------------------------------------------------------------------------------- /notes/ingest/2020-09_scielo.md: -------------------------------------------------------------------------------- 1 | 2 | As a follow-up to `SCIELO-CRAWL-2020-07`, going to bulk ingest all existing 3 | fatcat releases with no IA copy and with `publisher_type:scielo`. There are 4 | 200k+ such releases. 5 | 6 | It seems like some of these are HTML or XML, eg: https://doi.org/10.4321/s1132-12962011000300008 7 | 8 | Could try XML ingest of these! 9 | 10 | ## Bulk Ingest 11 | 12 | Dump ingest requests 13 | 14 | ./fatcat_ingest.py --allow-non-oa query "publisher_type:scielo" | pv -l > /srv/fatcat/snapshots/scielo_papers_20200914.ingest_request.json 15 | Expecting 212529 release objects in search queries 16 | 17 | Enqueue 18 | 19 | cat /srv/fatcat/snapshots/scielo_papers_20200914.ingest_request.json | rg -v "\\\\" | jq . 
-c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 20 | => done 2020-09-14 21 | 22 | -------------------------------------------------------------------------------- /notes/crawl_cdx_merge.md: -------------------------------------------------------------------------------- 1 | 2 | ## New Way 3 | 4 | Run script from scratch repo: 5 | 6 | ~/scratch/bin/cdx_collection.py CRAWL-2000 7 | 8 | zcat CRAWL-2000.cdx.gz | wc -l 9 | 10 | # update crawl README/ANALYSIS/whatever 11 | 12 | Assuming we're just looking at PDFs: 13 | 14 | zcat CRAWL-2000.cdx.gz | rg -i pdf | sort -S 4G -u > CRAWL-2000.sorted.cdx 15 | 16 | ## Old Way 17 | 18 | Use metamgr to export an items list. 19 | 20 | Get all the CDX files and merge/sort: 21 | 22 | mkdir CRAWL-2000 && cd CRAWL-2000 23 | cat ../CRAWL-2000.items | shuf | parallel --bar -j6 ia download {} {}.cdx.gz 24 | ls */*.cdx.gz | parallel --bar -j1 zcat {} > CRAWL-2000.unsorted.cdx 25 | sort -S 4G -u CRAWL-2000.unsorted.cdx > CRAWL-2000.cdx 26 | wc -l CRAWL-2000.cdx 27 | rm CRAWL-2000.unsorted.cdx 28 | 29 | # gzip and upload to petabox, or send to HDFS, or whatever 30 | -------------------------------------------------------------------------------- /sql/table_sizes.md: -------------------------------------------------------------------------------- 1 | 2 | ## September 2019 3 | 4 | table_name | table_size | indexes_size | total_size 5 | --------------------------------------------------------------+------------+--------------+------------ 6 | "public"."cdx" | 31 GB | 27 GB | 58 GB 7 | "public"."file_meta" | 13 GB | 6500 MB | 19 GB 8 | "public"."shadow" | 8303 MB | 9216 MB | 17 GB 9 | "public"."grobid" | 4994 MB | 6678 MB | 11 GB 10 | "public"."fatcat_file" | 5206 MB | 2094 MB | 7300 MB 11 | "public"."petabox" | 403 MB | 594 MB | 997 MB 12 | -------------------------------------------------------------------------------- /pig/tests/test_filter_cdx_paper_pdfs.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | import unittest 4 | from pighelper import PigTestHelper, count_lines 5 | 6 | 7 | class TestFilterCDXPaperPdfs(PigTestHelper): 8 | 9 | def test_papers_domain_words(self): 10 | r = self.run_pig("filter-cdx-paper-pdfs.pig", "tests/files/papers_domain_words.cdx") 11 | assert count_lines(r) == 4 12 | 13 | def test_papers_edu_tilde(self): 14 | r = self.run_pig("filter-cdx-paper-pdfs.pig", "tests/files/papers_edu_tilde.cdx") 15 | assert count_lines(r) == 6 16 | 17 | def test_papers_url_doi(self): 18 | r = self.run_pig("filter-cdx-paper-pdfs.pig", "tests/files/papers_url_doi.cdx") 19 | assert count_lines(r) == 2 20 | 21 | def test_papers_url_words(self): 22 | r = self.run_pig("filter-cdx-paper-pdfs.pig", "tests/files/papers_url_words.cdx") 23 | assert count_lines(r) == 12 24 | 25 | -------------------------------------------------------------------------------- /python/.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | select = C,E,F,W,ANN 3 | # ANN003 is annotation on, eg, **kwargs 4 | # ANN101 is annotation on 'self' (why would that be wanted?) 
5 | # ANN204 is annotation on '__init__()' 6 | # ANN401 is 'Any' type 7 | # E265,E266 are restrictions on comments ('#') 8 | # E501 is line-too-long, which we enforce with black 9 | # W503,E203 are allowed by black 10 | # TODO: C901 is complexity, should be re-enabled at some point 11 | ignore = ANN003,ANN101,ANN204,ANN401,E265,E266,E501,C901,W503,E203 12 | per-file-ignores = 13 | sandcrawler/__init__.py: F401 14 | sandcrawler/ia.py: E402 15 | tests/*.py: ANN201,ANN001,F403,F405 16 | # TODO: add more annotations to CLI scripts 17 | *_tool.py,sandcrawler_worker.py: ANN201,ANN001,ANN202,ANN206,ANN205,F403,F405 18 | scripts: ANN201,ANN001,ANN202,ANN206,ANN205 19 | exclude = .git,__pycache__,.venv,scripts/ 20 | max-line-length = 96 21 | max-complexity = 30 22 | -------------------------------------------------------------------------------- /notes/possible_ingest_targets.txt: -------------------------------------------------------------------------------- 1 | 2 | - all releases from small journals, regardless of OA status, if small (eg, less than 200 papers published), and not big5 3 | 4 | more complex crawling/content: 5 | - add video link to alternative content demo ingest: https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0400764 6 | - watermark.silverchair.com: if terminal-bad-status, then do recrawl via heritrix with base_url 7 | - www.morressier.com: interesting site for rich web crawling/preservation (video+slides+data) 8 | - doi.ala.org.au: possible dataset ingest source 9 | - peerj.com, at least reviews, should be HTML ingest? or are some PDF? 10 | - publons.com should be HTML ingest, possibly special case for scope 11 | - frontiersin.org: any 'component' releases with PDF file are probably a metadata bug 12 | 13 | other tasks: 14 | - handle this related withdrawn notice? https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0401512 15 | - push/deploy sandcrawler changes 16 | -------------------------------------------------------------------------------- /extra/docker/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '2' 2 | services: 3 | zookeeper: 4 | image: wurstmeister/zookeeper 5 | ports: 6 | - "2181:2181" 7 | kafka: 8 | image: wurstmeister/kafka:2.11-2.0.0 9 | ports: 10 | - "9092:9092" 11 | environment: 12 | #HOSTNAME_COMMAND: "docker info | grep ^Name: | cut -d' ' -f 2" 13 | KAFKA_BROKER_ID: 1 14 | KAFKA_ADVERTISED_HOST_NAME: 127.0.0.1 15 | KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181 16 | KAFKA_CREATE_TOPICS: "fatcat-dev.changelog:1:1,fatcat-dev.release-updates:3:1:compact" 17 | KAFKA_MESSAGE_MAX_BYTES: 50000000 18 | volumes: 19 | - /var/run/docker.sock:/var/run/docker.sock 20 | depends_on: 21 | - zookeeper 22 | postgrest: 23 | image: postgrest/postgrest 24 | network_mode: "host" 25 | ports: 26 | - "3000:3000" 27 | environment: 28 | PGRST_DB_URI: "postgres://fatcat:tactaf@localhost/sandcrawler" 29 | PGRST_DB_ANON_ROLE: "fatcat" 30 | -------------------------------------------------------------------------------- /notes/tasks/2022-01-07_grobid_platform_pdfs.md: -------------------------------------------------------------------------------- 1 | 2 | Martin crawled more than 10 million new PDFs from various platform domains. We 3 | should get these processed and included in sandcrawler-db. 
4 | 5 | ## Select CDX Rows 6 | 7 | COPY ( 8 | SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx) 9 | FROM cdx 10 | LEFT JOIN grobid ON grobid.sha1hex = cdx.sha1hex 11 | WHERE 12 | grobid.sha1hex IS NULL 13 | AND cdx.sha1hex IS NOT NULL 14 | AND cdx.warc_path LIKE 'PLATFORM-CRAWL-2020%' 15 | -- LIMIT 5; 16 | ) 17 | TO '/srv/sandcrawler/tasks/ungrobided_platform_crawl.2022-01-07.cdx.json' 18 | WITH NULL ''; 19 | => COPY 8801527 20 | 21 | cat /srv/sandcrawler/tasks/ungrobided_platform_crawl.2022-01-07.cdx.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1 22 | 23 | # for pdfextract, would be: sandcrawler-prod.unextracted 24 | -------------------------------------------------------------------------------- /pig/filter-cdx-pdfs.pig: -------------------------------------------------------------------------------- 1 | 2 | -- Tries to filter down a large CDX file (GWB index) to a subset of PDFs, by mimetype. 3 | -- 4 | -- Author: Bryan Newbold 5 | -- Date: May 2018 6 | 7 | %default INPUT '' 8 | %default OUTPUT '' 9 | 10 | set mapreduce.job.queuename default 11 | 12 | cdx = LOAD '$INPUT' AS cdxline:chararray; 13 | cdx = FILTER cdx BY not STARTSWITH (cdxline, 'filedesc'); 14 | cdx = FILTER cdx BY not STARTSWITH (cdxline, ' '); 15 | 16 | cdx = FOREACH cdx GENERATE STRSPLIT(cdxline,'\\s+') as cols, cdxline; 17 | cdx = FOREACH cdx GENERATE (chararray)cols.$0 as url, (chararray)cols.$1 as timestamp, (chararray)cols.$3 as mimetype, (chararray)cols.$4 as httpstatus, cdxline; 18 | cdx = FILTER cdx BY not url matches '-'; 19 | cdx = FILTER cdx BY httpstatus matches '200'; 20 | cdx = FILTER cdx BY mimetype matches '.*pdf.*'; 21 | cdx = ORDER cdx by url, timestamp PARALLEL 50; 22 | cdx = FOREACH cdx GENERATE cdxline; 23 | STORE cdx INTO '$OUTPUT' USING PigStorage(' '); 24 | 25 | -------------------------------------------------------------------------------- /pig/filter-cdx-ps.pig: -------------------------------------------------------------------------------- 1 | -- Tries to filter down a large CDX file (GWB index) to a subset of postscript 2 | -- files, by mimetype. 
3 | -- 4 | -- Author: Bryan Newbold 5 | -- Date: May 2018 6 | 7 | %default INPUT '' 8 | %default OUTPUT '' 9 | 10 | set mapreduce.job.queuename default 11 | 12 | cdx = LOAD '$INPUT' AS cdxline:chararray; 13 | cdx = FILTER cdx BY not STARTSWITH (cdxline, 'filedesc'); 14 | cdx = FILTER cdx BY not STARTSWITH (cdxline, ' '); 15 | 16 | cdx = FOREACH cdx GENERATE STRSPLIT(cdxline,'\\s+') as cols, cdxline; 17 | cdx = FOREACH cdx GENERATE (chararray)cols.$0 as url, (chararray)cols.$1 as timestamp, (chararray)cols.$3 as mimetype, (chararray)cols.$4 as httpstatus, cdxline; 18 | cdx = FILTER cdx BY not url matches '-'; 19 | cdx = FILTER cdx BY httpstatus matches '200'; 20 | cdx = FILTER cdx BY mimetype matches '.*postscript.*'; 21 | cdx = ORDER cdx by url, timestamp PARALLEL 50; 22 | cdx = FOREACH cdx GENERATE cdxline; 23 | STORE cdx INTO '$OUTPUT' USING PigStorage(' '); 24 | 25 | -------------------------------------------------------------------------------- /python/Makefile: -------------------------------------------------------------------------------- 1 | 2 | SHELL = /bin/bash 3 | .SHELLFLAGS = -o pipefail -c 4 | 5 | .PHONY: help 6 | help: ## Print info about all commands 7 | @echo "Commands:" 8 | @echo 9 | @grep -E '^[a-zA-Z0-9_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf " \033[01;32m%-20s\033[0m %s\n", $$1, $$2}' 10 | 11 | .PHONY: deps 12 | deps: ## Install dependencies using pipenv 13 | pipenv install --dev 14 | 15 | .PHONY: lint 16 | lint: ## Run lints (eg, flake8, mypy) 17 | pipenv run flake8 . --exit-zero 18 | pipenv run isort -q -c . || true 19 | pipenv run mypy *.py sandcrawler/ tests/ --ignore-missing-imports 20 | 21 | .PHONY: fmt 22 | fmt: ## Run code formating on all source code 23 | pipenv run isort --atomic . 24 | pipenv run black --line-length 96 sandcrawler/ tests/ scripts/ *.py 25 | 26 | .PHONY: test 27 | test: ## Run all tests and lints 28 | pipenv run pytest 29 | 30 | .PHONY: coverage 31 | coverage: ## Run all tests with coverage 32 | pipenv run pytest --cov --cov-report=term --cov-report=html 33 | -------------------------------------------------------------------------------- /Dockerfile.sandcrawler-pytest: -------------------------------------------------------------------------------- 1 | # syntax=docker/dockerfile:1 2 | 3 | FROM ubuntu:focal 4 | WORKDIR /src 5 | COPY . . 6 | ENV LC_ALL=C.UTF-8 7 | ENV LANG=C.UTF-8 8 | ENV DEBIAN_FRONTEND=noninteractive 9 | # copied and modified from gitlab ci yml file 10 | RUN apt update && apt install -y python3-dev python3-pip python3-wheel libjpeg-dev libpq-dev python-dev python3.8 python3.8-dev python3.8-venv python3.8-distutils pkg-config python3-pytest git libsnappy-dev libsodium-dev libpoppler-cpp-dev cmake libpython3.8-dev build-essential poppler-data libmagic1 pipenv wget && pip install pipenv pytest 11 | #RUN git config --global --add safe.directory /src 12 | WORKDIR python 13 | RUN make deps 14 | CMD make test 15 | 16 | # Build 17 | # NB: requires sshuttle or similar bc build process talks to devpi.us.archive.org 18 | # docker build --network=host -t sandcrawler-pytest -f Dockerfile.sandcrawler-pytest . 
19 | 20 | # Run, adjusting source path as needed 21 | # docker run --network host -v/home/vilmibm/src/sandcrawler:/src sandcrawler-pytest 22 | -------------------------------------------------------------------------------- /python/pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | 3 | # allow imports from files in current directory 4 | python_paths = . 5 | 6 | # search for 'test_*' functions in all python files, not just under tests 7 | python_files = *.py 8 | 9 | addopts = --pylint --pylint-rcfile=.pylintrc --pylint-error-types=EF --pylint-jobs=4 10 | 11 | # ignore various third party warnings (in .venv) 12 | filterwarnings = 13 | ignore:.*common_exception_handling.*StopIteration:PendingDeprecationWarning 14 | ignore:.*deprecated and will be removed in Werkzeug 1.0.*:DeprecationWarning 15 | ignore::DeprecationWarning:.*surt 16 | ignore::DeprecationWarning:.*urllib3 17 | ignore::DeprecationWarning:.*wayback 18 | ignore::DeprecationWarning:.*PIL 19 | ignore::DeprecationWarning:.*justext 20 | ignore::DeprecationWarning:.*internetarchive 21 | ignore::DeprecationWarning:.*minio 22 | ignore::DeprecationWarning:.*base_reporter 23 | ignore::DeprecationWarning:.*loccache 24 | ignore:.*pytz-deprecation-shim 25 | 26 | log_level = INFO 27 | -------------------------------------------------------------------------------- /python/tests/test_pushers.py: -------------------------------------------------------------------------------- 1 | from sandcrawler.workers import BlackholeSink, CdxLinePusher 2 | 3 | 4 | def test_cdx_line_pusher(): 5 | 6 | sink = BlackholeSink() 7 | 8 | # vanilla (only default filters) 9 | with open("tests/files/example.cdx", "r") as cdx_file: 10 | pusher = CdxLinePusher(sink, cdx_file) 11 | counts = pusher.run() 12 | assert counts["total"] == 20 13 | assert counts["skip-parse"] == 1 14 | assert counts["pushed"] == 19 15 | 16 | # HTTP 200 and application/pdf 17 | with open("tests/files/example.cdx", "r") as cdx_file: 18 | pusher = CdxLinePusher( 19 | sink, 20 | cdx_file, 21 | filter_mimetypes=["application/pdf"], 22 | filter_http_statuses=[200, 226], 23 | ) 24 | counts = pusher.run() 25 | assert counts["total"] == 20 26 | assert counts["skip-parse"] == 1 27 | assert counts["skip-http_status"] == 10 28 | assert counts["skip-mimetype"] == 2 29 | assert counts["pushed"] == 7 30 | -------------------------------------------------------------------------------- /sql/reingest_weekly.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e # fail on error 4 | set -u # fail if variable not set in substitution 5 | # can't use pipefail here because under normal operations kafkacat will exit 6 | # code with a 141 (indicating that a pipe ran out of stuff for it to read). 7 | # this will always trigger this file to report failure and thus lead to 8 | # perpetually failing this when used in a systemd service. 9 | #set -o pipefail # fail if part of a '|' command fails 10 | 11 | sudo -u postgres psql sandcrawler < dump_reingest_weekly.sql 12 | 13 | cd ../python 14 | sudo -u sandcrawler pipenv run \ 15 | ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_weekly_current.rows.json \ 16 | > /srv/sandcrawler/tasks/reingest_weekly_current.json 17 | 18 | cat /srv/sandcrawler/tasks/reingest_weekly_current.json \ 19 | | shuf \ 20 | | head -n80000 \ 21 | | jq . 
-c \ 22 | | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1 23 | 24 | -------------------------------------------------------------------------------- /sql/reingest_quarterly.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e # fail on error 4 | set -u # fail if variable not set in substitution 5 | # can't use pipefail here because under normal operations kafkacat will exit 6 | # code with a 141 (indicating that a pipe ran out of stuff for it to read). 7 | # this will always trigger this file to report failure and thus lead to 8 | # perpetually failing this when used in a systemd service. 9 | #set -o pipefail # fail if part of a '|' command fails 10 | 11 | sudo -u postgres psql sandcrawler < dump_reingest_quarterly.sql 12 | 13 | cd ../python 14 | sudo -u sandcrawler pipenv run \ 15 | ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_quarterly_current.rows.json \ 16 | > /srv/sandcrawler/tasks/reingest_quarterly_current.json 17 | 18 | cat /srv/sandcrawler/tasks/reingest_quarterly_current.json \ 19 | | shuf \ 20 | | head -n120000 \ 21 | | jq . -c \ 22 | | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1 23 | 24 | -------------------------------------------------------------------------------- /pig/tests/files/tarballs.cdx: -------------------------------------------------------------------------------- 1 | #http://research.fit.edu/sealevelriselibrary/documents/doc_mgr/448/Florida_Keys_Low_Island_Biodiversity_&_SLR_-_Ross_et_al_2009.pdf 2 | #http://ijs.sgmjournals.org:80/cgi/reprint/54/6/2217.pdf 3 | #http://eprints.ecs.soton.ac.uk/12020/1/mind-the-semantic-gap.pdf 4 | #http://eprint.uq.edu.au/archive/00004120/01/R103_Forrester_pp.pdf 5 | 6 | # should match 2: 7 | 8 | edu,fit,research)/sealevelriselibrary/documents/doc_mgr/448/Florida_Keys_Low_Island_Biodiversity_&_SLR_-_Ross_et_al_2009.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3I - - 123 456 CRAWL/CRAWL.warc.gz 9 | edu,fit,research)/sealevelriselibrary/documents/doc_mgr/448/Florida_Keys_Low_Island_Biodiversity_&_SLR_-_Ross_et_al_2009.tar.gz 20170706005950 http://mit.edu/file.tar.gz application/octet-stream 200 NQHD36X5MNZPWFNMD5LFOYZSFGCHUN3I - - 123 456 CRAWL/CRAWL.warc.gz 10 | org,sgmjournals,ijs)//cgi/reprint/54/6/2217.tar.gz 20170706005950 http://mit.edu/file.tar.gz application/gzip 200 TQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz 11 | -------------------------------------------------------------------------------- /scalding/src/main/scala/sandcrawler/HBaseRowCountJob.scala: -------------------------------------------------------------------------------- 1 | package sandcrawler 2 | 3 | import java.util.Properties 4 | 5 | import cascading.property.AppProps 6 | import cascading.tuple.Fields 7 | import com.twitter.scalding._ 8 | import parallelai.spyglass.base.JobBase 9 | import parallelai.spyglass.hbase.HBaseConstants.SourceMode 10 | import parallelai.spyglass.hbase.HBasePipeConversions 11 | import parallelai.spyglass.hbase.HBaseSource 12 | 13 | class HBaseRowCountJob(args: Args) extends JobBase(args) with HBasePipeConversions { 14 | 15 | val output = args("output") 16 | 17 | HBaseRowCountJob.getHBaseSource( 18 | args("hbase-table"), 19 | args("zookeeper-hosts")) 20 | .read 21 | .debug 22 | .groupAll { _.size('count) } 23 | .write(Tsv(output)) 24 | } 25 | 26 | object HBaseRowCountJob { 27 | 28 | 
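  // Helper that builds an HBaseSource scanning every row, reading only the 'f:c' column.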
// eg, "wbgrp-journal-extract-0-qa",7 "mtrcs-zk1.us.archive.org:2181" 29 | def getHBaseSource(hbaseTable: String, zookeeperHosts: String) : HBaseSource = { 30 | HBaseBuilder.build( 31 | hbaseTable, 32 | zookeeperHosts, 33 | List("f:c"), 34 | SourceMode.SCAN_ALL) 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /python/scripts/pdf_thumbnail.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Quick CLI script to convert a PDF to thumbnail (.png, jpeg, etc). 4 | 5 | Originally used to benchmark and compare file size/quality. 6 | """ 7 | 8 | import sys 9 | 10 | import poppler 11 | from PIL import Image 12 | 13 | 14 | def run(inpath, outpath): 15 | 16 | try: 17 | pdf = poppler.load_from_file(inpath) 18 | page = pdf.create_page(0) 19 | except Exception as e: 20 | print(str(e), file=sys.stderr) 21 | sys.exit(0) 22 | 23 | renderer = poppler.PageRenderer() 24 | full_page = renderer.render_page(page) 25 | img = Image.frombuffer( 26 | "RGBA", (full_page.width, full_page.height), full_page.data, "raw", "BGRA", 0, 1 27 | ) 28 | img.thumbnail((180, 300), Image.BICUBIC) 29 | # img.thumbnail((360,600), Image.BICUBIC) 30 | img.save(outpath) 31 | # img.save(outpath, quality=95) 32 | 33 | 34 | if __name__ == "__main__": 35 | if len(sys.argv) != 3: 36 | print("expect two parameters: INPUT.png OUTPUT.png", file=sys.stderr) 37 | sys.exit(-1) 38 | run(sys.argv[1], sys.argv[2]) 39 | -------------------------------------------------------------------------------- /sql/stats/2021-11-01_table_sizes.txt: -------------------------------------------------------------------------------- 1 | 2 | Size: 832.66G 3 | 4 | table_name | table_size | indexes_size | total_size 5 | -------------------------------+------------+--------------+------------ 6 | "public"."crossref" | 311 GB | 9812 MB | 320 GB 7 | "public"."ingest_request" | 44 GB | 40 GB | 84 GB 8 | "public"."cdx" | 52 GB | 28 GB | 80 GB 9 | "public"."grobid" | 72 GB | 6952 MB | 79 GB 10 | "public"."ingest_file_result" | 38 GB | 40 GB | 78 GB 11 | "public"."grobid_shadow" | 67 GB | 5455 MB | 73 GB 12 | "public"."file_meta" | 34 GB | 21 GB | 54 GB 13 | "public"."pdf_meta" | 20 GB | 5813 MB | 26 GB 14 | "public"."fatcat_file" | 12 GB | 6602 MB | 18 GB 15 | "public"."shadow" | 9517 MB | 8026 MB | 17 GB 16 | "public"."html_meta" | 1200 MB | 8072 kB | 1208 MB 17 | "public"."petabox" | 403 MB | 461 MB | 864 MB 18 | "public"."pdftrio" | 550 MB | 297 MB | 847 MB 19 | (13 rows) 20 | -------------------------------------------------------------------------------- /notes/ingest/es_csv_to_json.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | input like: 5 | 6 | doi,ident,"release_stage" 7 | "10.7554/elife.38904",mxj534diw5gatc26rkif3io5xm,published 8 | "10.7554/elife.41855",kag74qc6dfex7ftpfkf7iaus44,published 9 | "10.7554/elife.41156",ienee5vxcbbbfhs2q54h4455hu,published 10 | "10.7554/elife.43230",52rpllol2rcndjqs3xfwcldeka,published 11 | "10.7554/elife.42591",fpz642gihrc3jd2vibg6gnjrxm,published 12 | 13 | output like: 14 | 15 | { 16 | "base_url": "https://doi.org/10.7554/elife.38904", 17 | "ext_ids": { 18 | "doi": "10.7554/elife.38904" 19 | }, 20 | "fatcat_release": "mxj534diw5gatc26rkif3io5xm", 21 | "release_stage": "published" 22 | } 23 | """ 24 | 25 | import csv, sys, json 26 | 27 | reader = csv.DictReader(sys.stdin) 28 | for row in reader: 29 | d = { 30 | "base_url": 
"https://doi.org/{}".format(row['doi']), 31 | "ext_ids": { 32 | "doi": row['doi'], 33 | }, 34 | "fatcat_release": row['ident'], 35 | "release_stage": row['release_stage'], 36 | } 37 | print(json.dumps(d)) 38 | -------------------------------------------------------------------------------- /notes/ingest/2020-03-oa_but_not_marked.md: -------------------------------------------------------------------------------- 1 | 2 | These are large journals with a high fraction of "in IA", but not marked as OA 3 | so not crawling regularly. 4 | 5 | TODO: add things like list of unpaywall ISSN / OA status to try and find more 6 | "practical" / bronze OA 7 | 8 | ## First Run 9 | 10 | https://fatcat.wiki/container/vmv647omwrhzzgeclyrnpc4him 11 | https://fatcat.wiki/container/waxwzq3cnbet3cmwccpuk4bel4 12 | https://fatcat.wiki/container/hjoli2j6qffdpaalkszryuidk4 13 | https://fatcat.wiki/container/fci57bxfsffvzllbssocnfsr3e 14 | https://fatcat.wiki/container/hd23c57sunhcnar5fbgxsn36lm 15 | https://fatcat.wiki/container/bliguyxhonfb7ghuykxgtg3oqe 16 | 17 | ## TODO 18 | 19 | https://fatcat.wiki/container/kn6dhptylrb77b5atyiom5ysjm no-pdf-link (but accessible) 20 | https://fatcat.wiki/container/s7bticdwizdmhll4taefg57jde no-pdf-link (easy?) 21 | 22 | https://fatcat.wiki/container/zm56axre7rgihh5sznxp65np5i large; no-pdf-link? 23 | https://fatcat.wiki/container/eb2lcnpf2zeezkmfckcvxw2pgi huge (20k+), not all OA? 24 | https://fatcat.wiki/container/adgy773dtra3xmrsynghcednqm broken? 25 | https://fatcat.wiki/container/w3gj5mynrnbtndalcc5jnhymym not OA? link-loop 26 | -------------------------------------------------------------------------------- /scalding/src/main/scala/sandcrawler/HBaseColCountJob.scala: -------------------------------------------------------------------------------- 1 | package sandcrawler 2 | 3 | import java.util.Properties 4 | 5 | import cascading.property.AppProps 6 | import cascading.tuple.Fields 7 | import com.twitter.scalding._ 8 | import parallelai.spyglass.base.JobBase 9 | import parallelai.spyglass.hbase.HBaseConstants.SourceMode 10 | import parallelai.spyglass.hbase.HBasePipeConversions 11 | import parallelai.spyglass.hbase.HBaseSource 12 | 13 | class HBaseColCountJob(args: Args) extends JobBase(args) with HBasePipeConversions { 14 | 15 | val output = args("output") 16 | 17 | HBaseColCountJob.getHBaseSource( 18 | args("hbase-table"), 19 | args("zookeeper-hosts"), 20 | args("column")) 21 | .read 22 | .debug 23 | .groupAll { _.size('count) } 24 | .write(Tsv(output)) 25 | } 26 | 27 | object HBaseColCountJob { 28 | 29 | // eg, "wbgrp-journal-extract-0-qa",7 "mtrcs-zk1.us.archive.org:2181" 30 | def getHBaseSource(hbaseTable: String, zookeeperHosts: String, col: String) : HBaseSource = { 31 | HBaseBuilder.build( 32 | hbaseTable, 33 | zookeeperHosts, 34 | List(col), 35 | SourceMode.SCAN_ALL) 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /pig/tests/files/sourcecode.cdx: -------------------------------------------------------------------------------- 1 | # match 2 | edu,cmu,cs,adm,reports-archive)/anon/usr0/ftp/usr0/anon/2002/cmu-cs-02-119.java 20170706005950 http://reports-archive.adm.cs.cmu.edu/anon/usr0/ftp/usr0/anon/2002/CMU-CS-02-119.java text/plain 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 361006 17120058 CITESEERX-CRAWL-2017-06-20-20170706004100259-00924-00932-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170706005946792-00926-31209~wbgrp-svc284.us.archive.org~8443.warc.gz 3 | # no 4 | 
fi,tkk,lib)/diss/2001/isbn951225459x/isbn951225459x.pyc 20170705074926 http://lib.tkk.fi/Diss/2001/isbn951225459X/isbn951225459X.pyc text/plain 200 KJBCOT7LGBNIAVGEGPUELK5OK6RTFORR - - 344175 255650124 CITESEERX-CRAWL-2017-06-20-20170705074433815-00129-00138-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705074843696-00134-31209~wbgrp-svc284.us.archive.org~8443.warc.gz 5 | # no 6 | org,oxfordjournals,nar)/cgi/reprint/gkl1060v1.pdf 20170706035441 http://nar.oxfordjournals.org/cgi/reprint/gkl1060v1.pdf text/html 301 OX6MLVDFURLT2KSYCXUYW2PZNOVFSEVF - - 697 49346051 CITESEERX-CRAWL-2017-06-20-20170706034741172-00140-00149-wbgrp-svc285/CITESEERX-CRAWL-2017-06-20-20170706035435634-00148-3671~wbgrp-svc285.us.archive.org~8443.warc.gz 7 | -------------------------------------------------------------------------------- /pig/tests/files/papers_domain_words.cdx: -------------------------------------------------------------------------------- 1 | #http://research.fit.edu/sealevelriselibrary/documents/doc_mgr/448/Florida_Keys_Low_Island_Biodiversity_&_SLR_-_Ross_et_al_2009.pdf 2 | #http://ijs.sgmjournals.org:80/cgi/reprint/54/6/2217.pdf 3 | #http://eprints.ecs.soton.ac.uk/12020/1/mind-the-semantic-gap.pdf 4 | #http://eprint.uq.edu.au/archive/00004120/01/R103_Forrester_pp.pdf 5 | 6 | # should match 4: 7 | 8 | edu,fit,research)/sealevelriselibrary/documents/doc_mgr/448/Florida_Keys_Low_Island_Biodiversity_&_SLR_-_Ross_et_al_2009.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3I - - 123 456 CRAWL/CRAWL.warc.gz 9 | org,sgmjournals,ijs)//cgi/reprint/54/6/2217.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 TQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz 10 | uk,ac,soton,ecs,eprints)/12020/1/mind-the-semantic-gap.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 NQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz 11 | au,edu,uq,eprint)/archive/00004120/01/R103_Forrester_pp.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 QQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz 12 | -------------------------------------------------------------------------------- /python/scripts/enrich_scored_matches.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Takes a "joined" TSV input stream: 4 | 5 | - sha1 6 | - dois (JSON list) 7 | - cdx (JSON object) 8 | - url 9 | - dt 10 | (etc) 11 | - mimetype 12 | - size (integer) 13 | 14 | And outputs JSON objects that can be imported into fatcat with the 15 | "matched" script.
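As a purely illustrative example (all values made up), an input row of `sha1:MFYA35DB...<TAB>["10.123/abc"]<TAB>{"url": "http://example.com/paper.pdf", "dt": "20180101000000"}<TAB>application/pdf<TAB>12345` would be emitted as `{"sha1": "<hex>", "dois": ["10.123/abc"], "cdx": [{"url": "http://example.com/paper.pdf", "dt": "20180101000000"}], "size": 12345, "mimetype": "application/pdf"}` (the sha1 is converted from base32 to lowercase hex).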
16 | 17 | No dependencies (only python3 stdlib) 18 | """ 19 | 20 | import base64 21 | import json 22 | import sys 23 | 24 | 25 | def run(): 26 | for line in sys.stdin: 27 | line = line.split("\t") 28 | assert len(line) == 5 29 | raw_sha1 = line[0].replace("sha1:", "") 30 | dois = json.loads(line[1]) 31 | cdx = json.loads(line[2]) 32 | mimetype = line[3] 33 | size = int(line[4]) 34 | 35 | sha1 = base64.b16encode(base64.b32decode(raw_sha1)).decode("ascii").lower() 36 | 37 | obj = dict( 38 | sha1=sha1, 39 | dois=dois, 40 | cdx=[dict(url=cdx["url"], dt=cdx["dt"])], 41 | size=size, 42 | mimetype=mimetype, 43 | ) 44 | print(json.dumps(obj)) 45 | 46 | 47 | if __name__ == "__main__": 48 | run() 49 | -------------------------------------------------------------------------------- /scalding/src/main/scala/sandcrawler/HBaseStatusCountJob.scala: -------------------------------------------------------------------------------- 1 | package sandcrawler 2 | 3 | import java.util.Properties 4 | 5 | import cascading.property.AppProps 6 | import cascading.tuple.Fields 7 | import com.twitter.scalding._ 8 | import com.twitter.scalding.typed.TDsl._ 9 | import org.apache.hadoop.hbase.io.ImmutableBytesWritable 10 | import org.apache.hadoop.hbase.util.Bytes 11 | import parallelai.spyglass.base.JobBase 12 | import parallelai.spyglass.hbase.HBaseConstants.SourceMode 13 | import parallelai.spyglass.hbase.HBasePipeConversions 14 | import parallelai.spyglass.hbase.HBaseSource 15 | 16 | class HBaseStatusCountJob(args: Args) extends JobBase(args) with HBasePipeConversions { 17 | 18 | val source = HBaseCountJob.getHBaseSource( 19 | args("hbase-table"), 20 | args("zookeeper-hosts"), 21 | "grobid0:status") 22 | 23 | val statusPipe : TypedPipe[String] = source 24 | .read 25 | .toTypedPipe[(ImmutableBytesWritable,ImmutableBytesWritable)]('key, 'status) 26 | .map { case (key, raw_status) => Bytes.toString(raw_status.copyBytes()) } 27 | 28 | statusPipe.groupBy { identity } 29 | .size 30 | .debug 31 | .write(TypedTsv[(String,Long)](args("output"))) 32 | } 33 | -------------------------------------------------------------------------------- /scalding/src/main/scala/sandcrawler/HBaseStatusCodeCountJob.scala: -------------------------------------------------------------------------------- 1 | package sandcrawler 2 | 3 | import java.util.Properties 4 | 5 | import cascading.property.AppProps 6 | import cascading.tuple.Fields 7 | import com.twitter.scalding._ 8 | import com.twitter.scalding.typed.TDsl._ 9 | import org.apache.hadoop.hbase.io.ImmutableBytesWritable 10 | import org.apache.hadoop.hbase.util.Bytes 11 | import parallelai.spyglass.base.JobBase 12 | import parallelai.spyglass.hbase.HBaseConstants.SourceMode 13 | import parallelai.spyglass.hbase.HBasePipeConversions 14 | import parallelai.spyglass.hbase.HBaseSource 15 | 16 | class HBaseStatusCodeCountJob(args: Args) extends JobBase(args) with HBasePipeConversions { 17 | 18 | val source = HBaseCountJob.getHBaseSource( 19 | args("hbase-table"), 20 | args("zookeeper-hosts"), 21 | "grobid0:status_code") 22 | 23 | val statusPipe : TypedPipe[Long] = source 24 | .read 25 | .toTypedPipe[(ImmutableBytesWritable,ImmutableBytesWritable)]('key, 'status_code) 26 | .map { case (key, raw_code) => Bytes.toLong(raw_code.copyBytes()) } 27 | 28 | statusPipe.groupBy { identity } 29 | .size 30 | .debug 31 | .write(TypedTsv[(Long,Long)](args("output"))) 32 | } 33 | -------------------------------------------------------------------------------- /pig/README.md: 
-------------------------------------------------------------------------------- 1 | 2 | As of March 2018, the archive runs Pig version 0.12.0, via CDH5.0.1 (Cloudera 3 | Distribution). 4 | 5 | "Local mode" unit tests in this folder run with Pig version 0.17.0 (controlled 6 | by `fetch_deps.sh`) due to [dependency/jar issues][pig-bug] in local mode of 7 | 0.12.0. 8 | 9 | [pig-bug]: https://issues.apache.org/jira/browse/PIG-3530 10 | 11 | ## Development and Testing 12 | 13 | To run tests, you need Java installed and `JAVA_HOME` configured. 14 | 15 | Fetch dependencies (including pig) from top-level directory: 16 | 17 | ./fetch_hadoop.sh 18 | 19 | Write `.pig` scripts in this directory, and add a python wrapper test to 20 | `./tests/` when done. Test vector files (input/output) can go in 21 | `./tests/files/`. 22 | 23 | Run the tests with: 24 | 25 | pipenv run pytest 26 | 27 | Could also, in theory, use a docker image ([local-pig][]), but it's pretty easy 28 | to just download. 29 | 30 | [local-pig]: https://hub.docker.com/r/chalimartines/local-pig 31 | 32 | ## Run in Production 33 | 34 | pig -param INPUT="/user/bnewbold/pdfs/global-20171227034923" \ 35 | -param OUTPUT="/user/bnewbold/pdfs/gwb-pdf-20171227034923-surt-filter" \ 36 | filter-cdx-paper-pdfs.pig 37 | -------------------------------------------------------------------------------- /notes/ingest/2023-10_dimensions.md: -------------------------------------------------------------------------------- 1 | # Dimensions OA list 2 | 3 | In 09/2023 dimensions.ai handed over a list of 11667892 DOI and URL pairs; for 4 | 1613390 of these we found an exact match in the fatcat file entity data. We could 5 | look up 11040477 URLs successfully via the CDX index, and found that 2526822 URLs 6 | were not in GWB as of 2023-10-16. 7 | 8 | Top 20 domains: 9 | 10 | ``` 11 | 613732 doi.org 12 | 150592 europepmc.org 13 | 98725 academic.oup.com 14 | 47932 journals.lww.com 15 | 46191 www.ncbi.nlm.nih.gov 16 | 31808 www.biodiversitylibrary.org 17 | 30290 arxiv.org 18 | 28737 zenodo.org 19 | 28375 hdl.handle.net 20 | 26226 pubs.aip.org 21 | 25667 dergipark.org.tr 22 | 17771 pubs.lib.uiowa.edu 23 | 17220 www.cairn.info 24 | 17134 osf.io 25 | 17035 www.mdpi.com 26 | 15459 archive.org 27 | 12586 www.preprints.org 28 | 11171 ojs.omniscient.sg 29 | 10938 hal.science 30 | 9918 dl.acm.org 31 | ``` 32 | 33 | Out of these 2.5M URLs alone, we could guess about 21350 OAI/OJS endpoints that 34 | we did not know about before. 35 | 36 | More on the comparison: [https://git.archive.org/martin/scratch/-/tree/master/SPECPRJCTS-3102-Dimensions](https://git.archive.org/martin/scratch/-/tree/master/SPECPRJCTS-3102-Dimensions) 37 | 38 | -------------------------------------------------------------------------------- /notes/tasks/2020-01-27_cleanup_cdx.md: -------------------------------------------------------------------------------- 1 | 2 | Accidentally seem to have backfilled many CDX lines with non-PDF content. 3 | Should clear these out!
4 | 5 | Something like: 6 | 7 | mimetype = 'text/html' 8 | not in file_meta 9 | 10 | Or maybe instead: 11 | 12 | mimetype = 'text/html' 13 | not in file_meta 14 | 15 | SQL: 16 | 17 | SELECT * FROM cdx WHERE mimetype = 'text/html' AND row_created < '2019-10-01' LIMIT 5; 18 | SELECT COUNT(1) FROM cdx WHERE mimetype = 'text/html' AND row_created < '2019-10-01'; 19 | => 24841846 20 | 21 | SELECT * FROM cdx LEFT JOIN file_meta ON file_meta.sha1hex = cdx.sha1hex WHERE cdx.mimetype = 'text/html' AND file_meta.sha256hex IS NULL LIMIT 5; 22 | SELECT COUNT(1) FROM cdx LEFT JOIN file_meta ON cdx.sha1hex = file_meta.sha1hex WHERE cdx.mimetype = 'text/html' AND file_meta.sha256hex IS NULL; 23 | => 24547552 24 | 25 | DELETE FROM cdx 26 | WHERE sha1hex IN 27 | (SELECT cdx.sha1hex 28 | FROM cdx 29 | LEFT JOIN file_meta ON file_meta.sha1hex = cdx.sha1hex 30 | WHERE cdx.mimetype = 'text/html' AND file_meta.sha256hex IS NULL); 31 | => DELETE 24553428 32 | 33 | Slightly more... probably should have had a "AND cdx.mimetype = 'text/html'" in 34 | the DELETE WHERE clause. 35 | -------------------------------------------------------------------------------- /scalding/src/main/scala/sandcrawler/HBaseCountJob.scala: -------------------------------------------------------------------------------- 1 | package sandcrawler 2 | 3 | import java.util.Properties 4 | 5 | import cascading.property.AppProps 6 | import cascading.tuple.Fields 7 | import com.twitter.scalding._ 8 | import parallelai.spyglass.base.JobBase 9 | import parallelai.spyglass.hbase.HBaseConstants.SourceMode 10 | import parallelai.spyglass.hbase.HBasePipeConversions 11 | import parallelai.spyglass.hbase.HBaseSource 12 | 13 | class HBaseCountJob(args: Args, colSpec: String) extends JobBase(args) with HBasePipeConversions { 14 | val output = args("output") 15 | HBaseBuilder.parseColSpec(colSpec) 16 | val Col: String = colSpec.split(":")(1) 17 | 18 | HBaseCountJob.getHBaseSource( 19 | args("hbase-table"), 20 | args("zookeeper-hosts"), 21 | colSpec) 22 | .read 23 | .fromBytesWritable(Symbol(Col)) 24 | .debug 25 | .groupBy(Col){group => group.size('count)} 26 | .write(Tsv(output)) 27 | } 28 | 29 | object HBaseCountJob { 30 | def getHBaseSource(hbaseTable: String, zookeeperHosts: String, colSpec: String) : HBaseSource = HBaseBuilder.build( 31 | hbaseTable, // HBase Table Name 32 | zookeeperHosts, // HBase Zookeeper server (to get runtime config info; can be array?) 33 | List(colSpec), 34 | SourceMode.SCAN_ALL) 35 | } 36 | -------------------------------------------------------------------------------- /sql/migrations/00000000000000_diesel_initial_setup/up.sql: -------------------------------------------------------------------------------- 1 | -- This file was automatically created by Diesel to setup helper functions 2 | -- and other internal bookkeeping. This file is safe to edit, any future 3 | -- changes will be added to existing projects as new migrations. 
4 | 5 | 6 | 7 | 8 | -- Sets up a trigger for the given table to automatically set a column called 9 | -- `updated_at` whenever the row is modified (unless `updated_at` was included 10 | -- in the modified columns) 11 | -- 12 | -- # Example 13 | -- 14 | -- ```sql 15 | -- CREATE TABLE users (id SERIAL PRIMARY KEY, updated_at TIMESTAMP NOT NULL DEFAULT NOW()); 16 | -- 17 | -- SELECT diesel_manage_updated_at('users'); 18 | -- ``` 19 | CREATE OR REPLACE FUNCTION diesel_manage_updated_at(_tbl regclass) RETURNS VOID AS $$ 20 | BEGIN 21 | EXECUTE format('CREATE TRIGGER set_updated_at BEFORE UPDATE ON %s 22 | FOR EACH ROW EXECUTE PROCEDURE diesel_set_updated_at()', _tbl); 23 | END; 24 | $$ LANGUAGE plpgsql; 25 | 26 | CREATE OR REPLACE FUNCTION diesel_set_updated_at() RETURNS trigger AS $$ 27 | BEGIN 28 | IF ( 29 | NEW IS DISTINCT FROM OLD AND 30 | NEW.updated_at IS NOT DISTINCT FROM OLD.updated_at 31 | ) THEN 32 | NEW.updated_at := current_timestamp; 33 | END IF; 34 | RETURN NEW; 35 | END; 36 | $$ LANGUAGE plpgsql; 37 | -------------------------------------------------------------------------------- /scalding/src/main/scala/sandcrawler/MatchBenchmarkJob.scala: -------------------------------------------------------------------------------- 1 | package sandcrawler 2 | 3 | import cascading.pipe.Pipe 4 | import com.twitter.scalding.Args 5 | import com.twitter.scalding.TypedPipe 6 | import com.twitter.scalding.TypedTsv 7 | import parallelai.spyglass.base.JobBase 8 | 9 | class MatchBenchmarkJob(args: Args) extends JobBase(args) { 10 | // TODO: Instantiate any subclass of Scorable specified in args. 11 | val sc1 : Scorable = new BibjsonScorable() 12 | val sc2 : Scorable = new BibjsonScorable() 13 | val leftArgs = args + ("bibjson-input" -> List(args("left-bibjson"))) 14 | val rightArgs = args + ("bibjson-input" -> List(args("right-bibjson"))) 15 | val pipe1 : TypedPipe[(String, ReduceFeatures)] = sc1.getInputPipe(leftArgs) 16 | val pipe2 : TypedPipe[(String, ReduceFeatures)] = sc2.getInputPipe(rightArgs) 17 | 18 | pipe1.join(pipe2) 19 | .map { entry => 20 | val (slug : String, (features1 : ReduceFeatures, features2 : ReduceFeatures)) = entry 21 | new ReduceOutput( 22 | slug, 23 | Scorable.computeSimilarity(features1, features2), 24 | features1.json, 25 | features2.json) 26 | } 27 | //TypedTsv doesn't work over case classes. 
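    // so unpack the ReduceOutput case class into a plain tuple of its fields before writing.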
28 | .map { entry => (entry.slug, entry.score, entry.json1, entry.json2) } 29 | .write(TypedTsv[(String, Int, String, String)](args("output"))) 30 | } 31 | -------------------------------------------------------------------------------- /scalding/src/main/scala/example/SimpleHBaseSourceExample.scala: -------------------------------------------------------------------------------- 1 | package example 2 | 3 | import com.twitter.scalding.{Tsv, Args} 4 | import parallelai.spyglass.base.JobBase 5 | import org.apache.log4j.{Level, Logger} 6 | import parallelai.spyglass.hbase.{HBasePipeConversions, HBaseSource} 7 | import parallelai.spyglass.hbase.HBaseConstants.SourceMode 8 | import cascading.tuple.Fields 9 | import cascading.property.AppProps 10 | import java.util.Properties 11 | 12 | /** 13 | * Simple example of HBaseSource usage 14 | */ 15 | class SimpleHBaseSourceExample(args: Args) extends JobBase(args) with HBasePipeConversions { 16 | 17 | val isDebug: Boolean = args("debug").toBoolean 18 | 19 | if (isDebug) Logger.getRootLogger.setLevel(Level.DEBUG) 20 | 21 | val output = args("output") 22 | 23 | val hbs = new HBaseSource( 24 | "table_name", 25 | //"quorum_name:2181", 26 | "mtrcs-zk1.us.archive.org:2181", // HBase Zookeeper server (to get runtime config info; can be array?) 27 | new Fields("key"), 28 | List("column_family"), 29 | List(new Fields("column_name1", "column_name2")), 30 | sourceMode = SourceMode.GET_LIST, keyList = List("1", "2", "3")) 31 | .read 32 | .debug 33 | .fromBytesWritable(new Fields("key", "column_name1", "column_name2")) 34 | .write(Tsv(output format "get_list")) 35 | 36 | } 37 | -------------------------------------------------------------------------------- /scalding/src/test/scala/example/WordCountTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2012 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | package example 17 | 18 | import com.twitter.scalding.{ JobTest, TextLine, TypedTsv } 19 | import org.scalatest.{ Matchers, WordSpec } 20 | 21 | class WordCountTest extends WordSpec with Matchers { 22 | "A WordCount job" should { 23 | JobTest(new example.WordCountJob(_)) 24 | .arg("input", "inputFile") 25 | .arg("output", "outputFile") 26 | .source(TextLine("inputFile"), List((0, "hack hack hack and hack"))) 27 | .sink[(String, Int)](TypedTsv[(String, Long)]("outputFile")){ outputBuffer => 28 | val outMap = outputBuffer.toMap 29 | "count words correctly" in { 30 | outMap("hack") shouldBe 4 31 | outMap("and") shouldBe 1 32 | } 33 | } 34 | .run 35 | .finish() 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /pig/filter-cdx-tarball.pig: -------------------------------------------------------------------------------- 1 | 2 | -- Tries to filter down a large CDX file (GWB index) to a subset of tarballs 3 | -- (.tar.gz). Intention is to find software code that isn't in, eg, git. 
4 | -- 5 | -- Author: Bryan Newbold 6 | -- Date: May 2018 7 | 8 | 9 | %default INPUT '' 10 | %default OUTPUT '' 11 | 12 | set mapreduce.job.queuename default 13 | 14 | cdx = LOAD '$INPUT' AS cdxline:chararray; 15 | cdx = FILTER cdx BY not STARTSWITH (cdxline, 'filedesc'); 16 | cdx = FILTER cdx BY not STARTSWITH (cdxline, ' '); 17 | 18 | cdx = FOREACH cdx GENERATE STRSPLIT(cdxline,'\\s+') as cols, cdxline; 19 | cdx = FOREACH cdx GENERATE (chararray)cols.$0 as surt, (chararray)cols.$1 as timestamp, (chararray)cols.$3 as mimetype, (chararray)cols.$4 as httpstatus, (chararray)cols.$5 as sha1sum, cdxline; 20 | cdx = FILTER cdx BY not surt matches '-'; 21 | cdx = FILTER cdx BY httpstatus matches '200'; 22 | cdx = FILTER cdx BY mimetype matches '.*(octet|gzip|gtar|tgz).*'; 23 | 24 | -- This is the core regex 25 | cdx = FILTER cdx 26 | -- .tar.gz in URL 27 | BY surt matches '(?i).+\\).*\\.tar\\.gz.*'; 28 | 29 | -- DISTINCT by sha1 column 30 | cdx_uniq = FOREACH (GROUP cdx BY sha1sum) { 31 | r = TOP(1, 0, $1); 32 | GENERATE FLATTEN(r); 33 | }; 34 | 35 | cdx_uniq = ORDER cdx_uniq by surt, timestamp PARALLEL 50; 36 | cdx_uniq = FOREACH cdx_uniq GENERATE cdxline; 37 | STORE cdx_uniq INTO '$OUTPUT' USING PigStorage(' '); 38 | 39 | -------------------------------------------------------------------------------- /python/sandcrawler/__init__.py: -------------------------------------------------------------------------------- 1 | from .db import SandcrawlerPostgresClient, SandcrawlerPostgrestClient 2 | from .grobid import GrobidBlobWorker, GrobidClient, GrobidWorker 3 | from .ia import ( 4 | CdxApiClient, 5 | CdxApiError, 6 | CdxPartial, 7 | CdxRow, 8 | PetaboxError, 9 | ResourceResult, 10 | SavePageNowBackoffError, 11 | SavePageNowClient, 12 | SavePageNowError, 13 | WarcResource, 14 | WaybackClient, 15 | WaybackContentError, 16 | WaybackError, 17 | ) 18 | from .ingest_file import IngestFileWorker 19 | from .ingest_fileset import IngestFilesetWorker 20 | from .misc import ( 21 | b32_hex, 22 | clean_url, 23 | gen_file_metadata, 24 | gen_file_metadata_path, 25 | parse_cdx_datetime, 26 | parse_cdx_line, 27 | ) 28 | from .pdfextract import PdfExtractBlobWorker, PdfExtractWorker 29 | from .pdftrio import PdfTrioBlobWorker, PdfTrioClient, PdfTrioWorker 30 | from .persist import ( 31 | PersistCdxWorker, 32 | PersistGrobidDiskWorker, 33 | PersistGrobidWorker, 34 | PersistIngestFileResultWorker, 35 | PersistIngestRequestWorker, 36 | PersistPdfTextWorker, 37 | PersistPdfTrioWorker, 38 | PersistThumbnailWorker, 39 | ) 40 | from .workers import ( 41 | BlackholeSink, 42 | CdxLinePusher, 43 | JsonLinePusher, 44 | KafkaCompressSink, 45 | KafkaJsonPusher, 46 | KafkaSink, 47 | MultiprocessWrapper, 48 | ZipfilePusher, 49 | ) 50 | -------------------------------------------------------------------------------- /sql/stats/2021-12-02_table_sizes.txt: -------------------------------------------------------------------------------- 1 | 2 | Size: 940.66G 3 | 4 | table_name | table_size | indexes_size | total_size 5 | ------------------------------------+------------+--------------+------------ 6 | "public"."crossref" | 394 GB | 10138 MB | 404 GB 7 | "public"."ingest_request" | 44 GB | 41 GB | 85 GB 8 | "public"."cdx" | 52 GB | 28 GB | 80 GB 9 | "public"."grobid" | 72 GB | 6978 MB | 79 GB 10 | "public"."ingest_file_result" | 38 GB | 41 GB | 78 GB 11 | "public"."grobid_shadow" | 67 GB | 5455 MB | 73 GB 12 | "public"."file_meta" | 34 GB | 21 GB | 55 GB 13 | "public"."pdf_meta" | 20 GB | 5930 MB | 26 GB 14 | "public"."grobid_refs" | 
19 GB | 1752 MB | 21 GB 15 | "public"."fatcat_file" | 13 GB | 7314 MB | 20 GB 16 | "public"."shadow" | 9517 MB | 8026 MB | 17 GB 17 | "public"."html_meta" | 1200 MB | 8072 kB | 1208 MB 18 | "public"."petabox" | 403 MB | 461 MB | 864 MB 19 | "public"."pdftrio" | 550 MB | 297 MB | 847 MB 20 | "public"."ingest_fileset_platform" | 8192 bytes | 16 kB | 24 kB 21 | "public"."crossref_with_refs" | 0 bytes | 0 bytes | 0 bytes 22 | (16 rows) 23 | -------------------------------------------------------------------------------- /sql/stats/2020-01-31_supplement.txt: -------------------------------------------------------------------------------- 1 | 2 | How many file_meta still missing core metadata? 3 | 4 | SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL; 5 | => 1,130,915 6 | 7 | Great! Not many. 8 | 9 | And are in petabox? 10 | 11 | SELECT COUNT(*) 12 | FROM file_meta 13 | LEFT JOIN petabox ON file_meta.sha1hex = petabox.sha1hex 14 | WHERE file_meta.sha256hex IS NULL 15 | AND file_meta.sha1hex IS NOT NULL; 16 | => 1,149,194 17 | 18 | Almost all; maybe just some CDX fetch failures or something in there. So, 19 | should run these on, eg, grobid2-vm. 20 | 21 | COPY ( 22 | SELECT row_to_json(petabox.*) 23 | FROM file_meta 24 | LEFT JOIN petabox ON file_meta.sha1hex = petabox.sha1hex 25 | WHERE file_meta.sha256hex IS NULL 26 | AND file_meta.sha1hex IS NOT NULL 27 | ) TO '/grande/snapshots/dump_grobid_petabox_todo.json'; 28 | 29 | Count of PDF files that GROBID processed and matched to a release (via 30 | glutton), but no PDF in `fatcat_file` (note: `fatcat_file` is out of date by a 31 | couple million files): 32 | 33 | SELECT COUNT(*) as total_count, COUNT(DISTINCT grobid.fatcat_release) as release_count 34 | FROM grobid 35 | LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex 36 | WHERE fatcat_file.sha1hex IS NULL 37 | AND grobid.fatcat_release IS NOT NULL; 38 | 39 | total_count | count 40 | -------------+--------- 41 | 5072452 | 4130405 42 | 43 | -------------------------------------------------------------------------------- /sql/stats/2022-11-23_table_sizes.txt: -------------------------------------------------------------------------------- 1 | PostgreSQL 13.2 - wbgrp-svc506.us.archive.org 2 | Size: 1.13T 3 | 4 | table_name | table_size | indexes_size | total_size 5 | ------------------------------------+------------+--------------+------------ 6 | "public"."crossref" | 459 GB | 10 GB | 470 GB 7 | "public"."grobid" | 98 GB | 13 GB | 112 GB 8 | "public"."cdx" | 63 GB | 45 GB | 108 GB 9 | "public"."ingest_request" | 53 GB | 52 GB | 105 GB 10 | "public"."ingest_file_result" | 46 GB | 55 GB | 100 GB 11 | "public"."file_meta" | 39 GB | 40 GB | 79 GB 12 | "public"."grobid_shadow" | 67 GB | 5455 MB | 73 GB 13 | "public"."pdf_meta" | 24 GB | 7466 MB | 31 GB 14 | "public"."grobid_refs" | 28 GB | 3306 MB | 31 GB 15 | "public"."fatcat_file" | 13 GB | 7314 MB | 20 GB 16 | "public"."shadow" | 9517 MB | 8026 MB | 17 GB 17 | "public"."html_meta" | 7879 MB | 68 MB | 7947 MB 18 | "public"."petabox" | 403 MB | 461 MB | 864 MB 19 | "public"."pdftrio" | 550 MB | 297 MB | 847 MB 20 | "public"."ingest_fileset_platform" | 8192 bytes | 16 kB | 24 kB 21 | "public"."crossref_with_refs" | 0 bytes | 0 bytes | 0 bytes 22 | -------------------------------------------------------------------------------- /proposals/brainstorm/2022-04-18_automated_heritrix_crawling.md: -------------------------------------------------------------------------------- 1 | 2 | status: brainstorming 3 | 4 | We continue to see issues 
with heritrix3-based crawling. Would like to have an 5 | option to switch to higher-throughput heritrix-based crawling. 6 | 7 | SPNv2 path would stick around at least for save-paper-now style ingest. 8 | 9 | 10 | ## Sketch 11 | 12 | Ingest requests are created continuously by fatcat, with daily spikes. 13 | 14 | Ingest workers run mostly in "bulk" mode, aka they don't make SPNv2 calls. 15 | `no-capture` responses are recorded in sandcrawler SQL database. 16 | 17 | Periodically (daily?), a script queries for new no-capture results, filtered to 18 | the most recent period. These are processed in a bit in to a URL list, then 19 | converted to a heritrix frontier, and sent to crawlers. This could either be an 20 | h3 instance (?), or simple `scp` to a running crawl directory. 21 | 22 | The crawler crawls, with usual landing page config, and draintasker runs. 23 | 24 | TODO: can we have draintasker/heritrix set a maximum WARC life? Like 6 hours? 25 | or, target a smaller draintasker item size, so they get updated more frequently 26 | 27 | Another SQL script dumps ingest requests from the *previous* period, and 28 | re-submits them for bulk-style ingest (by workers). 29 | 30 | The end result would be things getting crawled and updated within a couple 31 | days. 32 | 33 | 34 | ## Sketch 2 35 | 36 | Upload URL list to petabox item, wait for heritrix derive to run (!) 37 | -------------------------------------------------------------------------------- /pig/filter-cdx-source-code-crude.pig: -------------------------------------------------------------------------------- 1 | 2 | -- Tries to filter down a large CDX file (GWB index) to a subset of source code 3 | -- files by mimetype and file extension. 4 | -- This is pretty crude and requires the URL to end with the file extension. 5 | --- 6 | -- Author: Bryan Newbold 7 | -- Date: October 2019 8 | 9 | 10 | %default INPUT '' 11 | %default OUTPUT '' 12 | 13 | set mapreduce.job.queuename default 14 | 15 | cdx = LOAD '$INPUT' AS cdxline:chararray; 16 | cdx = FILTER cdx BY not STARTSWITH (cdxline, 'filedesc'); 17 | cdx = FILTER cdx BY not STARTSWITH (cdxline, ' '); 18 | 19 | cdx = FOREACH cdx GENERATE STRSPLIT(cdxline,'\\s+') as cols, cdxline; 20 | cdx = FOREACH cdx GENERATE (chararray)cols.$0 as surt, (chararray)cols.$1 as timestamp, (chararray)cols.$3 as mimetype, (chararray)cols.$4 as httpstatus, (chararray)cols.$5 as sha1sum, cdxline; 21 | cdx = FILTER cdx BY not surt matches '-'; 22 | cdx = FILTER cdx BY httpstatus matches '200'; 23 | cdx = FILTER cdx BY mimetype matches '.*text.*'; 24 | 25 | -- This is the core regex 26 | cdx = FILTER cdx 27 | 28 | -- file suffix 29 | BY surt matches '.*\\).*\\.(c|h|py|java)'; 30 | 31 | -- DISTINCT by sha1 column 32 | cdx_uniq = FOREACH (GROUP cdx BY sha1sum) { 33 | r = TOP(1, 0, $1); 34 | GENERATE FLATTEN(r); 35 | }; 36 | 37 | cdx_uniq = ORDER cdx_uniq by surt, timestamp PARALLEL 50; 38 | cdx_uniq = FOREACH cdx_uniq GENERATE cdxline; 39 | STORE cdx_uniq INTO '$OUTPUT' USING PigStorage(' '); 40 | 41 | -------------------------------------------------------------------------------- /notes/match_filter_enrich.txt: -------------------------------------------------------------------------------- 1 | 2 | This could all be a single scalding job eventually. 3 | 4 | First, run matchcrossref and dumpfilemeta, and copy the output down to an SSD 5 | somewhere. 
6 | 7 | bnewbold@ia601101$ zcat 2018-09-14-0559.05-dumpfilemeta.tsv.gz | wc -l 8 | 30728100 9 | 10 | Reduce down the scored matches to just {sha1, dois}, sorted: 11 | 12 | zcat 2018-08-27-2352.17-matchcrossref.tsv.gz | ./filter_scored_matches.py | pv -l | sort -S 8G > 2018-08-27-2352.17-matchcrossref.filtered.tsv 13 | # 5.79M 0:18:54 [5.11k/s] 14 | 15 | Join/merge the output: 16 | 17 | zcat 2018-09-14-0559.05-dumpfilemeta.tsv.gz | LC_ALL=C join -t$'\t' 2018-08-27-2352.17-matchcrossref.filtered.tsv - | pv -l | ./enrich_scored_matches.py | gzip > 2018-08-27-2352.17-matchcrossref.insertable.json.gz 18 | # 5.79M 0:09:09 [10.5k/s] 19 | 20 | ## Fatcat Insertable 21 | 22 | I can't remember now what the plan was for the 'insertable' output mode, which 23 | bundles {key, cdx, mime, and size} info along with the {slug, score, json1, 24 | json2} columns from the regular match script. The filter_scored_matches.py 25 | doesn't know what to do with those columns at the moment, and the output isn't 26 | sorted by slug... need to tweak scripts to fix this. 27 | 28 | In the meanwhile, as a work around just take the columns we want and re-sort: 29 | 30 | export LC_ALL=C 31 | zcat 2018-12-18-2237.09-matchcrossref.insertable.tsv.gz | cut -f2-5 | sort -S 8G -u | gzip > 2018-12-18-2237.09-matchcrossref.tsv.gz 32 | -------------------------------------------------------------------------------- /scalding/src/main/scala/sandcrawler/DumpGrobidStatusCodeJob.scala: -------------------------------------------------------------------------------- 1 | package sandcrawler 2 | 3 | import java.util.Properties 4 | 5 | import cascading.property.AppProps 6 | import cascading.tuple.Fields 7 | import com.twitter.scalding._ 8 | import com.twitter.scalding.typed.TDsl._ 9 | import org.apache.hadoop.hbase.io.ImmutableBytesWritable 10 | import org.apache.hadoop.hbase.util.Bytes 11 | import parallelai.spyglass.base.JobBase 12 | import parallelai.spyglass.hbase.HBaseConstants.SourceMode 13 | import parallelai.spyglass.hbase.HBasePipeConversions 14 | import parallelai.spyglass.hbase.HBaseSource 15 | 16 | // Dumps status code for each GROBID-processed file. Good for crawl/corpus 17 | // analytics, if we consider GROBID status a rough "is this a paper" metric. 
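// Output is a TSV of (row key, GROBID status code).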
18 | class DumpGrobidStatusCodeJob(args: Args) extends JobBase(args) with HBasePipeConversions { 19 | 20 | val metaPipe : TypedPipe[(String, Long)] = HBaseBuilder.build(args("hbase-table"), 21 | args("zookeeper-hosts"), 22 | List("grobid0:status_code"), 23 | SourceMode.SCAN_ALL) 24 | .read 25 | .toTypedPipe[(ImmutableBytesWritable,ImmutableBytesWritable)](new Fields("key", "status_code")) 26 | .filter { case (_, status_code) => status_code != null } 27 | .map { case (key, status_code) => 28 | (Bytes.toString(key.copyBytes()), 29 | Bytes.toLong(status_code.copyBytes())) 30 | }; 31 | 32 | metaPipe.write(TypedTsv[(String,Long)](args("output"))) 33 | 34 | } 35 | -------------------------------------------------------------------------------- /pig/tests/files/papers_edu_tilde.cdx: -------------------------------------------------------------------------------- 1 | #http://www.stanford.edu:80/~johntayl/Papers/taylor2.pdf 2 | #http://met.nps.edu/~mtmontgo/papers/isabel_part2.pdf 3 | #http://www.pitt.edu:80/~druzdzel/psfiles/ecai06.pdf 4 | #http://www.comp.hkbu.edu.hk/~ymc/papers/conference/ijcnn03_710.pdf 5 | 6 | # should be 6 matches: 7 | hk,edu,hkbu,comp)/~ymc/papers/conference/ijcnn03_710.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 LQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz 8 | edu,stanford,www)/~johntayl/Papers/taylor2.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 XQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz 9 | edu,nps,met)/~mtmontgo/papers/isabel_part2.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 PQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz 10 | edu,pitt,www)/~druzdzel/psfiles/ecai06.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 9QHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz 11 | jp,ac,pitt,www)/~druzdzel/psfiles/ecai06.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 8QHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz 12 | co,edu,pitt,www)/~druzdzel/psfiles/ecai06.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 7QHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz 13 | 14 | # NOT: 15 | com,corp,edu,pitt,www)/~druzdzel/psfiles/ecai06.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 6QHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz 16 | -------------------------------------------------------------------------------- /fetch_hadoop.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # This script was originally only for pig scripts; now it can also be used to 4 | # run scalding code locally (via please) 5 | 6 | set -euo pipefail 7 | 8 | #PIG_VERSION="0.12.0-cdh5.2.0" 9 | # Using more recent version to work around snappy classpath problem 10 | PIG_VERSION="0.17.0" 11 | HADOOP_VERSION="2.3.0-cdh5.0.1" 12 | 13 | mkdir -p pig/deps/ 14 | cd pig/deps/ 15 | 16 | # Fetch Hadoop Command 17 | echo https://archive.cloudera.com/cdh5/cdh/5/hadoop-${HADOOP_VERSION}.tar.gz 18 | #wget -c https://archive.cloudera.com/cdh5/cdh/5/pig-${HADOOP_VERSION}.tar.gz 19 | #wget -c https://archive.cloudera.com/cdh5/cdh/5/pig-${HADOOP_VERSION}.tar.gz 20 | wget -c https://archive.org/serve/hadoop_pig_mirror/hadoop-${HADOOP_VERSION}.tar.gz 21 | echo "Extracting Hadoop (takes a minute)..." 
22 | tar xvf hadoop-${HADOOP_VERSION}.tar.gz > /dev/null 23 | ln -fs hadoop-${HADOOP_VERSION} hadoop 24 | 25 | # Fetch Pig 26 | #wget -c https://archive.cloudera.com/cdh5/cdh/5/pig-${PIG_VERSION}.tar.gz 27 | #wget -c http://mirror.metrocast.net/apache/pig/pig-${PIG_VERSION}/pig-${PIG_VERSION}.tar.gz 28 | wget -c https://archive.org/serve/hadoop_pig_mirror/pig-${PIG_VERSION}.tar.gz 29 | echo "Extracting Pig (takes a minute)..." 30 | tar xvf pig-${PIG_VERSION}.tar.gz > /dev/null 31 | ln -fs pig-${PIG_VERSION} pig 32 | 33 | # No 'readlink -f' on macOS 34 | # https://stackoverflow.com/a/24572274/4682349 35 | JAVA_HOME=$(perl -MCwd -e 'print Cwd::abs_path shift' /usr/bin/java | sed "s:bin/java::") 36 | ./pig/bin/pig -x local -version 37 | ./hadoop/bin/hadoop version 38 | 39 | -------------------------------------------------------------------------------- /python/Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | name = "ia" 3 | url = "https://devpi.us.archive.org/wb/prod" 4 | verify_ssl = true 5 | 6 | [[source]] 7 | name = "pypi" 8 | url = "https://pypi.python.org/simple" 9 | verify_ssl = true 10 | 11 | [dev-packages] 12 | pytest = ">=4" 13 | pytest-pythonpath = "*" 14 | pytest-pylint = "*" 15 | responses = ">=0.10" 16 | pytest-cov = "*" 17 | pytest-mock = "*" 18 | pylint = "*" 19 | ipython = "*" 20 | mypy = "*" 21 | flake8 = "*" 22 | flake8-annotations = "*" 23 | isort = "*" 24 | types-requests = "*" 25 | types-beautifulsoup4 = "*" 26 | types-dateparser = "*" 27 | types-psycopg2 = "*" 28 | types-Pillow = "*" 29 | black = "*" 30 | 31 | [packages] 32 | requests = ">=2" 33 | confluent-kafka = "*" 34 | python-snappy = "*" 35 | boto3 = "*" 36 | minio = "<7.0.0" 37 | psycopg2 = "*" 38 | bs4 = "*" 39 | python-magic = "*" 40 | ftfy = "*" 41 | internetarchive = "*" 42 | urlcanon = "*" 43 | Pillow = ">=3" 44 | python-poppler = ">=0.2.1" 45 | selectolax = ">=0.2" 46 | # constraining trafilatura to prevent a version conflict with 47 | # `charset_normalizer`, between htmldate and requests 48 | trafilatura = ">=1,<1.4" 49 | htmldate= ">=1,<1.4" 50 | pydantic = ">=1.7" 51 | dateparser = "*" 52 | braveblock = "*" 53 | dynaconf = ">=3" 54 | sentry-sdk = { version = ">=0.14.0", extras = [] } 55 | zstandard = "*" 56 | grobid_tei_xml = ">=0.1.2,<0.2.0" 57 | PyMuPDF = ">=1.19.0,<1.20.0" 58 | 59 | [requires] 60 | python_version = "3.8" 61 | 62 | [packages.globalwayback] 63 | version = ">=0.6.5" 64 | index = "ia" 65 | 66 | [packages.wayback] 67 | version = ">=0.6.3" 68 | index = "ia" 69 | -------------------------------------------------------------------------------- /notes/old_extract_results.txt: -------------------------------------------------------------------------------- 1 | 2 | command: 3 | 4 | ./extraction_cdx_grobid.py --hbase-table wbgrp-journal-extract-0-qa --hbase-host bnewbold-dev.us.archive.org --grobid-uri http://wbgrp-svc096.us.archive.org:8070 -r hadoop -c mrjob.conf --archive $VENVSHORT.tar.gz#venv hdfs:///user/bnewbold/journal_crawl_cdx/citeseerx_crawl_2017.cdx --jobconf mapred.line.input.format.linespermap=8000 --jobconf mapreduce.job.queuename=extraction 5 | 6 | Started: Wed Apr 11 05:54:54 UTC 2018 7 | Finished: Sun Apr 15 20:42:37 UTC 2018 8 | (late saturday night PST fixed grobid parallelism) 9 | 10 | Elapsed: 110hrs, 47mins, 42sec 11 | 12 | line counts: 13 | error 3896 14 | existing 311209 15 | invalid 2311343 16 | skip 195641 17 | success 1143094 18 | total 3,965,183 19 | 20 | ## Against prod table 21 | 22 | Started: Sun Apr 15 
21:38:24 UTC 2018 23 | Finished: Wed Apr 18 17:36:44 UTC 2018 24 | Elapsed: 67hrs, 58mins, 20sec 25 | 26 | lines 27 | error 143 28 | existing 213292 29 | invalid 2311343 30 | skip 195641 31 | success 1,244,764 32 | total 3,965,183 33 | 34 | ## TARGETED 35 | 36 | Job job_1513499322977_358533 failed with state FAILED due to: Task failed task_1513499322977_358533_m_000323 37 | 38 | Started: Thu Apr 19 05:21:25 UTC 2018 39 | Finished: Sat Apr 21 11:01:58 UTC 2018 40 | Elapsed: 53hrs, 40mins, 33sec 41 | 42 | lines 43 | error=4093 44 | existing=55448 45 | invalid=688873 46 | skip=257533 47 | success=1,282,053 48 | total=2,288,000 49 | 50 | 51 | -------------------------------------------------------------------------------- /python_hadoop/tests/files/small.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Dummy Example File", 3 | "authors": [ 4 | {"name": "Brewster Kahle", "given_name": "Brewster", "surname": "Kahle"}, 5 | {"name": "J Doe", "given_name": "J", "surname": "Doe"} 6 | ], 7 | "journal": { 8 | "name": "Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678", 9 | "eissn": null, 10 | "issn": null, 11 | "issue": null, 12 | "publisher": null, 13 | "volume": null 14 | }, 15 | "date": "2000", 16 | "doi": null, 17 | "citations": [ 18 | { "authors": [{"name": "A Seaperson", "given_name": "A", "surname": "Seaperson"}], 19 | "date": "2001", 20 | "id": "b0", 21 | "index": 0, 22 | "issue": null, 23 | "journal": "Letters in the Alphabet", 24 | "publisher": null, 25 | "title": "Everything is Wonderful", 26 | "url": null, 27 | "volume": "20"}, 28 | { "authors": [], 29 | "date": "2011-03-28", 30 | "id": "b1", 31 | "index": 1, 32 | "issue": null, 33 | "journal": "The Dictionary", 34 | "publisher": null, 35 | "title": "All about Facts", 36 | "url": null, 37 | "volume": "14"} 38 | ], 39 | "abstract": "Everything you ever wanted to know about nothing", 40 | "body": "Introduction \nEverything starts somewhere, as somebody [1] once said. \n\n In Depth \n Meat \nYou know, for kids. 
\n Potatos \nQED.", 41 | "acknowledgement": null, 42 | "annex": null, 43 | "fatcat_release": null, 44 | "grobid_timestamp": "2018-04-02T00:31+0000", 45 | "grobid_version": "0.5.1-SNAPSHOT" 46 | } 47 | -------------------------------------------------------------------------------- /sql/Makefile: -------------------------------------------------------------------------------- 1 | 2 | SHELL=/bin/bash -euo pipefail 3 | TODAY ?= $(shell date --iso --utc) 4 | DATADIR ?= /srv/sandcrawler/tasks/$(TODAY) 5 | DATESLUG ?= $(shell date +%Y-%m-%d.%H%M%S) 6 | DATABASE_URL ?= sandcrawler 7 | 8 | .PHONY: help 9 | help: ## Print info about all commands 10 | @echo "Commands:" 11 | @echo 12 | @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf " \033[01;32m%-20s\033[0m %s\n", $$1, $$2}' 13 | 14 | .PHONY: create_datadir 15 | create_datadir: 16 | mkdir -p $(DATADIR)/ 17 | sudo chmod a+rw $(DATADIR)/ 18 | 19 | $(DATADIR)/.DB_DUMP: 20 | sudo -u postgres pg_dump --verbose --format=custom --exclude-table-data=crossref sandcrawler > $(DATADIR)/sandcrawler_${DATESLUG}.pgdump.wip 21 | mv $(DATADIR)/sandcrawler_${DATESLUG}.pgdump.wip $(DATADIR)/sandcrawler_${DATESLUG}.pgdump 22 | touch $@ 23 | 24 | .PHONY: database-snapshot 25 | database-snapshot: create_datadir $(DATADIR)/.DB_DUMP ## Create SQL database snapshot 26 | @echo 27 | 28 | $(DATADIR)/.DB_UPLOADED: $(DATADIR)/.DB_DUMP 29 | ia upload --checksum sandcrawler_sqldump_$(TODAY) ia_sqldump_item_readme.md --remote-name=README.md -m collection:webgroup-internal-backups -m mediatype:data -m creator:"Internet Archive Web Group" -m date:$(TODAY) -m title:"Sandcrawler SQL Database Snapshot ($(TODAY))" 30 | ia upload --checksum sandcrawler_sqldump_$(TODAY) $(DATADIR)/sandcrawler_*.pgdump 31 | touch $@ 32 | 33 | .PHONY: upload-database-snapshot 34 | upload-database-snapshot: create_datadir database-snapshot $(DATADIR)/.DB_UPLOADED ## Upload database snapshot to archive.org 35 | @echo 36 | -------------------------------------------------------------------------------- /extra/hbase/howto.md: -------------------------------------------------------------------------------- 1 | 2 | Commands can be run from any cluster machine with hadoop environment config 3 | set up. Most of these commands are run from the shell (start with `hbase 4 | shell`). There is only one AIT/Webgroup HBase instance/namespace; there may be 5 | QA/prod tables, but there are not QA/prod clusters. 6 | 7 | ## Create Table 8 | 9 | Create column families (note: not all individual columns) with something like: 10 | 11 | create 'wbgrp-journal-extract-0-qa', 'f', 'file', {NAME => 'grobid0', COMPRESSION => 'snappy'} 12 | 13 | ## Run Thrift Server Informally 14 | 15 | The Thrift server can technically be run from any old cluster machine that has 16 | Hadoop client stuff set up, using: 17 | 18 | hbase thrift start -nonblocking -c 19 | 20 | Note that this will run version 0.96, while the actual HBase service seems to 21 | be running 0.98. 
22 | 23 | To interact with this config, use happybase (python) config: 24 | 25 | conn = happybase.Connection("bnewbold-dev.us.archive.org", transport="framed", protocol="compact") 26 | # Test connection 27 | conn.tables() 28 | 29 | ## Queries From Shell 30 | 31 | Fetch all columns for a single row: 32 | 33 | hbase> get 'wbgrp-journal-extract-0-qa', 'sha1:3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ' 34 | 35 | Fetch multiple columns for a single row, using column families: 36 | 37 | hbase> get 'wbgrp-journal-extract-0-qa', 'sha1:3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ', 'f', 'file' 38 | 39 | Scan a fixed number of rows (here 5) starting at a specific key prefix, all 40 | columns: 41 | 42 | hbase> scan 'wbgrp-journal-extract-0-qa',{LIMIT=>5,STARTROW=>'sha1:A'} 43 | -------------------------------------------------------------------------------- /pig/filter-cdx-join-urls.pig: -------------------------------------------------------------------------------- 1 | 2 | -- 3 | -- Author: Bryan Newbold 4 | -- Date: May 2018 5 | 6 | %default INPUT_CDX '' 7 | %default INPUT_URLS '' 8 | %default OUTPUT '' 9 | 10 | REGISTER /home/webcrawl/pig-scripts/jars/ia-web-commons-jar-with-dependencies-CDH3.jar; 11 | REGISTER /home/webcrawl/pig-scripts/jars/pigtools.jar; 12 | DEFINE SURTURL pigtools.SurtUrlKey(); 13 | 14 | set mapreduce.job.queuename default 15 | 16 | urls = LOAD '$INPUT_URLS' USING PigStorage() AS url:chararray; 17 | surts = FOREACH urls GENERATE SURTURL(url) AS url_surt; 18 | surts = ORDER surts by url_surt ASC PARALLEL 10; 19 | surts = DISTINCT surts; 20 | 21 | cdx = LOAD '$INPUT_CDX' AS cdxline:chararray; 22 | cdx = FILTER cdx BY not STARTSWITH (cdxline, 'filedesc'); 23 | cdx = FILTER cdx BY not STARTSWITH (cdxline, ' '); 24 | 25 | cdx = FOREACH cdx GENERATE STRSPLIT(cdxline,'\\s+') as cols, cdxline; 26 | cdx = FOREACH cdx GENERATE (chararray)cols.$0 as cdx_surt, (chararray)cols.$1 as timestamp, (chararray)cols.$3 as mimetype, (chararray)cols.$4 as httpstatus, (chararray)cols.$5 as sha1sum, cdxline; 27 | cdx = FILTER cdx BY not cdx_surt matches '-'; 28 | cdx = FILTER cdx BY httpstatus matches '200'; 29 | cdx = FILTER cdx BY mimetype matches '.*pdf.*'; 30 | 31 | -- Core JOIN 32 | full_join = JOIN cdx BY cdx_surt, surts BY url_surt; 33 | 34 | -- DISTINCT by sha1 column 35 | full_uniq = FOREACH (GROUP full_join BY sha1sum) { 36 | r = TOP(1, 0, $1); 37 | GENERATE FLATTEN(r); 38 | }; 39 | 40 | result = FOREACH full_uniq GENERATE cdxline; 41 | result = DISTINCT result; 42 | 43 | STORE result INTO '$OUTPUT' USING PigStorage(); 44 | -------------------------------------------------------------------------------- /sql/dump_reingest_bulk.sql: -------------------------------------------------------------------------------- 1 | 2 | BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; 3 | 4 | COPY ( 5 | SELECT row_to_json(ingest_request.*) FROM ingest_request 6 | LEFT JOIN ingest_file_result ON 7 | ingest_file_result.base_url = ingest_request.base_url 8 | AND ingest_file_result.ingest_type = ingest_request.ingest_type 9 | WHERE 10 | (ingest_request.ingest_type = 'pdf' 11 | OR ingest_request.ingest_type = 'html') 12 | AND ingest_file_result.hit = false 13 | AND ingest_request.created < NOW() - '24 hour'::INTERVAL 14 | AND ingest_request.created > NOW() - '181 day'::INTERVAL 15 | AND (ingest_request.ingest_request_source = 'fatcat-changelog' 16 | OR ingest_request.ingest_request_source = 'fatcat-ingest') 17 | AND ( 18 | ingest_file_result.status like 'spn2-%' 19 | OR ingest_file_result.status like 'cdx-error' 20 
| OR ingest_file_result.status like 'petabox-error' 21 | ) 22 | AND ingest_file_result.status != 'spn2-error:invalid-url-syntax' 23 | AND ingest_file_result.status != 'spn2-error:filesize-limit' 24 | AND ingest_file_result.status != 'spn2-error:not-found' 25 | AND ingest_file_result.status != 'spn2-error:blocked-url' 26 | AND ingest_file_result.status != 'spn2-error:too-many-redirects' 27 | AND ingest_file_result.status != 'spn2-error:network-authentication-required' 28 | AND ingest_file_result.status != 'spn2-error:unknown' 29 | ) TO '/srv/sandcrawler/tasks/reingest_bulk_current.rows.json'; 30 | 31 | ROLLBACK; 32 | -------------------------------------------------------------------------------- /sql/dump_reingest_terminalstatus.sql: -------------------------------------------------------------------------------- 1 | 2 | BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; 3 | 4 | COPY ( 5 | SELECT row_to_json(ingest_request.*) FROM ingest_request 6 | LEFT JOIN ingest_file_result ON 7 | ingest_file_result.base_url = ingest_request.base_url 8 | AND ingest_file_result.ingest_type = ingest_request.ingest_type 9 | WHERE 10 | ingest_file_result.hit = false 11 | AND ingest_request.created < NOW() - '72 hour'::INTERVAL 12 | AND ingest_request.created > NOW() - '10 day'::INTERVAL 13 | AND (ingest_request.ingest_request_source = 'fatcat-changelog' 14 | OR ingest_request.ingest_request_source = 'fatcat-ingest') 15 | AND ingest_file_result.status = 'terminal-bad-status' 16 | AND ( 17 | ingest_file_result.terminal_status_code = 500 18 | OR ingest_file_result.terminal_status_code = 502 19 | OR ingest_file_result.terminal_status_code = 503 20 | OR ingest_file_result.terminal_status_code = 429 21 | OR ingest_file_result.terminal_status_code = 404 22 | ) 23 | AND ( 24 | ingest_request.base_url LIKE 'https://doi.org/10.3390/%' 25 | OR ingest_request.base_url LIKE 'https://doi.org/10.1103/%' 26 | OR ingest_request.base_url LIKE 'https://doi.org/10.1155/%' 27 | ) 28 | ) TO '/srv/sandcrawler/tasks/reingest_terminalstatus_current.rows.json'; 29 | 30 | -- bulk re-tries would be: 31 | -- AND (ingest_request.ingest_request_source != 'fatcat-changelog' 32 | -- AND ingest_request.ingest_request_source != 'fatcat-ingest') 33 | 34 | ROLLBACK; 35 | -------------------------------------------------------------------------------- /scalding/src/main/scala/sandcrawler/DumpFileMetaJob.scala: -------------------------------------------------------------------------------- 1 | package sandcrawler 2 | 3 | import java.util.Properties 4 | 5 | import cascading.property.AppProps 6 | import cascading.tuple.Fields 7 | import com.twitter.scalding._ 8 | import com.twitter.scalding.typed.TDsl._ 9 | import org.apache.hadoop.hbase.io.ImmutableBytesWritable 10 | import org.apache.hadoop.hbase.util.Bytes 11 | import parallelai.spyglass.base.JobBase 12 | import parallelai.spyglass.hbase.HBaseConstants.SourceMode 13 | import parallelai.spyglass.hbase.HBasePipeConversions 14 | import parallelai.spyglass.hbase.HBaseSource 15 | 16 | // Dumps all the info needed to insert a file entity in Fatcat. Useful for 17 | // joining. 
18 | class DumpFileMetaJob(args: Args) extends JobBase(args) with HBasePipeConversions { 19 | 20 | val metaPipe : TypedPipe[(String, String, String, Long)] = HBaseBuilder.build(args("hbase-table"), 21 | args("zookeeper-hosts"), 22 | List("file:cdx", "file:mime", "file:size"), 23 | SourceMode.SCAN_ALL) 24 | .read 25 | .toTypedPipe[(ImmutableBytesWritable,ImmutableBytesWritable,ImmutableBytesWritable,ImmutableBytesWritable)](new Fields("key", "cdx", "mime", "size")) 26 | .filter { case (_, cdx, mime, size) => cdx != null && mime != null && size != null } 27 | .map { case (key, cdx, mime, size) => 28 | (Bytes.toString(key.copyBytes()), 29 | Bytes.toString(cdx.copyBytes()), 30 | Bytes.toString(mime.copyBytes()), 31 | Bytes.toLong(size.copyBytes())) 32 | }; 33 | 34 | metaPipe.write(TypedTsv[(String,String,String,Long)](args("output"))) 35 | 36 | } 37 | -------------------------------------------------------------------------------- /scalding/src/test/scala/sandcrawler/HBaseBuilderTest.scala: -------------------------------------------------------------------------------- 1 | package sandcrawler 2 | 3 | import cascading.tuple.Fields 4 | import org.scalatest._ 5 | 6 | class HBaseBuilderTest extends FlatSpec with Matchers { 7 | "parseColSpecs()" should "work on legal nontrivial input" in { 8 | val (fams, fields) = HBaseBuilder.parseColSpecs(List("file:size", "file:cdx", "match0:status")) 9 | fams should have length 2 10 | fields should have length 2 11 | val fileIndex = fams.indexOf("file") 12 | fileIndex should not be -1 13 | fields(fileIndex) should be (new Fields("size", "cdx")) 14 | val match0Index = fams.indexOf("match0") 15 | match0Index should not be -1 16 | fields(match0Index) should be (new Fields("status")) 17 | } 18 | 19 | it should "work on empty input" in { 20 | val (fams, fields) = HBaseBuilder.parseColSpecs(List()) 21 | fams should have length 0 22 | fields should have length 0 23 | } 24 | 25 | //scalastyle:off no.whitespace.before.left.bracket 26 | it should "throw IllegalArgumentException on malformed input" in { 27 | a [IllegalArgumentException] should be thrownBy { 28 | HBaseBuilder.parseColSpecs(List("file_size")) 29 | } 30 | } 31 | 32 | it should "throw IllegalArgumentException on nonexistent family" in { 33 | a [IllegalArgumentException] should be thrownBy { 34 | HBaseBuilder.parseColSpecs(List("foo:bar")) 35 | } 36 | } 37 | 38 | it should "throw IllegalArgumentException on nonexistent column" in { 39 | a [IllegalArgumentException] should be thrownBy { 40 | HBaseBuilder.parseColSpecs(List("file:bar")) 41 | } 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /notes/ingest/2020-01-14_bulk.md: -------------------------------------------------------------------------------- 1 | 2 | Generate ingest requests from arabesque: 3 | 4 | zcat /data/arabesque/ARXIV-CRAWL-2019-10.arabesque.json.gz | ./arabesque2ingestrequest.py --link-source arxiv --extid-type arxiv --release-stage submitted - | shuf > /data/arabesque/ARXIV-CRAWL-2019-10.arabesque.ingest_request.json 5 | 6 | zcat /data/arabesque/PUBMEDCENTRAL-CRAWL-2019-10.arabesque.json.gz | ./arabesque2ingestrequest.py --link-source pmc --extid-type pmcid - | shuf > /data/arabesque/PUBMEDCENTRAL-CRAWL-2019-10.arabesque.ingest_request.json 7 | 8 | 9 | Quick tests locally: 10 | 11 | time head -n100 /data/arabesque/ARXIV-CRAWL-2019-10.arabesque.ingest_request.json |./ingest_file.py requests - > sample_arxiv.json 12 | time head -n100 
/data/arabesque/PUBMEDCENTRAL-CRAWL-2019-10.arabesque.ingest_request.json |./ingest_file.py requests - > sample_pubmed.json 13 | 14 | These are all wayback success; looking good! Single threaded, from home laptop 15 | (over tunnel), took about 9 minutes, or 5.5sec/pdf. That's pretty slow even 16 | with 30x parallelism. Should re-test on actual server. GROBID pre-check should 17 | help? 18 | 19 | With new bulk topic: 20 | 21 | head PUBMEDCENTRAL-CRAWL-2019-10.arabesque.ingest_request.json -n1000 | kafkacat -P -b localhost -t sandcrawler-prod.ingest-file-requests-bulk -p -1 22 | 23 | Ok, let them rip: 24 | 25 | cat PUBMEDCENTRAL-CRAWL-2019-10.arabesque.ingest_request.json -n1000 | kafkacat -P -b localhost -t sandcrawler-prod.ingest-file-requests-bulk -p -1 26 | cat ARXIV-CRAWL-2019-10.arabesque.ingest_request.json | kafkacat -P -b localhost -t sandcrawler-prod.ingest-file-requests-bulk -p -1 27 | -------------------------------------------------------------------------------- /scalding/src/main/scala/sandcrawler/GroupFatcatWorksJob.scala: -------------------------------------------------------------------------------- 1 | package sandcrawler 2 | 3 | import cascading.pipe.Pipe 4 | import com.twitter.scalding.Args 5 | import com.twitter.scalding.Stat 6 | import com.twitter.scalding.TypedPipe 7 | import com.twitter.scalding.TypedTsv 8 | import parallelai.spyglass.base.JobBase 9 | 10 | class GroupFatcatWorksJob(args: Args) extends JobBase(args) { 11 | 12 | val fatcatRowCount = Stat("fatcat-rows-filtered", "sandcrawler") 13 | val joinedRowCount = Stat("joined-rows", "sandcrawler") 14 | 15 | val fatcatScorable : Scorable = new FatcatScorable() 16 | val fatcatPipe : TypedPipe[(String, ReduceFeatures)] = fatcatScorable 17 | .getInputPipe(args) 18 | .map { r => 19 | fatcatRowCount.inc 20 | r 21 | } 22 | 23 | val joinedPipe = fatcatPipe 24 | .addTrap(TypedTsv(args("output") + ".trapped")) 25 | .join(fatcatPipe) 26 | 27 | // TypedTsv doesn't work over case classes. 28 | joinedPipe 29 | // filter out trivial self-matches (releases are identical) 30 | .filter { case (slug, (fatcatFeaturesLeft, fatcatFeaturesRight)) => 31 | Scorable.selfMatchable(fatcatFeaturesLeft, fatcatFeaturesRight) 32 | } 33 | .map { case (slug, (fatcatFeaturesLeft, fatcatFeaturesRight)) => 34 | joinedRowCount.inc 35 | new ReduceOutput( 36 | slug, 37 | Scorable.computeSimilarity(fatcatFeaturesLeft, fatcatFeaturesRight), 38 | fatcatFeaturesLeft.json, 39 | fatcatFeaturesRight.json) 40 | } 41 | .map { entry => (entry.slug, entry.score, entry.json1, entry.json2) } 42 | .write(TypedTsv[(String, Int, String, String)](args("output"))) 43 | } 44 | -------------------------------------------------------------------------------- /proposals/2021-09-13_src_ingest.md: -------------------------------------------------------------------------------- 1 | 2 | File Ingest Mode: 'src' 3 | ======================= 4 | 5 | Ingest type for "source" of works in document form. For example, tarballs of 6 | LaTeX source and figures, as published on arxiv.org and Pubmed Central. 7 | 8 | For now, presumption is that this would be a single file (`file` entity in 9 | fatcat). 
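As a concrete sketch (not part of the proposal itself), an ingest request for this mode might look like the following; the field names mirror existing `ingest_request` rows, and the URL and release identifier are taken from the examples at the end of this document:

    {
        "ingest_type": "src",
        "base_url": "https://arxiv.org/e-print/2109.00954v1",
        "ingest_request_source": "fatcat-ingest",
        "fatcat": {"release_ident": "akzp2lgqjbcbhpoeoitsj5k5hy"}
    }
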
10 | 11 | Initial mimetypes to allow: 12 | 13 | - text/x-tex 14 | - application/xml 15 | - application/gzip 16 | - application/x-bzip 17 | - application/x-bzip2 18 | - application/zip 19 | - application/x-tar 20 | - application/msword 21 | - application/vnd.openxmlformats-officedocument.wordprocessingml.document 22 | 23 | 24 | ## Fatcat Changes 25 | 26 | In the file importer, allow the additional mimetypes for 'src' ingest. 27 | 28 | Might keep ingest disabled on the fatcat side, at least initially. Eg, until 29 | there is some scope of "file scope", or other ways of treating 'src' tarballs 30 | separate from PDFs or other fulltext formats. 31 | 32 | 33 | ## Ingest Changes 34 | 35 | Allow additional terminal mimetypes for 'src' crawls. 36 | 37 | 38 | ## Examples 39 | 40 | arxiv:2109.00954v1 41 | fatcat:release_akzp2lgqjbcbhpoeoitsj5k5hy 42 | https://arxiv.org/format/2109.00954v1 43 | https://arxiv.org/e-print/2109.00954v1 44 | 45 | arxiv:1912.03397v2 46 | https://arxiv.org/format/1912.03397v2 47 | https://arxiv.org/e-print/1912.03397v2 48 | NOT: https://arxiv.org/pdf/1912.03397v2 49 | 50 | pmcid:PMC3767916 51 | https://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/08/03/PMC3767916.tar.gz 52 | 53 | For PMC, will need to use one of the .csv file lists to get the digit prefixes. 54 | -------------------------------------------------------------------------------- /python/tests/files/small.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Dummy Example File", 3 | "authors": [ 4 | { 5 | "name": "Brewster Kahle", 6 | "given_name": "Brewster", 7 | "surname": "Kahle", 8 | "affiliation": { 9 | "department": "Faculty ofAgricultrial Engineering", 10 | "laboratory": "Plant Physiology Laboratory", 11 | "institution": "Technion-Israel Institute of Technology", 12 | "address": { 13 | "postCode": "32000", 14 | "settlement": "Haifa", 15 | "country": "Israel" 16 | } 17 | } 18 | }, 19 | {"name": "J Doe", "given_name": "J", "surname": "Doe"} 20 | ], 21 | "journal": { 22 | "name": "Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678" 23 | }, 24 | "date": "2000", 25 | "citations": [ 26 | { "authors": [{"name": "A Seaperson", "given_name": "A", "surname": "Seaperson"}], 27 | "date": "2001", 28 | "id": "b0", 29 | "index": 0, 30 | "journal": "Letters in the Alphabet", 31 | "pages": "1-11", 32 | "title": "Everything is Wonderful", 33 | "volume": "20"}, 34 | { "authors": [], 35 | "date": "2011-03-28", 36 | "id": "b1", 37 | "index": 1, 38 | "journal": "The Dictionary", 39 | "title": "All about Facts", 40 | "volume": "14"} 41 | ], 42 | "abstract": "Everything you ever wanted to know about nothing", 43 | "body": "Introduction \nEverything starts somewhere, as somebody [1] once said. \n\n In Depth \n Meat \nYou know, for kids. 
\n Potatos \nQED.", 44 | "grobid_timestamp": "2018-04-02T00:31+0000", 45 | "grobid_version": "0.5.1-SNAPSHOT", 46 | "language_code": "en" 47 | } 48 | -------------------------------------------------------------------------------- /scalding/src/main/scala/sandcrawler/BibjsonScorable.scala: -------------------------------------------------------------------------------- 1 | package sandcrawler 2 | 3 | import scala.math 4 | import scala.util.parsing.json.JSON 5 | import scala.util.parsing.json.JSONObject 6 | 7 | import cascading.flow.FlowDef 8 | import cascading.tuple.Fields 9 | import com.twitter.scalding._ 10 | import com.twitter.scalding.typed.TDsl._ 11 | 12 | class BibjsonScorable extends Scorable { 13 | 14 | def getSource(args : Args) : Source = { 15 | TextLine(args("bibjson-input")) 16 | } 17 | 18 | def getFeaturesPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[Option[MapFeatures]] = { 19 | getSource(args).read 20 | .toTypedPipe[String](new Fields("line")) 21 | .map { BibjsonScorable.bibjsonToMapFeatures(_) } 22 | } 23 | } 24 | 25 | object BibjsonScorable { 26 | def bibjsonToMapFeatures(json : String) : Option[MapFeatures] = { 27 | Scorable.jsonToMap(json) match { 28 | case None => None 29 | case Some(map) => { 30 | if (map contains "title") { 31 | val title = Scorable.getString(map, "title") 32 | val doi = Scorable.getString(map, "doi") 33 | val sha1 = Scorable.getString(map, "sha") 34 | // TODO: year, authors (if available) 35 | if (title == null || title.isEmpty) { 36 | None 37 | } else { 38 | val sf : ScorableFeatures = ScorableFeatures.create(title=title, doi=doi, sha1=sha1) 39 | sf.toSlug match { 40 | case None => None 41 | case Some(slug) => Some(MapFeatures(slug, sf.toString)) 42 | } 43 | } 44 | } else { 45 | None 46 | } 47 | } 48 | } 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /sql/backfill/backfill_file_meta.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | This is a "one-time" tranform helper script for file_meta backfill into 4 | sandcrawler postgresql. 5 | 6 | Most of this file was copied from '../python/common.py'. 
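Input is expected on stdin as tab-separated rows with exactly five columns
(enforced by the assert below), in the same column order as the file_meta
table; the final column may be empty. A hypothetical invocation:

    ./backfill_file_meta.py < file_meta_dump.tsv
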
7 | """ 8 | 9 | import json, os, sys, collections 10 | import psycopg2 11 | import psycopg2.extras 12 | 13 | 14 | def insert(cur, batch): 15 | sql = """ 16 | INSERT INTO 17 | file_meta 18 | VALUES %s 19 | ON CONFLICT DO NOTHING; 20 | """ 21 | res = psycopg2.extras.execute_values(cur, sql, batch) 22 | 23 | def stdin_to_pg(): 24 | # no host means it will use local domain socket by default 25 | conn = psycopg2.connect(database="sandcrawler", user="postgres") 26 | cur = conn.cursor() 27 | counts = collections.Counter({'total': 0}) 28 | batch = [] 29 | for l in sys.stdin: 30 | if counts['raw_lines'] > 0 and counts['raw_lines'] % 10000 == 0: 31 | print("Progress: {}...".format(counts)) 32 | counts['raw_lines'] += 1 33 | if not l.strip(): 34 | continue 35 | info = l.split("\t") 36 | if not info: 37 | continue 38 | assert len(info) == 5 39 | info[-1] = info[-1].strip() or None 40 | batch.append(info) 41 | counts['total'] += 1 42 | if len(batch) >= 1000: 43 | insert(cur, batch) 44 | conn.commit() 45 | batch = [] 46 | counts['batches'] += 1 47 | if batch: 48 | insert(cur, batch) 49 | batch = [] 50 | conn.commit() 51 | cur.close() 52 | print("Done: {}".format(counts)) 53 | 54 | if __name__=='__main__': 55 | stdin_to_pg() 56 | -------------------------------------------------------------------------------- /notes/ingest/2022-03_oaipmh.md: -------------------------------------------------------------------------------- 1 | 2 | Martin did a fresh scrape of many OAI-PMH endpoints, and we should ingest/crawl. 3 | 4 | Note that Martin excluded many Indonesian endpoints, will need to follow-up on 5 | those. 6 | 7 | ## Prep 8 | 9 | Fetch metadata snapshot: 10 | 11 | wget https://archive.org/download/oai_pmh_partial_dump_2022_03_01/oai_pmh_partial_dump_2022_03_01.ndj.zst 12 | 13 | wget https://archive.org/download/oai_pmh_partial_dump_2022_03_01/oai_pmh_partial_dump_2022_03_01_urls.txt.zst 14 | 15 | Pre-filter out a bunch of prefixes we won't crawl (out of scope, and large): 16 | 17 | zstdcat /srv/sandcrawler/tasks/oai-pmh/oai_pmh_partial_dump_2022_03_01.ndj.zst \ 18 | | rg -v 'oai:kb.dk:' \ 19 | | rg -v 'oai:bdr.oai.bsb-muenchen.de:' \ 20 | | rg -v 'oai:hispana.mcu.es:' \ 21 | | rg -v 'oai:bnf.fr:' \ 22 | | rg -v 'oai:ukm.si:' \ 23 | | rg -v 'oai:biodiversitylibrary.org:' \ 24 | | rg -v 'oai:hsp.org:' \ 25 | | rg -v 'oai:repec:' \ 26 | | rg -v 'oai:n/a:' \ 27 | | rg -v 'oai:quod.lib.umich.edu:' \ 28 | | rg -v 'oai:americanae.aecid.es:' \ 29 | | rg -v 'oai:www.irgrid.ac.cn:' \ 30 | | rg -v 'oai:espace.library.uq.edu:' \ 31 | | rg -v 'oai:edoc.mpg.de:' \ 32 | | rg -v 'oai:bibliotecadigital.jcyl.es:' \ 33 | | rg -v 'oai:repository.erciyes.edu.tr:' \ 34 | | rg -v 'oai:krm.or.kr:' \ 35 | | ./scripts/oai2ingestrequest.py - \ 36 | | pv -l \ 37 | | gzip \ 38 | > /srv/sandcrawler/tasks/oai-pmh/oai_pmh_partial_dump_2022_03_01.requests.json.gz 39 | 40 | These failed to transform in the expected way; a change in JSON schema from last time? 41 | -------------------------------------------------------------------------------- /pig/join-cdx-sha1.pig: -------------------------------------------------------------------------------- 1 | 2 | -- 3 | -- Author: Bryan Newbold 4 | -- Date: December 2020 5 | -- 6 | -- This pig script is intended to run agains the full (many TByte) GWB CDX, and 7 | -- catch captures that match exact SHA1 (b32 encoded), regardless of mimetype. 
8 | -- 9 | -- The process is to filter the CDX for non-revisit HTTP 200s, sort this by 10 | -- SHA1 digest, then join with the (pre-sorted) SHA1 -- b32 input list, and dump 11 | -- output. 12 | 13 | %default INPUT_CDX '' 14 | %default INPUT_DIGEST '' 15 | %default OUTPUT '' 16 | 17 | set mapreduce.job.queuename default 18 | 19 | digests = LOAD '$INPUT_DIGEST' AS sha1b32:chararray; 20 | digests = ORDER digests by sha1b32 ASC PARALLEL 20; 21 | digests = DISTINCT digests; 22 | 23 | cdx = LOAD '$INPUT_CDX' AS cdxline:chararray; 24 | cdx = FILTER cdx BY not STARTSWITH (cdxline, 'filedesc'); 25 | cdx = FILTER cdx BY not STARTSWITH (cdxline, ' '); 26 | 27 | cdx = FOREACH cdx GENERATE STRSPLIT(cdxline,'\\s+') as cols, cdxline; 28 | cdx = FOREACH cdx GENERATE (chararray)cols.$0 as cdx_surt, (chararray)cols.$1 as timestamp, (chararray)cols.$3 as mimetype, (chararray)cols.$4 as httpstatus, (chararray)cols.$5 as sha1b32, cdxline; 29 | cdx = FILTER cdx BY not cdx_surt matches '-'; 30 | cdx = FILTER cdx BY httpstatus matches '200'; 31 | cdx = FILTER cdx BY not mimetype matches 'warc/revisit'; 32 | cdx = ORDER cdx BY sha1b32 ASC PARALLEL 40; 33 | 34 | -- TODO: DISTINCT by (sha1b32, cdx_surt) for efficiency 35 | 36 | -- Core JOIN 37 | full_join = JOIN cdx BY sha1b32, digests BY sha1b32; 38 | 39 | -- TODO: at most, say 5 CDX lines per sha1b32? 40 | 41 | result = FOREACH full_join GENERATE cdxline; 42 | 43 | STORE result INTO '$OUTPUT' USING PigStorage(); 44 | -------------------------------------------------------------------------------- /pig/tests/test_join_cdx.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | import unittest 4 | import tempfile 5 | import subprocess 6 | from pighelper import PigTestHelper, count_lines 7 | 8 | class TestJoinCDXSha1(PigTestHelper): 9 | 10 | def run_pig_join(self, script_path, cdx_file, digest_file, **kwargs): 11 | """Convenience helper around run_pig(). 12 | 13 | INPUT parameter is set to in_file. 14 | OUTPUT parameter is set to a random file. 15 | Any keyword args are passed as parameters. 16 | """ 17 | 18 | pargs = [] 19 | for key, value in kwargs.items(): 20 | pargs.append('-p') 21 | pargs.append('{}={}'.format(key, value)) 22 | 23 | out_file = tempfile.mktemp(dir=self._tmpdir) 24 | params = [ 25 | '-f', script_path, 26 | '-p', 'INPUT_CDX={}'.format(cdx_file), 27 | '-p', 'INPUT_DIGEST={}'.format(digest_file), 28 | '-p', 'OUTPUT={}'.format(out_file), 29 | ] + pargs 30 | status = self.run_pig_raw(params) 31 | assert status.returncode == 0 32 | # Capture all the part-r-* files 33 | print("out_file: {}".format(out_file)) 34 | subprocess.run("/bin/ls -la {}/part-*".format(out_file), shell=True) 35 | sub = subprocess.run("/bin/cat {}/part-*".format(out_file), stdout=subprocess.PIPE, shell=True) 36 | out = sub.stdout.decode('utf-8') 37 | print(out) 38 | return out 39 | 40 | # TODO: helper to verify that output matches an expected file 41 | 42 | def test_thing(self): 43 | r = self.run_pig_join("join-cdx-sha1.pig", "tests/files/example.cdx", "tests/files/example.sha1b32") 44 | assert count_lines(r) == 4 45 | -------------------------------------------------------------------------------- /proposals/20201012_no_capture.md: -------------------------------------------------------------------------------- 1 | 2 | status: work-in-progress 3 | 4 | NOTE: as of December 2022, bnewbold can't remember if this was fully 5 | implemented or not. 
6 | 7 | Storing no-capture missing URLs in `terminal_url` 8 | ================================================= 9 | 10 | Currently, when the bulk-mode ingest code terminates with a `no-capture` 11 | status, the missing URL (which is not in GWB CDX) is not stored in 12 | sandcrawler-db. This proposed change is to include it in the existing 13 | `terminal_url` database column, with the `terminal_status_code` and 14 | `terminal_dt` columns empty. 15 | 16 | The implementation is rather simple: 17 | 18 | - CDX lookup code path should save the *actual* final missing URL (`next_url` 19 | after redirects) in the result object's `terminal_url` field 20 | - ensure that this field gets passed through all the way to the database on the 21 | `no-capture` code path 22 | 23 | This change does change the semantics of the `terminal_url` field somewhat, and 24 | could break existing assumptions, so it is being documented in this proposal 25 | document. 26 | 27 | 28 | ## Alternatives 29 | 30 | The current status quo is to store the missing URL as the last element in the 31 | "hops" field of the JSON structure. We could keep this and have a convoluted 32 | pipeline that would read from the Kafka feed and extract them, but this would 33 | be messy. Eg, re-ingesting would not update the old kafka messages, so we could 34 | need some accounting of consumer group offsets after which missing URLs are 35 | truly missing. 36 | 37 | We could add a new `missing_url` database column and field to the JSON schema, 38 | for this specific use case. This seems like unnecessary extra work. 39 | 40 | -------------------------------------------------------------------------------- /notes/examples/dataset_examples.txt: -------------------------------------------------------------------------------- 1 | 2 | ### ArchiveOrg: CAT dataset 3 | 4 | 5 | 6 | `release_36vy7s5gtba67fmyxlmijpsaui` 7 | 8 | ### 9 | 10 | 11 | 12 | doi:10.1371/journal.pone.0120448 13 | 14 | Single .rar file 15 | 16 | ### Dataverse 17 | 18 | 19 | 20 | Single excel file 21 | 22 | ### Dataverse 23 | 24 | 25 | 26 | doi:10.7910/DVN/CLSFKX 27 | 28 | Mulitple files; multiple versions? 29 | 30 | API fetch: 31 | 32 | .data.id 33 | .data.latestVersion.datasetPersistentId 34 | .data.latestVersion.versionNumber, .versionMinorNumber 35 | .data.latestVersion.files[] 36 | .dataFile 37 | .contentType (mimetype) 38 | .filename 39 | .filesize (int, bytes) 40 | .md5 41 | .persistendId 42 | .description 43 | .label (filename?) 44 | .version 45 | 46 | Single file inside: 47 | 48 | Download single file: (redirects to AWS S3) 49 | 50 | Dataverse refs: 51 | - 'doi' and 'hdl' are the two persistentId styles 52 | - file-level persistentIds are optional, on a per-instance basis: https://guides.dataverse.org/en/latest/installation/config.html#filepidsenabled 53 | -------------------------------------------------------------------------------- /python/README.md: -------------------------------------------------------------------------------- 1 | 2 | This directory contains `sandcrawler` python code for ingest pipelines, batch 3 | processing, PDF extraction, etc. 
4 | 5 | 6 | ## Development Quickstart 7 | 8 | As of December 2022, working with this code requires: 9 | 10 | - Python 3.8 (specifically, due to version specification in `pipenv`) 11 | - `pipenv` for python dependency management 12 | - generic and python-specific build tools (`pkg-config`, `python-dev`, etc) 13 | - poppler (PDF processing library) 14 | - libmagic 15 | - libsodium 16 | - access to IA internal packages (`devpi.us.archive.org`), specifically for 17 | globalwayback and related packages 18 | 19 | In production and CI we use Ubuntu Focal (20.04). The CI script for this 20 | repository (`../.gitlab-ci.yml`) is the best place to look for a complete list 21 | of dependencies for both development and deployment. Note that our CI system 22 | runs from our cluster, which resolves the devpi access issue. For developer 23 | laptops, you may need `sshuttle` or something similar set up to do initial 24 | package pulls. 25 | 26 | It is recommended to set the env variable `PIPENV_VENV_IN_PROJECT=true` when 27 | working with pipenv. You can include this in a `.env` file. 28 | 29 | There is a Makefile which helps with the basics. Eg: 30 | 31 | # install deps using pipenv 32 | make deps 33 | 34 | # run python tests 35 | make test 36 | 37 | # run code formatting and lint checks 38 | make fmt lint 39 | 40 | Sometimes when developing it is helpful to enter a shell with pipenv, eg: 41 | 42 | pipenv shell 43 | 44 | Often when developing it is helpful (or necessary) to set environment 45 | variables. `pipenv shell` will read from `.env`, so you can copy and edit 46 | `example.env`, and it will be used in tests, `pipenv shell`, etc. 47 | -------------------------------------------------------------------------------- /python/scripts/ingestrequest_row2json.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | This script is used to turn ingest request postgres rows (in JSON export 4 | format) back in to regular ingest request JSON. 5 | 6 | The only difference is the name and location of some optional keys. 
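For example (file path illustrative; it matches the output location used by
the dump_reingest_*.sql queries in ../sql/):

    ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_bulk_current.rows.json > reingest_requests.json

Pass --force-recrawl to set the force_recrawl (SPNv2) flag on every request.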
7 | """ 8 | 9 | import argparse 10 | import json 11 | import sys 12 | 13 | 14 | def transform(row): 15 | """ 16 | dict-to-dict 17 | """ 18 | row.pop("created", None) 19 | extra = row.pop("request", None) or {} 20 | for k in ("ext_ids", "edit_extra"): 21 | if k in extra: 22 | row[k] = extra[k] 23 | if "release_ident" in extra: 24 | row["fatcat"] = dict(release_ident=extra["release_ident"]) 25 | return row 26 | 27 | 28 | def run(args): 29 | for l in args.json_file: 30 | if not l.strip(): 31 | continue 32 | try: 33 | req = transform(json.loads(l)) 34 | except Exception as e: 35 | print(e, file=sys.stderr) 36 | print(l, file=sys.stderr) 37 | continue 38 | if args.force_recrawl: 39 | req["force_recrawl"] = True 40 | print(json.dumps(req, sort_keys=True)) 41 | 42 | 43 | def main(): 44 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 45 | parser.add_argument( 46 | "json_file", help="SQL output JSON file to process", type=argparse.FileType("r") 47 | ) 48 | parser.add_argument( 49 | "--force-recrawl", 50 | action="store_true", 51 | help="whether to add recrawl (SPNv2) flag to request", 52 | ) 53 | subparsers = parser.add_subparsers() 54 | 55 | args = parser.parse_args() 56 | 57 | run(args) 58 | 59 | 60 | if __name__ == "__main__": 61 | main() 62 | -------------------------------------------------------------------------------- /scalding/src/main/scala/sandcrawler/DumpGrobidXmlJob.scala: -------------------------------------------------------------------------------- 1 | package sandcrawler 2 | 3 | import java.util.Properties 4 | 5 | import cascading.property.AppProps 6 | import cascading.tuple.Fields 7 | import com.twitter.scalding._ 8 | import com.twitter.scalding.typed.TDsl._ 9 | import org.apache.hadoop.hbase.io.ImmutableBytesWritable 10 | import org.apache.hadoop.hbase.util.Bytes 11 | import parallelai.spyglass.base.JobBase 12 | import parallelai.spyglass.hbase.HBaseConstants.SourceMode 13 | import parallelai.spyglass.hbase.HBasePipeConversions 14 | import parallelai.spyglass.hbase.HBaseSource 15 | import scala.util.parsing.json.JSONObject 16 | 17 | // Dumps the SHA1 key and grobid0:tei_xml columns, as TSV/JSON (two TSV 18 | // columns: one is key, second is JSON). 
Used for partner delivery/sharing 19 | class DumpGrobidXmlJob(args: Args) extends JobBase(args) with HBasePipeConversions { 20 | 21 | val metaPipe : TypedPipe[(String, String)] = HBaseBuilder.build(args("hbase-table"), 22 | args("zookeeper-hosts"), 23 | List("file:cdx", "grobid0:tei_xml"), 24 | SourceMode.SCAN_ALL) 25 | .read 26 | .toTypedPipe[(ImmutableBytesWritable,ImmutableBytesWritable,ImmutableBytesWritable)](new Fields("key", "cdx", "tei_xml")) 27 | .filter { case (_, cdx, tei_xml) => cdx != null && tei_xml != null } 28 | .map { case (key, cdx, tei_xml) => 29 | (Bytes.toString(key.copyBytes()), 30 | JSONObject( 31 | Map( 32 | "pdf_hash" -> Bytes.toString(key.copyBytes()), 33 | "cdx_metadata" -> Bytes.toString(cdx.copyBytes()), 34 | "tei_xml" -> Bytes.toString(tei_xml.copyBytes()) 35 | )).toString 36 | ) 37 | }; 38 | 39 | metaPipe.write(TypedTsv[(String,String)](args("output"))) 40 | 41 | } 42 | -------------------------------------------------------------------------------- /.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | 2 | image: ubuntu:focal 3 | 4 | variables: 5 | LC_ALL: "C.UTF-8" 6 | LANG: "C.UTF-8" 7 | DEBIAN_FRONTEND: "noninteractive" 8 | 9 | before_script: 10 | - apt update -qy 11 | - apt install -y --no-install-recommends apt-transport-https software-properties-common curl dirmngr gpg-agent 12 | # scala-sbt.org APT signing key 13 | - apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 0x2EE0EA64E40A89B84B2DF73499E82A75642AC823 14 | - apt-add-repository -y "deb https://repo.scala-sbt.org/scalasbt/debian all main" 15 | - apt install -y --no-install-recommends python3-dev python3-pip python3-wheel libjpeg-dev openjdk-8-jdk-headless sbt libpq-dev python-dev python3.8 python3.8-dev python3.8-venv python3.8-distutils pkg-config python3-pytest git libsnappy-dev libsodium-dev libpoppler-cpp-dev cmake libpython3.8-dev build-essential poppler-data libmagic1 pipenv wget 16 | - pipenv --version 17 | 18 | test_python: 19 | script: 20 | - cd python 21 | - cp example.env .env 22 | - pipenv install --dev --deploy 23 | - make coverage 24 | - make lint 25 | 26 | test_python_hadoop: 27 | when: manual 28 | script: 29 | - cd python_hadoop 30 | - pipenv install --dev --deploy 31 | - pipenv run pytest --cov 32 | 33 | # needs fixing; some upstream com.hadoop.gplcompression#hadoop-lzo;0.4.16: java.lang.NullPointerException 34 | # change happened 35 | test_scalding: 36 | when: manual 37 | script: 38 | - ./please -h 39 | - cd scalding 40 | - sbt -mem 1024 test 41 | - sbt -mem 1024 assembly 42 | 43 | # Needs fixing 44 | test_pig: 45 | when: manual 46 | script: 47 | - ./fetch_hadoop.sh 48 | - cd pig 49 | - pipenv install --dev --deploy 50 | - JAVA_HOME=$(readlink -f /usr/bin/java | sed "s:bin/java::") pipenv run pytest 51 | -------------------------------------------------------------------------------- /notes/ingest/NEXT.md: -------------------------------------------------------------------------------- 1 | 2 | biorxiv 3 | medrxiv 4 | doi:10.1101\/20* 5 | 6 | persee.fr 147k 7 | publisher:persee in_ia:false is_oa:true 8 | https://www.persee.fr/doc/pumus_1164-5385_1992_num_2_1_1013 9 | 10 | cairn.info: 161k 11 | doi_prefix:10.3917 in_ia:false is_oa:true 12 | https://www.cairn.info/revue-afrique-contemporaine-2011-3-page-161.htm 13 | https://www.cairn.info/revue-cahiers-de-psychologie-clinique-2014-1-page-209.htm 14 | 15 | IOP OA: 169k 16 | doi_prefix:10.1088 is_oa:true in_ia:false 17 | 18 | indian journals platform? 
124k 19 | doi_prefix:10.4103 in_ia:false is_oa:true 20 | http://www.urologyannals.com/article.asp?issn=0974-7796;year=2011;volume=3;issue=3;spage=138;epage=140;aulast=Ahmad 21 | http://www.neurologyindia.com/article.asp?issn=0028-3886;year=2011;volume=59;issue=4;spage=612;epage=615;aulast=Utsuki 22 | 23 | openedition? 48k 24 | doi_prefix:10.4000 is_oa:true in_ia:false 25 | 26 | german medical science (GMS) 28k 27 | doi_prefix:10.3205 in_ia:false is_oa:true 28 | https://www.egms.de/static/en/journals/zma/2015-32/zma000965.shtml 29 | 30 | siberian chemistry 28k 31 | doi_prefix:10.2298 in_ia:false is_oa:true 32 | http://www.doiserbia.nb.rs/Article.aspx?ID=0352-51391000105H 33 | 34 | jalc oa doi: 82k 35 | doi_registrar:jalc in_ia:false is_oa:true 36 | 37 | sage OA papers 38 | https://journals.sagepub.com/doi/10.1177/034003529802400510 39 | 40 | Scientific Reports: 25k 41 | in_ia:false container_id:"tnqhc2x2aneavcd3gx5h7mswhm" 42 | 43 | U Toronto press: 23k 44 | publisher:"Toronto Press" in_ia:false is_oa:true 45 | has an annoying bounce page 46 | 47 | ASHA (speech-language-hearing association): 7k 48 | publisher:Speech-Language-Hearing in_ia:false is_oa:true 49 | 50 | MIT press journals 51 | 52 | 53 | -------------------------------------------------------------------------------- /scalding/src/main/scala/sandcrawler/ScoreJob.scala: -------------------------------------------------------------------------------- 1 | package sandcrawler 2 | 3 | import cascading.pipe.Pipe 4 | import com.twitter.scalding.Args 5 | import com.twitter.scalding.Stat 6 | import com.twitter.scalding.TypedPipe 7 | import com.twitter.scalding.TypedTsv 8 | import parallelai.spyglass.base.JobBase 9 | 10 | class ScoreJob(args: Args) extends JobBase(args) { 11 | 12 | val grobidRowCount = Stat("grobid-rows-filtered", "sandcrawler") 13 | val crossrefRowCount = Stat("crossref-rows-filtered", "sandcrawler") 14 | val joinedRowCount = Stat("joined-rows", "sandcrawler") 15 | 16 | val grobidScorable : Scorable = new GrobidScorable() 17 | val crossrefScorable : Scorable = new CrossrefScorable() 18 | val grobidPipe : TypedPipe[(String, ReduceFeatures)] = grobidScorable 19 | .getInputPipe(args) 20 | .map { r => 21 | grobidRowCount.inc 22 | r 23 | } 24 | val crossrefPipe : TypedPipe[(String, ReduceFeatures)] = crossrefScorable 25 | .getInputPipe(args) 26 | .map { r => 27 | crossrefRowCount.inc 28 | r 29 | } 30 | 31 | val joinedPipe = grobidPipe 32 | .addTrap(TypedTsv(args("output") + ".trapped")) 33 | .join(crossrefPipe) 34 | 35 | // TypedTsv doesn't work over case classes. 36 | joinedPipe 37 | .map { case (slug, (grobidFeatures, crossrefFeatures)) => 38 | joinedRowCount.inc 39 | //val (slug : String, (grobidFeatures: ReduceFeatures, crossrefFeatures: ReduceFeatures)) = entry 40 | new ReduceOutput( 41 | slug, 42 | Scorable.computeSimilarity(grobidFeatures, crossrefFeatures), 43 | grobidFeatures.json, 44 | crossrefFeatures.json) 45 | } 46 | .map { entry => (entry.slug, entry.score, entry.json1, entry.json2) } 47 | .write(TypedTsv[(String, Int, String, String)](args("output"))) 48 | } 49 | -------------------------------------------------------------------------------- /notes/tasks/2020-01-06_heuristic_cdx.txt: -------------------------------------------------------------------------------- 1 | 2 | Wanted to include a large number of additional CDX lines based on regex 3 | pattern. These are primarily .edu domains with things that look like user 4 | accounts *and* .pdf file extensions in the path. 
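The exact regex used for that Hadoop job isn't recorded here; the closest
pattern in this repo is the "academic domain + tilde directory" SURT rule in
pig/filter-cdx-paper-pdfs.pig (there combined with HTTP 200 and pdf-mimetype
filters). A rough Python rendering, just to illustrate what the heuristic
matches (example SURTs are made up):

    import re

    # SURT-form URL keys, e.g. "edu,mit,web)/~someuser/pubs/paper.pdf"
    EDU_TILDE = re.compile(r'(edu,|..,edu|..,ac,).*\).*/~.*')

    print(bool(EDU_TILDE.fullmatch("edu,mit,web)/~someuser/pubs/paper.pdf")))  # True
    print(bool(EDU_TILDE.fullmatch("com,example)/about.html")))                # False
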
5 | 6 | ## Commands 7 | 8 | aitio:/fast/gwb_pdfs 9 | 10 | pdfs/gwb-pdf-20191005172329-url-heuristics-edu 11 | pdfs/gwb-pdf-20191005172329-url-heuristics 12 | 13 | 14 | to filter as url/sha1 uniq: 15 | 16 | cat raw.cdx | sort -u -t' ' -k3,6 -S 4G > uniq.cdx 17 | 18 | cat gwb-pdf-20191005172329-url-heuristics-edu/part-r-000* | sort -u -t' ' -k3,6 -S 4G > gwb-pdf-20191005172329-url-heuristics-edu.uniq_url_sha1.cdx 19 | cat gwb-pdf-20191005172329-url-heuristics/part-r-000* | sort -u -t' ' -k3,6 -S 4G > gwb-pdf-20191005172329-url-heuristics.uniq_url_sha1.cdx 20 | 21 | 7241795 gwb-pdf-20191005172329-url-heuristics-edu.uniq_url_sha1.cdx 22 | 41137888 gwb-pdf-20191005172329-url-heuristics.uniq_url_sha1.cdx 23 | 24 | cut -d' ' -f6 gwb-pdf-20191005172329-url-heuristics-edu.uniq_url_sha1.cdx | sort -u -S 4G | wc -l 25 | 7241795 26 | 27 | cut -d' ' -f6 gwb-pdf-20191005172329-url-heuristics.uniq_url_sha1.cdx | sort -u -S 4G | wc -l 28 | 41137888 29 | 30 | ./persist_tool.py cdx /fast/gwb_pdf/gwb-pdf-20191005172329-url-heuristics-edu.uniq_url_sha1.cdx 31 | Worker: Counter({'total': 7239153, 'insert-cdx': 6845283, 'update-cdx': 0}) 32 | CDX lines pushed: Counter({'total': 7241795, 'pushed': 7239153, 'skip-parse': 2603, 'skip-mimetype': 39}) 33 | 34 | ./persist_tool.py cdx /fast/gwb_pdf/gwb-pdf-20191005172329-url-heuristics.uniq_url_sha1.cdx 35 | Worker: Counter({'total': 41030360, 'insert-cdx': 22430064, 'update-cdx': 0}) 36 | CDX lines pushed: Counter({'total': 41137888, 'pushed': 41030360, 'skip-mimetype': 87341, 'skip-parse': 20187}) 37 | 38 | -------------------------------------------------------------------------------- /pig/filter-cdx-paper-pdfs.pig: -------------------------------------------------------------------------------- 1 | 2 | -- Tries to filter down a large CDX file to a subset that is likely to be 3 | -- journal article content, based on SURT regex patterns. 
4 | --- 5 | -- Author: Bryan Newbold 6 | -- Date: May 2018 7 | 8 | 9 | %default INPUT '' 10 | %default OUTPUT '' 11 | 12 | set mapreduce.job.queuename default 13 | 14 | cdx = LOAD '$INPUT' AS cdxline:chararray; 15 | cdx = FILTER cdx BY not STARTSWITH (cdxline, 'filedesc'); 16 | cdx = FILTER cdx BY not STARTSWITH (cdxline, ' '); 17 | 18 | cdx = FOREACH cdx GENERATE STRSPLIT(cdxline,'\\s+') as cols, cdxline; 19 | cdx = FOREACH cdx GENERATE (chararray)cols.$0 as surt, (chararray)cols.$1 as timestamp, (chararray)cols.$3 as mimetype, (chararray)cols.$4 as httpstatus, (chararray)cols.$5 as sha1sum, cdxline; 20 | cdx = FILTER cdx BY not surt matches '-'; 21 | cdx = FILTER cdx BY httpstatus matches '200'; 22 | cdx = FILTER cdx BY mimetype matches '.*pdf.*'; 23 | 24 | -- This is the core regex 25 | cdx = FILTER cdx 26 | -- academic domains; personal (tilde) directories 27 | BY surt matches '(edu,|..,edu|..,ac,).*\\).*\\/~.*' 28 | 29 | -- words in URL 30 | OR surt matches '(?i).+\\).*/(pubs|research|publications?|articles?|proceedings?|papers?|fulltext)/.*' 31 | 32 | -- words in domains 33 | OR surt matches '.*(,hal|,eprint|,ojs|,dspace|scielo|redalyc|revues|revistas|research|journal).*\\).*' 34 | 35 | -- DOI-like pattern in URL 36 | OR surt matches '.*\\).*/10\\.\\d{3,5}/.*'; 37 | 38 | -- DISTINCT by sha1 column 39 | cdx_uniq = FOREACH (GROUP cdx BY sha1sum) { 40 | r = TOP(1, 0, $1); 41 | GENERATE FLATTEN(r); 42 | }; 43 | 44 | cdx_uniq = ORDER cdx_uniq by surt, timestamp PARALLEL 50; 45 | cdx_uniq = FOREACH cdx_uniq GENERATE cdxline; 46 | STORE cdx_uniq INTO '$OUTPUT' USING PigStorage(' '); 47 | 48 | -------------------------------------------------------------------------------- /python/scripts/manifest_converter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Reads a sqlite3 manifest database (IA 2017 style) and outputs a stream of 4 | "match" JSON objects which can be imported into fatcat with matched_import.py 5 | 6 | This was used to convert this manifest: 7 | 8 | https://archive.org/details/ia_papers_manifest_2018-01-25/ 9 | 10 | to JSON format for fast fatcat importing. 11 | """ 12 | 13 | import json 14 | import sqlite3 15 | import sys 16 | 17 | # iterate over rows in files metadata... 18 | # 1. select all identified DOIs 19 | # => filter based on count 20 | # 2. select all file metadata 21 | # 3. 
output object 22 | 23 | 24 | def or_none(s): 25 | if s is None: 26 | return None 27 | elif type(s) == str and ((not s) or s == "\\N" or s == "-"): 28 | return None 29 | return s 30 | 31 | 32 | def process_db(db_path): 33 | 34 | db = sqlite3.connect(db_path) 35 | 36 | for row in db.execute("SELECT sha1, mimetype, size_bytes, md5 FROM files_metadata"): 37 | sha1 = row[0] 38 | dois = db.execute("SELECT doi FROM files_id_doi WHERE sha1=?", [sha1]).fetchall() 39 | dois = [d[0] for d in dois] 40 | if not dois: 41 | continue 42 | urls = db.execute("SELECT url, datetime FROM urls WHERE sha1=?", [sha1]).fetchall() 43 | if not urls: 44 | continue 45 | cdx = [dict(url=row[0], dt=row[1]) for row in urls] 46 | obj = dict( 47 | sha1=sha1, 48 | mimetype=or_none(row[1]), 49 | size=(or_none(row[2]) and int(row[2])), 50 | md5=or_none(row[3]), 51 | dois=dois, 52 | cdx=cdx, 53 | ) 54 | dois = db.execute("SELECT doi FROM files_id_doi WHERE sha1=?", [sha1]) 55 | print(json.dumps(obj)) 56 | 57 | 58 | if __name__ == "__main__": 59 | process_db(sys.argv[1]) 60 | -------------------------------------------------------------------------------- /kafka/howto_rebalance.md: -------------------------------------------------------------------------------- 1 | 2 | ## Rebalance Storage Between Brokers (kafka-manager web) 3 | 4 | For each topic you want to rebalance (eg, the large or high-throughput ones), 5 | go to the topic page and do the blue "reassign partitions" button (or 6 | potentially "generate" or "manual"). 7 | 8 | Monitor progress with the "Reassign Partitions" link at top of the page. 9 | 10 | Finally, run a preferred replica election after partition movement is complete. 11 | 12 | ## Rebalance Storage Between Brokers (CLI) 13 | 14 | For example, after adding or removing brokers from the cluster. 15 | 16 | Create a list of topics to move, and put it in `/tmp/topics_to_move.json`: 17 | 18 | { 19 | "version": 1, 20 | "topics": [ 21 | {"topic": "sandcrawler-shadow.grobid-output"}, 22 | {"topic": "fatcat-prod.api-crossref"} 23 | ] 24 | } 25 | 26 | On a kafka broker, go to `/srv/kafka-broker/kafka-*/bin`, generate a plan, then 27 | inspect the output: 28 | 29 | ./kafka-reassign-partitions.sh --zookeeper localhost:2181 --broker-list "280,281,284,285,263" --topics-to-move-json-file /tmp/topics_to_move.json --generate > /tmp/reassignment-plan.json 30 | cat /tmp/reassignment-plan.json | rg '^\{' | head -n1 | jq . > /tmp/old-plan.json 31 | cat /tmp/reassignment-plan.json | rg '^\{' | tail -n1 | jq . > /tmp/new-plan.json 32 | cat /tmp/reassignment-plan.json | rg '^\{' | jq . 33 | 34 | If that looks good, start the rebalance: 35 | 36 | ./kafka-reassign-partitions.sh --zookeeper localhost:2181 --reassignment-json-file /tmp/new-plan.json --execute 37 | 38 | Then monitor progress: 39 | 40 | ./kafka-reassign-partitions.sh --zookeeper localhost:2181 --reassignment-json-file /tmp/new-plan.json --verify 41 | 42 | Finally, run a preferred replica election after partition movement is complete. 43 | Currently do this through the web interface (linked above). 
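
As an alternative to the web interface, older ZooKeeper-based Kafka releases
(matching the `--zookeeper` usage above) ship a CLI tool for this in the same
`bin/` directory; a sketch, assuming `kafka-preferred-replica-election.sh` is
present:

    ./kafka-preferred-replica-election.sh --zookeeper localhost:2181

Newer Kafka releases replace this tool with `kafka-leader-election.sh`.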
44 | -------------------------------------------------------------------------------- /scalding/src/main/scala/sandcrawler/DumpGrobidMetaInsertableJob.scala: -------------------------------------------------------------------------------- 1 | package sandcrawler 2 | 3 | import java.util.Properties 4 | 5 | import cascading.property.AppProps 6 | import cascading.tuple.Fields 7 | import com.twitter.scalding._ 8 | import com.twitter.scalding.typed.TDsl._ 9 | import org.apache.hadoop.hbase.io.ImmutableBytesWritable 10 | import org.apache.hadoop.hbase.util.Bytes 11 | import parallelai.spyglass.base.JobBase 12 | import parallelai.spyglass.hbase.HBaseConstants.SourceMode 13 | import parallelai.spyglass.hbase.HBasePipeConversions 14 | import parallelai.spyglass.hbase.HBaseSource 15 | 16 | // Dumps the SHA1 key and grobid0:metadata columns, plus file metadata needed 17 | // to insert into fatcat. Used, eg, as part of long-tail mellon pipeline. 18 | class DumpGrobidMetaInsertableJob(args: Args) extends JobBase(args) with HBasePipeConversions { 19 | 20 | val metaPipe : TypedPipe[(String, String, String, Long, String)] = HBaseBuilder.build(args("hbase-table"), 21 | args("zookeeper-hosts"), 22 | List("file:cdx", "file:mime", "file:size", "grobid0:metadata"), 23 | SourceMode.SCAN_ALL) 24 | .read 25 | .toTypedPipe[(ImmutableBytesWritable,ImmutableBytesWritable,ImmutableBytesWritable,ImmutableBytesWritable,ImmutableBytesWritable)](new Fields("key", "cdx", "mime", "size", "metadata")) 26 | .filter { case (_, cdx, mime, size, metadata) => cdx != null && mime != null && size != null && metadata != null } 27 | .map { case (key, cdx, mime, size, metadata) => 28 | (Bytes.toString(key.copyBytes()), 29 | Bytes.toString(cdx.copyBytes()), 30 | Bytes.toString(mime.copyBytes()), 31 | Bytes.toLong(size.copyBytes()), 32 | Bytes.toString(metadata.copyBytes()) 33 | ) 34 | }; 35 | 36 | metaPipe.write(TypedTsv[(String,String,String,Long,String)](args("output"))) 37 | 38 | } 39 | -------------------------------------------------------------------------------- /python/scripts/grobid_affiliations.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Takes old (HBase) or new (pg) style JSON wrappers of GROBID XML extraction 4 | output, converts the XML to JSON, filters out raw affiliation strings, and 5 | dumps these as JSON subset. 
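Output is TSV: one line per input document that has any affiliations, with the
SHA-1 hex digest in the first column and a JSON list of the unique raw
affiliation objects in the second (field names depend on what GROBID
extracted), e.g. (schematic):

    <sha1hex>\t[{"institution": "...", "address": {...}}, ...]
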
6 | 7 | Run in bulk like: 8 | 9 | ls /bigger/unpaywall-transfer/2019-07-17-1741.30-dumpgrobidxml/part*gz | parallel --progress -j8 'zcat {} | ./grobid_affiliations.py > {}.affiliations' 10 | """ 11 | 12 | import json 13 | import sys 14 | 15 | from grobid_tei_xml import parse_document_xml 16 | 17 | 18 | def parse_hbase(line): 19 | line = line.split("\t") 20 | assert len(line) == 2 21 | sha1hex = line[0] 22 | obj = json.loads(line[1]) 23 | tei_xml = obj["tei_xml"] 24 | return sha1hex, tei_xml 25 | 26 | 27 | def parse_pg(line): 28 | obj = json.loads(line) 29 | return obj["sha1hex"], obj["tei_xml"] 30 | 31 | 32 | def run(mode="hbase"): 33 | for line in sys.stdin: 34 | if mode == "hbase": 35 | sha1hex, tei_xml = parse_hbase(line) 36 | elif mode == "pg": 37 | sha1hex, tei_xml = parse_pg(line) 38 | else: 39 | raise NotImplementedError("parse mode: {}".format(mode)) 40 | 41 | tei_doc = parse_document_xml(tei_xml) 42 | tei_doc.remove_encumbered() 43 | obj = tei_doc.to_legacy_dict() 44 | 45 | affiliations = [] 46 | for author in obj["authors"]: 47 | if author.get("affiliation"): 48 | affiliations.append(author["affiliation"]) 49 | if affiliations: 50 | # don't duplicate affiliations; only the unique ones 51 | affiliations = list(set([json.dumps(a) for a in affiliations])) 52 | affiliations = [json.loads(a) for a in affiliations] 53 | print("\t".join([sha1hex, json.dumps(affiliations)])) 54 | 55 | 56 | if __name__ == "__main__": 57 | run() 58 | -------------------------------------------------------------------------------- /sql/dump_reingest_spn.sql: -------------------------------------------------------------------------------- 1 | 2 | BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; 3 | 4 | COPY ( 5 | SELECT row_to_json(ingest_request.*) FROM ingest_request 6 | LEFT JOIN ingest_file_result ON 7 | ingest_file_result.base_url = ingest_request.base_url 8 | AND ingest_file_result.ingest_type = ingest_request.ingest_type 9 | WHERE 10 | (ingest_request.ingest_type = 'pdf' 11 | OR ingest_request.ingest_type = 'html' 12 | OR ingest_request.ingest_type = 'xml' 13 | OR ingest_request.ingest_type = 'component') 14 | AND ingest_file_result.hit = false 15 | AND ingest_request.created < NOW() - '6 hour'::INTERVAL 16 | AND ingest_request.created > NOW() - '180 day'::INTERVAL 17 | AND ingest_request.ingest_request_source = 'savepapernow-web' 18 | AND ( 19 | ingest_file_result.status like 'spn2-%' 20 | -- OR ingest_file_result.status = 'cdx-error' 21 | -- OR ingest_file_result.status = 'wayback-error' 22 | -- OR ingest_file_result.status = 'wayback-content-error' 23 | OR ingest_file_result.status = 'petabox-error' 24 | -- OR ingest_file_result.status = 'gateway-timeout' 25 | OR ingest_file_result.status = 'no-capture' 26 | ) 27 | AND ingest_file_result.status != 'spn2-error:invalid-url-syntax' 28 | AND ingest_file_result.status != 'spn2-error:filesize-limit' 29 | AND ingest_file_result.status != 'spn2-error:not-found' 30 | AND ingest_file_result.status != 'spn2-error:blocked-url' 31 | AND ingest_file_result.status != 'spn2-error:too-many-redirects' 32 | AND ingest_file_result.status != 'spn2-error:network-authentication-required' 33 | AND ingest_file_result.status != 'spn2-error:unknown' 34 | ) TO '/srv/sandcrawler/tasks/reingest_spn.rows.json'; 35 | 36 | ROLLBACK; 37 | -------------------------------------------------------------------------------- /extra/nginx/fatcat-blobs: -------------------------------------------------------------------------------- 1 | 2 | server { 3 | listen 80; 4 | 
listen [::]:80; 5 | listen 443 ssl http2; 6 | listen [::]:443 ssl http2; 7 | server_name blobs.fatcat.wiki; 8 | 9 | ssl_certificate /etc/letsencrypt/live/fatcat.wiki/fullchain.pem; 10 | ssl_certificate_key /etc/letsencrypt/live/fatcat.wiki/privkey.pem; 11 | 12 | #add_header Content-Security-Policy "default-src 'self' 'unsafe-inline' 'unsafe-eval'; style-src 'self' 'unsafe-inline'"; 13 | add_header X-Frame-Options "SAMEORIGIN"; # 'always' if nginx > 1.7.5 14 | add_header X-Content-Type-Options "nosniff"; # 'always' if nginx > 1.7.5 15 | add_header X-Xss-Protection "1"; 16 | # Enable STS with one year period (breaks http; optional) 17 | #add_header Strict-Transport-Security "max-age=31557600; includeSubDomains"; 18 | 19 | error_log /var/log/nginx/fatcat-errors.log; 20 | access_log /dev/null; 21 | 22 | if ($scheme = http) { 23 | return 301 https://$server_name$request_uri; 24 | } 25 | 26 | location /unpaywall/ { 27 | if ($request_method !~ "GET") { 28 | return 403; 29 | break; 30 | } 31 | 32 | #proxy_pass http://sandcrawler-minio.fatcat.wiki:9000$uri$is_args$args; 33 | proxy_pass http://207.241.227.141:9000$uri$is_args$args; 34 | proxy_redirect off; 35 | 36 | proxy_set_header X-Real-IP $remote_addr; 37 | proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; 38 | proxy_set_header Host $http_host; 39 | } 40 | 41 | location / { 42 | default_type text/plain; 43 | return 504 'blobs.fatcat.wiki hosts many files; full URLs are required!\nyou probably want https://fatcat.wiki/ instead'; 44 | } 45 | 46 | # Let's Encrypt SSL Certs 47 | location /.well-known/acme-challenge/ { 48 | root /var/www/letsencrypt; 49 | autoindex off; 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /notes/ingest/2020-03_s2.md: -------------------------------------------------------------------------------- 1 | 2 | Crawled some 6 million new PDFs from pdfs.semanticscholar.org. Should get these 3 | ingested, as well as any previous existing content. 4 | 5 | Also, there are a bunch of PDF outlinks to the web; should do S2-specific 6 | matching and ingest of those. 7 | 8 | There are a few categories of paper from pdfs.s.o: 9 | 10 | 1. we had previous GWB crawl, didn't re-crawl 11 | 2. we had PDF from elsewhere on the web, didn't re-crawl 12 | 3. crawled successfully 13 | 4. crawl failed 14 | 15 | In this ingest, want to get all of categories 1 and 3. Could try to do this by 16 | dumping sandcrawler CDX table matching pdfs.s.o (which includes recent crawl), 17 | and join that against the ingest request list. 18 | 19 | For other random web URLs, can do the usual persist/backfill/recrawl pipeline. 
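The sandcrawler-db side of that CDX dump might look roughly like the sketch
below (column names as used by other queries in this repo; the URL pattern and
output path are illustrative):

    COPY (
        SELECT cdx.url, cdx.datetime, cdx.sha1hex
        FROM cdx
        WHERE cdx.url LIKE '%//pdfs.semanticscholar.org/%'
    )
    TO '/srv/sandcrawler/tasks/s2_pdfs_cdx.tsv'
    WITH NULL '';
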
20 | 21 | ## Create Seedlist 22 | 23 | zcat s2-corpus-pdfUrls.json.gz | parallel -j5 --linebuffer --round-robin --pipe ./s2_ingestrequest.py - | pv -l | gzip > s2-corpus-pdfUrls.2019.ingest_request.json.gz 24 | zcat s2-corpus-s2PdfUrl.json.gz | parallel -j5 --linebuffer --round-robin --pipe ./s2_ingestrequest.py - | pv -l | gzip > s2-corpus-s2PdfUrl.2019.ingest_request.json.gz 25 | 26 | zcat s2-corpus-s2PdfUrl.json.gz | jq .id -r | sort -u -S 2G > s2-corpus-s2PdfUrl.id_list 27 | zcat s2-corpus-pdfUrls.json.gz | jq .id -r | sort -u -S 2G > s2-corpus-pdfUrls.id_list 28 | 29 | zcat s2-corpus-pdfUrls.2019.ingest_request.json.gz s2-corpus-s2PdfUrl.2019.ingest_request.json.gz | rg pdfs.semanticscholar.org | sort -u -S 3G | gzip > s2_hosted_ingestrequest.json.gz 30 | zcat s2-corpus-pdfUrls.2019.ingest_request.json.gz s2-corpus-s2PdfUrl.2019.ingest_request.json.gz | rg -v pdfs.semanticscholar.org | sort -u -S 3G | gzip > s2_external_ingestrequest.json.gz 31 | 32 | zcat s2_external_ingestrequest.json.gz | wc -l 33 | 41201427 34 | zcat s2_hosted_ingestrequest.json.gz | wc -l 35 | 23345761 36 | -------------------------------------------------------------------------------- /scalding/README.md: -------------------------------------------------------------------------------- 1 | This directory contains Hadoop map/reduce jobs written in Scala (compiled to 2 | the JVM) using the Scalding framework. Scalding builds on the Java Cascading 3 | library, which itself builds on the Java Hadoop libraries. 4 | 5 | See the other markdown files in this directory for more background and tips. 6 | 7 | ## Dependencies 8 | 9 | To develop locally, you need to have the JVM (eg, OpenJDK 1.8), `sbt` build 10 | tool, and might need (exactly) Scala version 2.11.8. 11 | 12 | On a debian/ubuntu machine: 13 | 14 | echo "deb https://dl.bintray.com/sbt/debian /" | sudo tee -a /etc/apt/sources.list.d/sbt.list 15 | sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 2EE0EA64E40A89B84B2DF73499E82A75642AC823 16 | sudo apt-get update 17 | sudo apt install scala sbt 18 | 19 | It's also helpful to have a local copy of the `hadoop` binary for running 20 | benchmarks. The `fetch_hadoop.sh` script in the top level directory will fetch 21 | an appropriate version. 22 | 23 | ## Building and Running 24 | 25 | You can run `sbt` commands individually: 26 | 27 | # run all test 28 | sbt test 29 | 30 | # build a jar (also runs tests) 31 | sbt assembly 32 | 33 | Or you can start a session and run commands within that, which is *much* 34 | faster: 35 | 36 | sbt -mem 2048 37 | 38 | sbt> test 39 | sbt> assembly 40 | sbt> testOnly sandcrawler.SomeTestClassName 41 | 42 | On the cluster, you usually use the `please` script to kick off jobs. Be sure 43 | to build the jars first, or pass `--rebuild` to do it automatically. You need 44 | `hadoop` on your path for this. 45 | 46 | ## Troubleshooting 47 | 48 | If your `sbt` task fails with this error: 49 | 50 | java.util.concurrent.ExecutionException: java.lang.OutOfMemoryError: Metaspace 51 | 52 | try restarting `sbt` with more memory (e.g., `sbt -mem 2048`). 53 | 54 | See `scalding-debugging.md` or maybe `../notes/` for more. 
55 | -------------------------------------------------------------------------------- /sql/stats/2021-04-08_table_sizes.txt: -------------------------------------------------------------------------------- 1 | 2 | ## SQL Table Sizes 3 | 4 | Size: 467.23G 5 | 6 | SELECT 7 | table_name, 8 | pg_size_pretty(table_size) AS table_size, 9 | pg_size_pretty(indexes_size) AS indexes_size, 10 | pg_size_pretty(total_size) AS total_size 11 | FROM ( 12 | SELECT 13 | table_name, 14 | pg_table_size(table_name) AS table_size, 15 | pg_indexes_size(table_name) AS indexes_size, 16 | pg_total_relation_size(table_name) AS total_size 17 | FROM ( 18 | SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name 19 | FROM information_schema.tables 20 | WHERE table_schema = 'public' 21 | ) AS all_tables 22 | ORDER BY total_size DESC 23 | ) AS pretty_sizes; 24 | 25 | table_name | table_size | indexes_size | total_size 26 | -------------------------------+------------+--------------+------------ 27 | "public"."cdx" | 49 GB | 26 GB | 76 GB 28 | "public"."grobid" | 69 GB | 6834 MB | 75 GB 29 | "public"."grobid_shadow" | 67 GB | 5455 MB | 73 GB 30 | "public"."ingest_request" | 39 GB | 32 GB | 70 GB 31 | "public"."ingest_file_result" | 32 GB | 29 GB | 60 GB 32 | "public"."file_meta" | 32 GB | 21 GB | 53 GB 33 | "public"."pdf_meta" | 18 GB | 3733 MB | 22 GB 34 | "public"."fatcat_file" | 12 GB | 6602 MB | 18 GB 35 | "public"."shadow" | 9517 MB | 8026 MB | 17 GB 36 | "public"."html_meta" | 1196 MB | 8072 kB | 1204 MB 37 | "public"."petabox" | 403 MB | 461 MB | 864 MB 38 | "public"."pdftrio" | 550 MB | 297 MB | 847 MB 39 | (12 rows) 40 | 41 | -------------------------------------------------------------------------------- /notes/tasks/2021-09-09_pdf_url_lists.md: -------------------------------------------------------------------------------- 1 | 2 | Want to dump a URL list to share with partners, filtered to content we think is 3 | likely to be scholarly. 4 | 5 | Columns to include: 6 | 7 | - original URL 8 | - capture timestamp 9 | - SHA1 10 | 11 | ## Stats Overview 12 | 13 | file_meta table, mimetype=application/pdf: 173,816,433 14 | 15 | cdx table, mimetype=application/pdf: 131,346,703 16 | 17 | ingest_file_result table, pdf, success: 66,487,928 18 | 19 | ## Ingested PDF URLs 20 | 21 | "Ingested" URLs: ingest_file_result table, pdf and hit=true; include base URL also? 
22 | 23 | COPY ( 24 | SELECT 25 | base_url as start_url, 26 | terminal_url as pdf_url, 27 | terminal_dt as pdf_url_timestamp, 28 | terminal_sha1hex as pdf_sha1hex 29 | FROM ingest_file_result 30 | WHERE 31 | ingest_type = 'pdf' 32 | AND status = 'success' 33 | ) 34 | TO '/srv/sandcrawler/tasks/wayback_pdf_targeted.2021-09-09.tsv' 35 | WITH NULL ''; 36 | => 77,892,849 37 | 38 | ## CDX PDFs 39 | 40 | "All web PDFs": CDX query; left join file_meta, but don't require 41 | 42 | COPY ( 43 | SELECT 44 | cdx.url as pdf_url, 45 | cdx.datetime as pdf_url_timestamp, 46 | cdx.sha1hex as pdf_sha1hex 47 | FROM cdx 48 | LEFT JOIN file_meta 49 | ON 50 | cdx.sha1hex = file_meta.sha1hex 51 | WHERE 52 | file_meta.mimetype = 'application/pdf' 53 | OR ( 54 | file_meta.mimetype IS NULL 55 | AND cdx.mimetype = 'application/pdf' 56 | ) 57 | ) 58 | TO '/srv/sandcrawler/tasks/wayback_pdf_speculative.2021-09-09.tsv' 59 | WITH NULL ''; 60 | => 147,837,935 61 | 62 | ## Processed web PDFs 63 | 64 | "Parsed web PDFs": `file_meta`, left join CDX 65 | 66 | (didn't do this one) 67 | 68 | --- 69 | 70 | Uploaded all these to 71 | -------------------------------------------------------------------------------- /sql/dump_reingest_old.sql: -------------------------------------------------------------------------------- 1 | 2 | BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; 3 | 4 | COPY ( 5 | SELECT row_to_json(ingest_request.*) FROM ingest_request 6 | LEFT JOIN ingest_file_result ON 7 | ingest_file_result.base_url = ingest_request.base_url 8 | AND ingest_file_result.ingest_type = ingest_request.ingest_type 9 | WHERE 10 | ingest_file_result.hit = false 11 | AND ingest_request.created < NOW() - '6 day'::INTERVAL 12 | -- AND ingest_request.created > NOW() - '181 day'::INTERVAL 13 | AND (ingest_request.ingest_request_source = 'fatcat-changelog' 14 | OR ingest_request.ingest_request_source = 'fatcat-ingest' 15 | OR ingest_request.ingest_request_source = 'fatcat-ingest-container' 16 | OR ingest_request.ingest_request_source = 'unpaywall' 17 | OR ingest_request.ingest_request_source = 'arxiv' 18 | OR ingest_request.ingest_request_source = 'pmc' 19 | OR ingest_request.ingest_request_source = 'doaj' 20 | OR ingest_request.ingest_request_source = 'dblp') 21 | AND ( 22 | ingest_file_result.status like 'spn2-%' 23 | -- OR ingest_file_result.status like 'no-capture' 24 | -- OR ingest_file_result.status like 'cdx-error' 25 | -- OR ingest_file_result.status like 'petabox-error' 26 | ) 27 | AND ingest_file_result.status != 'spn2-error:invalid-url-syntax' 28 | AND ingest_file_result.status != 'spn2-error:filesize-limit' 29 | AND ingest_file_result.status != 'spn2-error:not-found' 30 | AND ingest_file_result.status != 'spn2-error:blocked-url' 31 | AND ingest_file_result.status != 'spn2-error:too-many-redirects' 32 | AND ingest_file_result.status != 'spn2-error:network-authentication-required' 33 | AND ingest_file_result.status != 'spn2-error:unknown' 34 | ) TO '/srv/sandcrawler/tasks/reingest_old_current.rows.json'; 35 | 36 | ROLLBACK; 37 | -------------------------------------------------------------------------------- /sql/backfill/backfill_grobid_unpaywall.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | This is a "one-time" transform helper script for GROBID backfill into 4 | sandcrawler minio and postgresql.
5 | 6 | This variant of backfill_grobid.py pushes into the unpaywall bucket of 7 | sandcrawler-minio and doesn't push anything to sandcrawler table in general. 8 | """ 9 | 10 | import json, os, sys, collections, io 11 | import base64 12 | import requests 13 | from minio import Minio 14 | import psycopg2 15 | import psycopg2.extras 16 | 17 | 18 | def b32_hex(s): 19 | s = s.strip().split()[0].lower() 20 | if s.startswith("sha1:"): 21 | s = s[5:] 22 | if len(s) != 32: 23 | return s 24 | return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8') 25 | 26 | def stdin_to_minio(): 27 | mc = Minio('localhost:9000', 28 | access_key=os.environ['MINIO_ACCESS_KEY'], 29 | secret_key=os.environ['MINIO_SECRET_KEY'], 30 | secure=False) 31 | counts = collections.Counter({'total': 0}) 32 | for l in sys.stdin: 33 | if counts['raw_lines'] > 0 and counts['raw_lines'] % 10000 == 0: 34 | print("Progress: {}...".format(counts)) 35 | counts['raw_lines'] += 1 36 | l = l.strip() 37 | if not l: 38 | continue 39 | row = json.loads(l) 40 | if not row: 41 | continue 42 | sha1hex = b32_hex(row['pdf_hash']) 43 | grobid_xml = row['tei_xml'].encode('utf-8') 44 | grobid_xml_len = len(grobid_xml) 45 | grobid_xml = io.BytesIO(grobid_xml) 46 | 47 | key = "grobid/{}/{}/{}.tei.xml".format( 48 | sha1hex[0:2], 49 | sha1hex[2:4], 50 | sha1hex) 51 | mc.put_object("unpaywall", key, grobid_xml, grobid_xml_len, 52 | content_type="application/tei+xml", 53 | metadata=None) 54 | counts['minio-success'] += 1 55 | 56 | print("Done: {}".format(counts)) 57 | 58 | if __name__=='__main__': 59 | stdin_to_minio() 60 | -------------------------------------------------------------------------------- /scalding/src/main/scala/sandcrawler/GroupFatcatWorksSubsetJob.scala: -------------------------------------------------------------------------------- 1 | package sandcrawler 2 | 3 | import cascading.pipe.Pipe 4 | import com.twitter.scalding.Args 5 | import com.twitter.scalding.Stat 6 | import com.twitter.scalding.TypedPipe 7 | import com.twitter.scalding.TypedTsv 8 | import parallelai.spyglass.base.JobBase 9 | 10 | class GroupFatcatWorksSubsetJob(args: Args) extends JobBase(args) { 11 | 12 | val fatcatLhsRowCount = Stat("fatcat-rows-filtered-left", "sandcrawler") 13 | val fatcatRhsRowCount = Stat("fatcat-rows-filtered-right", "sandcrawler") 14 | val joinedRowCount = Stat("joined-rows", "sandcrawler") 15 | 16 | val fatcatScorableLhs : Scorable = new FatcatScorable() 17 | val fatcatPipeLhs : TypedPipe[(String, ReduceFeatures)] = fatcatScorableLhs 18 | .getInputPipe(args) 19 | .map { r => 20 | fatcatLhsRowCount.inc 21 | r 22 | } 23 | 24 | val fatcatScorableRhs : Scorable = new FatcatScorableRight() 25 | val fatcatPipeRhs : TypedPipe[(String, ReduceFeatures)] = fatcatScorableRhs 26 | .getInputPipe(args) 27 | .map { r => 28 | fatcatRhsRowCount.inc 29 | r 30 | } 31 | 32 | val joinedPipe = fatcatPipeLhs 33 | .addTrap(TypedTsv(args("output") + ".trapped")) 34 | .join(fatcatPipeRhs) 35 | 36 | // TypedTsv doesn't work over case classes. 
37 | joinedPipe 38 | // filter out trivial self-matches (releases are identical) 39 | .filter { case (slug, (fatcatFeaturesLeft, fatcatFeaturesRight)) => 40 | Scorable.selfMatchable(fatcatFeaturesLeft, fatcatFeaturesRight) 41 | } 42 | .map { case (slug, (fatcatFeaturesLeft, fatcatFeaturesRight)) => 43 | joinedRowCount.inc 44 | new ReduceOutput( 45 | slug, 46 | Scorable.computeSimilarity(fatcatFeaturesLeft, fatcatFeaturesRight), 47 | fatcatFeaturesLeft.json, 48 | fatcatFeaturesRight.json) 49 | } 50 | .map { entry => (entry.slug, entry.score, entry.json1, entry.json2) } 51 | .write(TypedTsv[(String, Int, String, String)](args("output"))) 52 | } 53 | -------------------------------------------------------------------------------- /scalding/src/test/scala/example/SimpleHBaseSourceExampleTest.scala: -------------------------------------------------------------------------------- 1 | package example 2 | 3 | import org.junit.runner.RunWith 4 | import com.twitter.scalding.{JobTest, TupleConversions} 5 | import org.scalatest.FunSpec 6 | import org.scalatest.junit.JUnitRunner 7 | import org.slf4j.LoggerFactory 8 | import org.apache.hadoop.hbase.io.ImmutableBytesWritable 9 | import cascading.tuple.{Tuple, Fields} 10 | import org.apache.hadoop.hbase.util.Bytes 11 | import scala._ 12 | import com.twitter.scalding.Tsv 13 | import parallelai.spyglass.hbase.HBaseSource 14 | import parallelai.spyglass.hbase.HBaseConstants.SourceMode 15 | 16 | /** 17 | * Example of how to define tests for HBaseSource 18 | */ 19 | @RunWith(classOf[JUnitRunner]) 20 | class SimpleHBaseSourceExampleTest extends FunSpec with TupleConversions { 21 | 22 | val output = "/tmp/testOutput" 23 | 24 | val log = LoggerFactory.getLogger(this.getClass.getName) 25 | 26 | val sampleData = List( 27 | List("1", "kk1", "pp1"), 28 | List("2", "kk2", "pp2"), 29 | List("3", "kk3", "pp3") 30 | ) 31 | 32 | JobTest("example.SimpleHBaseSourceExample") 33 | .arg("test", "") 34 | .arg("app.conf.path", "app.conf") 35 | .arg("output", output) 36 | .arg("debug", "true") 37 | .source[Tuple]( 38 | new HBaseSource( 39 | "table_name", 40 | "mtrcs-zk1.us.archive.org:2181", 41 | new Fields("key"), 42 | List("column_family"), 43 | List(new Fields("column_name1", "column_name2")), 44 | sourceMode = SourceMode.GET_LIST, keyList = List("1", "2", "3")), 45 | sampleData.map(l => new Tuple(l.map(s => {new ImmutableBytesWritable(Bytes.toBytes(s))}):_*))) 46 | .sink[Tuple](Tsv(output format "get_list")) { 47 | outputBuffer => 48 | log.debug("Output => " + outputBuffer) 49 | 50 | it("should return the test data provided.") { 51 | println("outputBuffer.size => " + outputBuffer.size) 52 | assert(outputBuffer.size === 3) 53 | } 54 | } 55 | .run 56 | .finish 57 | 58 | } 59 | -------------------------------------------------------------------------------- /extra/nginx/sandcrawler-minio: -------------------------------------------------------------------------------- 1 | 2 | server { 3 | listen 80; 4 | listen [::]:80; 5 | listen 443 ssl http2; 6 | listen [::]:443 ssl http2; 7 | server_name sandcrawler-minio.fatcat.wiki minio.sandcrawler.org; 8 | 9 | ssl_certificate /etc/letsencrypt/live/sandcrawler-minio.fatcat.wiki/fullchain.pem; 10 | ssl_certificate_key /etc/letsencrypt/live/sandcrawler-minio.fatcat.wiki/privkey.pem; 11 | 12 | #add_header Content-Security-Policy "default-src 'self' 'unsafe-inline' 'unsafe-eval'; style-src 'self' 'unsafe-inline'"; 13 | add_header X-Frame-Options "SAMEORIGIN"; # 'always' if nginx > 1.7.5 14 | add_header X-Content-Type-Options "nosniff"; # 
'always' if nginx > 1.7.5 15 | add_header X-Xss-Protection "1"; 16 | # Enable STS with one year period (breaks http; optional) 17 | #add_header Strict-Transport-Security "max-age=31557600; includeSubDomains"; 18 | 19 | error_log /var/log/nginx/sandcrawler-errors.log; 20 | access_log /dev/null; 21 | 22 | if ($scheme = http) { 23 | return 301 https://$server_name$request_uri; 24 | } 25 | 26 | location /minio/ { 27 | 28 | # allows all HTTP verbs 29 | 30 | proxy_pass http://localhost:9000; 31 | proxy_redirect off; 32 | 33 | proxy_set_header X-Real-IP $remote_addr; 34 | proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; 35 | proxy_set_header Host $http_host; 36 | } 37 | 38 | location / { 39 | if ($request_method !~ "GET") { 40 | return 403; 41 | break; 42 | } 43 | 44 | proxy_pass http://localhost:9000; 45 | proxy_redirect off; 46 | 47 | proxy_set_header X-Real-IP $remote_addr; 48 | proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; 49 | proxy_set_header Host $http_host; 50 | } 51 | 52 | # Let's Encrypt SSL Certs 53 | location /.well-known/acme-challenge/ { 54 | root /var/www/letsencrypt; 55 | autoindex off; 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /notes/ingest/2020-02-18_ingest_backfills.md: -------------------------------------------------------------------------------- 1 | 2 | Select: 3 | 4 | COPY ( 5 | SELECT row_to_json(ingest_request.*) FROM ingest_request 6 | LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url 7 | WHERE ingest_request.ingest_type = 'pdf' 8 | AND ingest_file_result.ingest_type = 'pdf' 9 | AND ingest_file_result.updated < NOW() - '2 day'::INTERVAL 10 | AND ingest_file_result.hit = false 11 | AND ingest_file_result.status like 'spn2-error%' 12 | ) TO '/grande/snapshots/reingest_spn2err_20200218.rows.json'; 13 | => COPY 6537 14 | 15 | COPY ( 16 | SELECT row_to_json(ingest_request.*) FROM ingest_request 17 | LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url 18 | WHERE ingest_request.ingest_type = 'pdf' 19 | AND ingest_file_result.ingest_type = 'pdf' 20 | AND ingest_file_result.hit = false 21 | AND ingest_file_result.status like 'wayback-error' 22 | ) TO '/grande/snapshots/reingest_waybackerr_20200218.rows.json'; 23 | => COPY 33022 24 | 25 | Transform: 26 | 27 | ./scripts/ingestrequest_row2json.py reingest_spn2err_20200218.rows.json > reingest_spn2err_20200218.json 28 | ./scripts/ingestrequest_row2json.py reingest_waybackerr_20200218.rows.json > reingest_waybackerr_20200218.json 29 | 30 | Push to kafka: 31 | 32 | cat reingest_spn2err_20200218.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1 33 | cat reingest_waybackerr_20200218.json | jq . 
-c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1 34 | 35 | Many had null `ingest_request_source`, so won't actually import into fatcat: 36 | 37 | bnewbold@ia601101$ cat reingest_waybackerr_20200218.json | jq .ingest_request_source | sort | uniq -c | sort -n 38 | 1 "savepapernow-web" 39 | 112 "fatcat-ingest-container" 40 | 11750 "fatcat-changelog" 41 | 21159 null 42 | 43 | -------------------------------------------------------------------------------- /proposals/schema_changes.sql: -------------------------------------------------------------------------------- 1 | 2 | -- file_meta: more NOT NULL 3 | CREATE TABLE IF NOT EXISTS file_meta ( 4 | sha1hex TEXT NOT NULL PRIMARY KEY CHECK (octet_length(sha1hex) = 40), 5 | sha256hex TEXT NOT NULL CHECK (octet_length(sha256hex) = 64), 6 | md5hex TEXT NOT NULL CHECK (octet_length(md5hex) = 32), 7 | size_bytes BIGINT NOT NULL, 8 | mimetype TEXT CHECK (octet_length(mimetype) >= 1) 9 | ); 10 | 11 | -- CDX: add domain/host columns? 12 | CREATE TABLE IF NOT EXISTS cdx ( 13 | url TEXT NOT NULL CHECK (octet_length(url) >= 1), 14 | datetime TEXT NOT NULL CHECK (octet_length(datetime) = 14), 15 | sha1hex TEXT NOT NULL CHECK (octet_length(sha1hex) = 40), 16 | cdx_sha1hex TEXT CHECK (octet_length(cdx_sha1hex) = 40), 17 | mimetype TEXT CHECK (octet_length(mimetype) >= 1), 18 | warc_path TEXT CHECK (octet_length(warc_path) >= 1), 19 | warc_csize BIGINT, 20 | warc_offset BIGINT, 21 | row_created TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL, 22 | domain TEXT NOT NULL CHECK (octet_length(domain) >= 1), 23 | host TEXT NOT NULL CHECK (octet_length(host) >= 1), 24 | PRIMARY KEY(url, datetime) 25 | ); 26 | CREATE INDEX IF NOT EXISTS cdx_sha1hex_idx ON cdx(sha1hex); 27 | CREATE INDEX IF NOT EXISTS cdx_row_created_idx ON cdx(row_created); 28 | 29 | -- direct fast import with just md5hex; big UPDATE via join with file_meta 30 | CREATE TABLE IF NOT EXISTS shadow ( 31 | shadow_corpus TEXT NOT NULL CHECK (octet_length(shadow_corpus) >= 1), 32 | shadow_id TEXT NOT NULL CHECK (octet_length(shadow_id) >= 1), 33 | sha1hex TEXT CHECK (octet_length(sha1hex) = 40), 34 | md5hex TEXT CHECK (octet_length(md5hex) = 32), 35 | doi TEXT CHECK (octet_length(doi) >= 1), 36 | pmid TEXT CHECK (octet_length(pmid) >= 1), 37 | isbn13 TEXT CHECK (octet_length(isbn13) >= 1), 38 | PRIMARY KEY(shadow_corpus, shadow_id) 39 | ); 40 | CREATE INDEX shadow_sha1hex_idx ON shadow(sha1hex); 41 | -------------------------------------------------------------------------------- /notes/ingest/2022-07-19_dblp.md: -------------------------------------------------------------------------------- 1 | 2 | Cross-posting from fatcat bulk metadata update/ingest. 3 | 4 | zcat dblp_sandcrawler_ingest_requests.json.gz | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 5 | # 631k 0:00:11 [54.0k/s] 6 | 7 | 8 | ## Post-Crawl Stats 9 | 10 | This is after bulk ingest, crawl, and a bit of "live" re-ingest. 
Query run 11 | 2022-09-06: 12 | 13 | 14 | SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*) 15 | FROM ingest_request 16 | LEFT JOIN ingest_file_result 17 | ON ingest_file_result.ingest_type = ingest_request.ingest_type 18 | AND ingest_file_result.base_url = ingest_request.base_url 19 | WHERE 20 | ingest_request.link_source = 'dblp' 21 | GROUP BY ingest_request.ingest_type, status 22 | -- ORDER BY ingest_request.ingest_type, COUNT DESC 23 | ORDER BY COUNT DESC 24 | LIMIT 30; 25 | 26 | 27 | ingest_type | status | count 28 | -------------+-----------------------+-------- 29 | pdf | success | 305142 30 | pdf | no-pdf-link | 192683 31 | pdf | no-capture | 42634 32 | pdf | terminal-bad-status | 38041 33 | pdf | skip-url-blocklist | 31055 34 | pdf | link-loop | 9263 35 | pdf | wrong-mimetype | 4545 36 | pdf | redirect-loop | 3952 37 | pdf | empty-blob | 2705 38 | pdf | wayback-content-error | 834 39 | pdf | wayback-error | 294 40 | pdf | petabox-error | 202 41 | pdf | blocked-cookie | 155 42 | pdf | cdx-error | 115 43 | pdf | body-too-large | 66 44 | pdf | bad-redirect | 19 45 | pdf | timeout | 7 46 | pdf | bad-gzip-encoding | 4 47 | (18 rows) 48 | 49 | That is quite a lot of `no-pdf-link`, might be worth doing a random sample 50 | and/or re-ingest. And a chunk of `no-capture` to retry. 51 | -------------------------------------------------------------------------------- /notes/petabox_ia_metadata.txt: -------------------------------------------------------------------------------- 1 | 2 | Ran in aitio:/schnell/iamine-journals in December 2018. 3 | 4 | Output uploaded to https://archive.org/details/ia-petabox-journal-metadata-2018 5 | 6 | Commands: 7 | 8 | # didn't work! 9 | #ia-mine --search collection:journals --itemlist > journals.20181218.itemlist 10 | 11 | # fetched manually via metamgr, using prefix matches 12 | cat metamgr-* > metamgr-journals-loose.20181218.items 13 | 14 | ia-mine metamgr-journals-loose.20181218.items > journals.20181218.json 15 | 16 | export LC_ALL=C 17 | cat journals-ia.20181218.json | jq 'select(.files) | .files[] | select(.format == "Text PDF") | .sha1' -r | sort -S 4G -u > journals-ia.20181218.pdf-sha1.tsv 18 | 19 | Size/results: 20 | 21 | bnewbold@ia601101$ wc -l journals-ia.20181218.json metamgr-journals-loose.20181218.items 22 | 2043877 journals-ia.20181218.json 23 | 2044362 metamgr-journals-loose.20181218.items 24 | 25 | # missed about 500; meh 26 | 27 | -rw-rw-r-- 1 bnewbold bnewbold 9.5G Dec 19 23:26 journals-ia.20181218.json 28 | 29 | bnewbold@ia601101$ wc -l journals-ia.20181218.pdf-sha1.tsv 30 | 1748645 journals-ia.20181218.pdf-sha1.tsv 31 | 32 | ## June 2019 Ingest 33 | 34 | bnewbold@ia601101$ pwd 35 | /schnell/iamine-journals 36 | 37 | zcat journals-ia.20181218.json.gz | rg '"identifier": "arxiv-' > arxiv.json 38 | zcat journals-ia.20181218.json.gz | rg '"identifier": "jstor-' > jstor.json 39 | zcat journals-ia.20181218.json.gz | rg '"identifier": "paper-doi-10_' > paper-doi.json 40 | zcat journals-ia.20181218.json.gz | rg '"identifier": "pubmed-PMC' > pmc.json 41 | 42 | cat arxiv.json | ./ia_pdf_match.py > arxiv.match.json 43 | cat jstor.json | ./ia_pdf_match.py > jstor.match.json 44 | cat paper-doi.json | ./ia_pdf_match.py > paper-doi.match.json 45 | cat pmc.json | ./ia_pdf_match.py > pmc.match.json 46 | 47 | bnewbold@ia601101$ wc -l arxiv.*json jstor.*json paper-doi.*json pmc.*json 48 | 1076012 arxiv.json 49 | 740970 arxiv.match.json 50 | 451204 jstor.json 51 | 451204 jstor.match.json 52 | 77838 paper-doi.json 53 | 23736 
paper-doi.match.json 54 | 209787 pmc.json 55 | 189093 pmc.match.json 56 | 57 | -------------------------------------------------------------------------------- /python/sandcrawler/fileset_types.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | from typing import Any, Dict, List, Optional 3 | 4 | from pydantic import BaseModel 5 | 6 | 7 | class IngestStrategy(str, Enum): 8 | WebFile = "web-file" 9 | WebFileset = "web-fileset" 10 | WebFilesetBundled = "web-fileset-bundled" 11 | ArchiveorgFile = "archiveorg-file" 12 | ArchiveorgFileset = "archiveorg-fileset" 13 | ArchiveorgFilesetBundled = "archiveorg-fileset-bundled" 14 | 15 | 16 | class FilesetManifestFile(BaseModel): 17 | path: str 18 | size: Optional[int] 19 | md5: Optional[str] 20 | sha1: Optional[str] 21 | sha256: Optional[str] 22 | mimetype: Optional[str] 23 | extra: Optional[Dict[str, Any]] 24 | 25 | status: Optional[str] 26 | platform_url: Optional[str] 27 | terminal_url: Optional[str] 28 | terminal_dt: Optional[str] 29 | 30 | 31 | class FilesetPlatformItem(BaseModel): 32 | platform_name: str 33 | platform_status: str 34 | platform_domain: Optional[str] 35 | platform_id: Optional[str] 36 | manifest: Optional[List[FilesetManifestFile]] 37 | 38 | archiveorg_item_name: Optional[str] 39 | archiveorg_item_meta: Optional[dict] 40 | web_base_url: Optional[str] 41 | web_bundle_url: Optional[str] 42 | 43 | 44 | class ArchiveStrategyResult(BaseModel): 45 | ingest_strategy: str 46 | status: str 47 | manifest: List[FilesetManifestFile] 48 | file_file_meta: Optional[Dict[str, Any]] 49 | file_resource: Optional[Any] 50 | bundle_file_meta: Optional[Dict[str, Any]] 51 | bundle_resource: Optional[Any] 52 | bundle_archiveorg_path: Optional[str] 53 | 54 | 55 | class PlatformScopeError(Exception): 56 | """ 57 | For incidents where the platform helper discovers that the fileset/dataset is 58 | out-of-scope after already starting to process it.
59 | 60 | For example, attempting to ingest: 61 | 62 | - a 'latest version' record, when the platform has version-specific records 63 | - a single file within a dataset for a platform which has file-level identifiers 64 | """ 65 | 66 | pass 67 | 68 | 69 | class PlatformRestrictedError(Exception): 70 | """ 71 | When datasets are not publicly available on a platform (yet) 72 | """ 73 | 74 | pass 75 | -------------------------------------------------------------------------------- /sql/dump_reingest_weekly.sql: -------------------------------------------------------------------------------- 1 | 2 | BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; 3 | 4 | COPY ( 5 | SELECT row_to_json(ingest_request.*) FROM ingest_request 6 | LEFT JOIN ingest_file_result ON 7 | ingest_file_result.base_url = ingest_request.base_url 8 | AND ingest_file_result.ingest_type = ingest_request.ingest_type 9 | WHERE 10 | (ingest_request.ingest_type = 'pdf' 11 | OR ingest_request.ingest_type = 'html' 12 | OR ingest_request.ingest_type = 'xml' 13 | OR ingest_request.ingest_type = 'component') 14 | AND ingest_file_result.hit = false 15 | AND ingest_request.created < NOW() - '8 hour'::INTERVAL 16 | AND ingest_request.created > NOW() - '8 day'::INTERVAL 17 | AND (ingest_request.ingest_request_source = 'fatcat-changelog' 18 | OR ingest_request.ingest_request_source = 'fatcat-ingest' 19 | OR ingest_request.ingest_request_source = 'fatcat-ingest-container') 20 | AND ( 21 | ingest_file_result.status like 'spn2-%' 22 | -- OR ingest_file_result.status = 'cdx-error' 23 | -- OR ingest_file_result.status = 'wayback-error' 24 | -- OR ingest_file_result.status = 'wayback-content-error' 25 | OR ingest_file_result.status = 'petabox-error' 26 | -- OR ingest_file_result.status = 'gateway-timeout' 27 | OR ingest_file_result.status = 'no-capture' 28 | ) 29 | AND ingest_file_result.status != 'spn2-error:invalid-url-syntax' 30 | AND ingest_file_result.status != 'spn2-error:filesize-limit' 31 | AND ingest_file_result.status != 'spn2-error:not-found' 32 | AND ingest_file_result.status != 'spn2-error:blocked-url' 33 | AND ingest_file_result.status != 'spn2-error:too-many-redirects' 34 | AND ingest_file_result.status != 'spn2-error:network-authentication-required' 35 | AND ingest_file_result.status != 'spn2-error:unknown' 36 | ) TO '/srv/sandcrawler/tasks/reingest_weekly_current.rows.json'; 37 | 38 | -- bulk re-tries would be: 39 | -- AND (ingest_request.ingest_request_source != 'fatcat-changelog' 40 | -- AND ingest_request.ingest_request_source != 'fatcat-ingest') 41 | 42 | ROLLBACK; 43 | -------------------------------------------------------------------------------- /scalding/src/main/scala/sandcrawler/MissingColumnDumpJob.scala: -------------------------------------------------------------------------------- 1 | package sandcrawler 2 | 3 | import java.util.Properties 4 | 5 | import cascading.property.AppProps 6 | import cascading.tuple.Fields 7 | import com.twitter.scalding._ 8 | import com.twitter.scalding.typed.TDsl._ 9 | import parallelai.spyglass.base.JobBase 10 | import parallelai.spyglass.hbase.HBaseConstants.SourceMode 11 | import parallelai.spyglass.hbase.HBasePipeConversions 12 | import parallelai.spyglass.hbase.HBaseSource 13 | 14 | // This nasty, no-good, horrible Job outputs a list of keys ("sha1:A234...") 15 | // for which the given "column" does not have a value set. 16 | // It does this using a self-join because SpyGlass's HBase SCAN support seems 17 | // to be extremely limited. 
18 | class MissingColumnDumpJob(args: Args) extends JobBase(args) with HBasePipeConversions { 19 | 20 | val output = args("output") 21 | 22 | val allKeys : TypedPipe[String] = MissingColumnDumpJob.getHBaseKeySource( 23 | args("hbase-table"), 24 | args("zookeeper-hosts")) 25 | .read 26 | .fromBytesWritable('key) 27 | .toTypedPipe[String]('key) 28 | 29 | val existingKeys : TypedPipe[(String,Boolean)] = MissingColumnDumpJob.getHBaseColSource( 30 | args("hbase-table"), 31 | args("zookeeper-hosts"), 32 | args("column")) 33 | .read 34 | .fromBytesWritable('key) 35 | .toTypedPipe[String]('key) 36 | .map{ key => (key, true) } 37 | 38 | val missingKeys : TypedPipe[String] = allKeys 39 | .groupBy( identity ) 40 | .leftJoin(existingKeys.groupBy(_._1)) 41 | .toTypedPipe 42 | .collect { case (key, (_, None)) => key } 43 | 44 | missingKeys 45 | .write(TypedTsv[String](output)) 46 | 47 | } 48 | 49 | object MissingColumnDumpJob { 50 | 51 | // eg, "wbgrp-journal-extract-0-qa",7 "mtrcs-zk1.us.archive.org:2181" 52 | def getHBaseColSource(hbaseTable: String, zookeeperHosts: String, col: String) : HBaseSource = { 53 | HBaseBuilder.build( 54 | hbaseTable, 55 | zookeeperHosts, 56 | List(col), 57 | SourceMode.SCAN_ALL) 58 | } 59 | 60 | def getHBaseKeySource(hbaseTable: String, zookeeperHosts: String) : HBaseSource = { 61 | HBaseBuilder.build( 62 | hbaseTable, 63 | zookeeperHosts, 64 | List("f:c"), 65 | SourceMode.SCAN_ALL) 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /sql/pdftrio_queries.md: -------------------------------------------------------------------------------- 1 | 2 | ## Counts / Status 3 | 4 | SELECT status_code, COUNT(*) FROM pdftrio GROUP BY status_code; 5 | 6 | # NOTE: I earlier deleted a large fraction of non-200 status codes, so 7 | # these aren't representative 8 | status_code | count 9 | -------------+--------- 10 | -4 | 16 11 | -2 | 26 12 | 200 | 1117501 13 | 400 | 2695 14 | (4 rows) 15 | 16 | 17 | SELECT status, COUNT(*) FROM pdftrio GROUP BY status; 18 | 19 | status | count 20 | ---------------+--------- 21 | error | 2696 22 | error-connect | 26 23 | error-timeout | 16 24 | success | 1118252 25 | (4 rows) 26 | 27 | SELECT 28 | COUNT(CASE WHEN ensemble_score IS NOT NULL THEN 1 ELSE NULL END) as ensemble_count, 29 | COUNT(CASE WHEN linear_score IS NOT NULL THEN 1 ELSE NULL END) as linear_count, 30 | COUNT(CASE WHEN bert_score IS NOT NULL THEN 1 ELSE NULL END) as bert_count, 31 | COUNT(CASE WHEN image_score IS NOT NULL THEN 1 ELSE NULL END) as image_count 32 | FROM pdftrio; 33 | 34 | 35 | ensemble_count | linear_count | bert_count | image_count 36 | ----------------+--------------+------------+------------- 37 | 1120100 | 976271 | 66209 | 143829 38 | (1 row) 39 | 40 | ## Histograms 41 | 42 | SELECT width_bucket(ensemble_score * 100, 0.0, 100.0, 19) * 5 as buckets, count(*) FROM pdftrio 43 | WHERE status = 'success' 44 | AND ensemble_score IS NOT NULL 45 | GROUP BY buckets 46 | ORDER BY buckets; 47 | 48 | SELECT width_bucket(bert_score * 100, 0.0, 100.0, 19) * 5 as buckets, count(*) FROM pdftrio 49 | WHERE status = 'success' 50 | AND bert_score IS NOT NULL 51 | GROUP BY buckets 52 | ORDER BY buckets; 53 | 54 | SELECT width_bucket(linear_score * 100, 0.0, 100.0, 19) * 5 as buckets, count(*) FROM pdftrio 55 | WHERE status = 'success' 56 | AND linear_score IS NOT NULL 57 | GROUP BY buckets 58 | ORDER BY buckets; 59 | 60 | SELECT width_bucket(image_score * 100, 0.0, 100.0, 19) * 5 as buckets, count(*) FROM pdftrio 61 | WHERE status = 'success' 
62 | AND image_score IS NOT NULL 63 | GROUP BY buckets 64 | ORDER BY buckets; 65 | 66 | -------------------------------------------------------------------------------- /extra/RUNBOOK.md: -------------------------------------------------------------------------------- 1 | 2 | ## Process Un-GROBID-ed PDFs from Wayback 3 | 4 | Sometimes ingest doesn't pick up everything, or we do some heuristic CDX 5 | import, and we want to run GROBID over all the PDFs that haven't been processed 6 | yet. Only want one CDX line per `sha1hex`. 7 | 8 | A hybrid SQL/UNIX way of generating processing list: 9 | 10 | psql sandcrawler < /fast/sandcrawler/sql/dump_ungrobid_pdf.sql | sort -S 4G | uniq -w 40 | cut -f2 > dump_ungrobid_pdf.2020.01-27.json 11 | 12 | From here, there are two options: enqueue in Kafka and let workers run, or 13 | create job files and run them using local worker and GNU/parallel. 14 | 15 | #### Kafka 16 | 17 | Copy/transfer to a Kafka node; load a sample and then the whole output: 18 | 19 | head -n1000 dump_ungrobid_pdf.2020.01-27.json | kafkacat -P -b localhost -t sandcrawler-prod.ungrobided-pg -p -1 20 | cat dump_ungrobid_pdf.2020.01-27.json | kafkacat -P -b localhost -t sandcrawler-prod.ungrobided-pg -p -1 21 | 22 | #### Local JSON 23 | 24 | Older example; if this fails, need to re-run entire thing: 25 | 26 | cat /srv/sandcrawler/tasks/regrobid_cdx.split_*.json | pv -l | parallel -j40 --linebuffer --round-robin --pipe ./grobid_tool.py --kafka-env prod --kafka-hosts wbgrp-svc350.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --grobid-host http://localhost:8070 -j0 extract-json - 27 | 28 | TODO: is it possible to use job log with millions of `--pipe` inputs? That 29 | would be more efficient in the event of failure. 30 | 31 | ## GROBID over many .zip files 32 | 33 | Want to use GNU/Parallel in a mode that will do retries well: 34 | 35 | fd .zip /srv/sandcrawler/tasks/crossref-pre-1909-scholarly-works/ | \ 36 | sort | \ 37 | parallel -j16 --progress --joblog extract_tasks.log --resume-failed \ 38 | './grobid_tool.py --kafka-mode --kafka-env prod --kafka-hosts wbgrp-svc350.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --grobid-host http://localhost:8070 extract-zipfile {}' 39 | 40 | After starting, check that messages are actually getting pushed to kafka 41 | (producer failures can be silent!). If anything goes wrong, run the exact same 42 | command again. The sort is to ensure jobs are enqueued in the same order again; 43 | could also dump `fd` output to a command file first. 
44 | 45 | -------------------------------------------------------------------------------- /scalding/src/test/scala/sandcrawler/HBaseRowCountTest.scala: -------------------------------------------------------------------------------- 1 | package sandcrawler 2 | 3 | import cascading.tuple.Fields 4 | import cascading.tuple.Tuple 5 | import com.twitter.scalding.JobTest 6 | import com.twitter.scalding.Tsv 7 | import com.twitter.scalding.TupleConversions 8 | import org.apache.hadoop.hbase.io.ImmutableBytesWritable 9 | import org.apache.hadoop.hbase.util.Bytes 10 | import org.junit.runner.RunWith 11 | import org.scalatest.FunSpec 12 | import org.scalatest.junit.JUnitRunner 13 | import org.slf4j.LoggerFactory 14 | import parallelai.spyglass.hbase.HBaseConstants.SourceMode 15 | import parallelai.spyglass.hbase.HBaseSource 16 | import scala._ 17 | 18 | /** 19 | * Example of how to define tests for HBaseSource 20 | */ 21 | @RunWith(classOf[JUnitRunner]) 22 | class HBaseRowCountTest extends FunSpec with TupleConversions { 23 | 24 | val output = "/tmp/testOutput" 25 | val (testTable, testHost) = ("test-table", "dummy-host:2181") 26 | 27 | val log = LoggerFactory.getLogger(this.getClass.getName) 28 | 29 | val sampleData = List( 30 | List("sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q", "a", "b"), 31 | List("sha1:C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU", "a", "b"), 32 | List("sha1:SDKUVHC3YNNEGH5WAG5ZAAXWAEBNX4WT", "a", "b"), 33 | List("sha1:35985C3YNNEGH5WAG5ZAAXWAEBNXJW56", "a", "b"), 34 | List("sha1:885C3YNNEGH5WAG5ZAAXWA8BNXJWT6CZ", "a", "b"), 35 | List("sha1:00904C3YNNEGH5WAG5ZA9XWAEBNXJWT6", "a", "b"), 36 | List("sha1:249C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ", "a", "b"), 37 | List("sha1:095893C3YNNEGH5WAG5ZAAXWAEBNXJWT", "a", "b") 38 | ) 39 | 40 | JobTest("sandcrawler.HBaseRowCountJob") 41 | .arg("test", "") 42 | .arg("app.conf.path", "app.conf") 43 | .arg("output", output) 44 | .arg("hbase-table", testTable) 45 | .arg("zookeeper-hosts", testHost) 46 | .arg("debug", "true") 47 | .source[Tuple](HBaseRowCountJob.getHBaseSource(testTable, testHost), 48 | sampleData.map(l => new Tuple(l.map(s => {new ImmutableBytesWritable(Bytes.toBytes(s))}):_*))) 49 | .sink[Tuple](Tsv(output)) { 50 | outputBuffer => 51 | 52 | it("should return the test data provided.") { 53 | assert(outputBuffer.size === 1) 54 | } 55 | 56 | it("should return the correct count") { 57 | assert(outputBuffer(0).getObject(0) === 8) 58 | } 59 | } 60 | .run 61 | .finish 62 | 63 | } 64 | -------------------------------------------------------------------------------- /scalding/src/main/scala/sandcrawler/DumpUnGrobidedJob.scala: -------------------------------------------------------------------------------- 1 | package sandcrawler 2 | 3 | import java.util.Properties 4 | 5 | import cascading.property.AppProps 6 | import cascading.tuple.Fields 7 | import com.twitter.scalding._ 8 | import com.twitter.scalding.typed.TDsl._ 9 | import parallelai.spyglass.base.JobBase 10 | import parallelai.spyglass.hbase.HBaseConstants.SourceMode 11 | import parallelai.spyglass.hbase.HBasePipeConversions 12 | import parallelai.spyglass.hbase.HBaseSource 13 | 14 | // Filters for HBase rows which have not had GROBID run on them, but do have 15 | // full CDX metadata, and dumps to a TSV for later extraction by the 16 | // "extraction-ungrobided" job. 17 | // 18 | // Does the same horrible join thing that MissingColumnDumpJob does.
19 | class DumpUnGrobidedJob(args: Args) extends JobBase(args) with HBasePipeConversions { 20 | 21 | val output = args("output") 22 | 23 | val allKeys : TypedPipe[(String,String,String,String)] = DumpUnGrobidedJob.getHBaseKeySource( 24 | args("hbase-table"), 25 | args("zookeeper-hosts")) 26 | .read 27 | .fromBytesWritable('key, 'c, 'mime, 'cdx) 28 | .toTypedPipe[(String,String,String,String)]('key, 'c, 'mime, 'cdx) 29 | 30 | val existingKeys : TypedPipe[(String,Boolean)] = DumpUnGrobidedJob.getHBaseColSource( 31 | args("hbase-table"), 32 | args("zookeeper-hosts")) 33 | .read 34 | .fromBytesWritable('key) 35 | .toTypedPipe[String]('key) 36 | .map{ key => (key, true) } 37 | 38 | val missingKeys : TypedPipe[(String,String,String,String)] = allKeys 39 | .groupBy(_._1) 40 | .leftJoin(existingKeys.groupBy(_._1)) 41 | .toTypedPipe 42 | .collect { case (key, ((_, c, mime, cdx), None)) => (key, c, mime, cdx) } 43 | 44 | missingKeys 45 | .write(TypedTsv[(String,String,String,String)](output)) 46 | 47 | } 48 | 49 | object DumpUnGrobidedJob { 50 | 51 | // eg, "wbgrp-journal-extract-0-qa",7 "mtrcs-zk1.us.archive.org:2181" 52 | def getHBaseColSource(hbaseTable: String, zookeeperHosts: String) : HBaseSource = { 53 | HBaseBuilder.build( 54 | hbaseTable, 55 | zookeeperHosts, 56 | List("grobid0:status_code"), 57 | SourceMode.SCAN_ALL) 58 | } 59 | 60 | def getHBaseKeySource(hbaseTable: String, zookeeperHosts: String) : HBaseSource = { 61 | HBaseBuilder.build( 62 | hbaseTable, 63 | zookeeperHosts, 64 | List("f:c", "file:mime", "file:cdx"), 65 | SourceMode.SCAN_ALL) 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala: -------------------------------------------------------------------------------- 1 | 2 | package sandcrawler 3 | 4 | import cascading.flow.FlowDef 5 | import cascading.pipe.Pipe 6 | import cascading.tuple.Fields 7 | import com.twitter.scalding._ 8 | import com.twitter.scalding.typed.TDsl._ 9 | import org.apache.hadoop.hbase.io.ImmutableBytesWritable 10 | import org.apache.hadoop.hbase.util.Bytes 11 | import parallelai.spyglass.base.JobBase 12 | import parallelai.spyglass.hbase.HBaseConstants.SourceMode 13 | import parallelai.spyglass.hbase.HBasePipeConversions 14 | import parallelai.spyglass.hbase.HBaseSource 15 | 16 | class GrobidScorableDumpJob(args: Args) extends JobBase(args) { 17 | 18 | val grobidHbaseRows = Stat("hbase-rows-scanned", "hbase-grobid-dump") 19 | val filteredGrobidRows = Stat("grobid-rows-filtered", "hbase-grobid-dump") 20 | val parsedGrobidRows = Stat("grobid-rows-parsed", "hbase-grobid-dump") 21 | val validGrobidRows = Stat("grobid-rows-valid-slug", "hbase-grobid-dump") 22 | 23 | val pipe = GrobidScorable.getHBaseSource(args("hbase-table"), args("zookeeper-hosts")) 24 | .read 25 | // Can't just "fromBytesWritable" because we have multiple types? 26 | .toTypedPipe[(ImmutableBytesWritable,ImmutableBytesWritable,ImmutableBytesWritable)](new Fields("key", "metadata", "status_code")) 27 | .filter { case (_, metadata, status_code) => 28 | grobidHbaseRows.inc 29 | metadata != null && status_code != null 30 | } 31 | .map { case (key, metadata, status_code) => 32 | (Bytes.toString(key.copyBytes()), Bytes.toString(metadata.copyBytes()), Bytes.toLong(status_code.copyBytes())) 33 | } 34 | // TODO: Should I combine next two stages for efficiency? 
35 | .collect { case (key, json, 200) => 36 | filteredGrobidRows.inc 37 | (key, json) 38 | } 39 | .map { entry : (String, String) => 40 | parsedGrobidRows.inc 41 | GrobidScorable.jsonToMapFeatures(entry._1, entry._2) 42 | } 43 | .filterNot { entry => entry.isEmpty } 44 | .map { entry => { 45 | validGrobidRows.inc 46 | entry.get 47 | }} 48 | .groupBy { case MapFeatures(slug, json) => slug } 49 | .map { tuple => 50 | val (slug : String, features : MapFeatures) = tuple 51 | (slug, ReduceFeatures(features.json)) 52 | } 53 | 54 | pipe 55 | .map { case (slug, features) => 56 | (slug, features.json) 57 | } 58 | .write(TypedTsv[(String, String)](args("output"))) 59 | } 60 | -------------------------------------------------------------------------------- /kafka/debugging_issues.txt: -------------------------------------------------------------------------------- 1 | 2 | ## 2020-11-12 3 | 4 | To reset a consumer group to the offsets from a specific date (or datetime), 5 | use: 6 | 7 | ./kafka-consumer-groups.sh --bootstrap-server localhost:9092 --group persist-grobid-s3 --reset-offsets --all-topics --to-datetime 2020-11-09T00:00:00.000 8 | 9 | Add `--execute` to actually commit the change. 10 | 11 | ## 2018-12-02 12 | 13 | Had been having some troubles with consumer group partition assignments with 14 | the grobid-output group and grobid-hbase-insert consumer group. Tried deleting 15 | and re-creating, which was probably a mistake. Also tried to use kafka-broker 16 | shell scripts to cleanup/debug and didn't work well. 17 | 18 | In the end, after re-building the topic, decided to create a new consumer group 19 | (grobid-hbase-insert2) to get rid of history/crap. Might need to do this again 20 | in the future, oh well. 21 | 22 | A few things learned: 23 | 24 | - whatever pykafka "native python" is producing to consumer group offsets 25 | doesn't work great with kafka-manager or the shell scripts: consumer instance 26 | names don't show. this is an error in shell scripts, and blank/red in 27 | kafka-manager 28 | - restarting kafka-manager takes a while (for it to refresh data?) and it shows 29 | inconsistent stuff during that period, but it does result in cleaned up 30 | consumer group cached metadata (aka, old groups are cleared) 31 | - kafka-manager can't fetch JMX info, either due to lack of config or port 32 | blocking. should try to fix this for metrics etc 33 | - it would be nice to be using recent librdkafka everywhere. pykafka can 34 | optionally use this, and many other tools do automatically. however, this is 35 | a system package, and xenial doesn't have backports (debian stretch does). 36 | the version in bionic looks "good enough", so maybe should try that? 37 | - there has been a minor release of kafka (2.1) since I installed (!) 38 | - the burrow (consumer group monitoring) tool is packaged for some version of 39 | ubuntu 40 | 41 | In general, not feeling great about the current setup. Very frustrating that the 42 | debug/status tools are broken with pykafka native output. Need to at least 43 | document things a lot better. 44 | 45 | Separately, came up with an idea to do batched processing with GROBID: don't 46 | auto-commit, instead consume a batch (10? or until block), process those, then 47 | commit. This being a way to get "the batch size returned".
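A rough sketch of that batched manual-commit idea, assuming the confluent-kafka client (librdkafka bindings) rather than pykafka; the topic/group names and the process_batch() helper here are illustrative, not what the current workers actually use:

    from confluent_kafka import Consumer

    consumer = Consumer({
        "bootstrap.servers": "localhost:9092",
        "group.id": "grobid-hbase-insert2",   # illustrative group name
        "enable.auto.commit": False,          # commit manually, once per batch
        "auto.offset.reset": "earliest",
    })
    consumer.subscribe(["sandcrawler-prod.grobid-output"])  # illustrative topic

    while True:
        # block until up to 10 messages arrive, or the timeout expires
        batch = consumer.consume(num_messages=10, timeout=5.0)
        msgs = [m for m in batch if m.error() is None]
        if not msgs:
            continue
        process_batch(msgs)                   # hypothetical insert/processing step
        consumer.commit(asynchronous=False)   # offsets only advance after the whole batch succeeds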
48 | 49 | -------------------------------------------------------------------------------- /notes/tasks/2020-08-20_file_meta.md: -------------------------------------------------------------------------------- 1 | 2 | Want to update fatcat file entities with "full" file metadata for those which are missing it. 3 | 4 | How many `file_meta` rows *still* don't have metadata? 5 | 6 | SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL; 7 | => 62962 8 | 9 | First generate list of sha1hex from most recent bulk export which are missing 10 | at least some metadata (based on missing sha256): 11 | 12 | zcat file_hashes.tsv.gz | rg '\t\t' | cut -f3 | sort -u -S 4G | pv -l > fatcat_file_partial_sha1hex.tsv 13 | => 18.7M 0:05:46 [53.8k/s] 14 | 15 | Then dump the entire sandcrawler `file_meta` table as TSV, with first column 16 | sha1hex and second column JSON with all the file metadata fields: 17 | 18 | COPY ( 19 | SELECT sha1hex, row_to_json(file_meta) 20 | FROM file_meta 21 | WHERE sha256hex IS NOT NULL 22 | ORDER BY sha1hex ASC 23 | ) 24 | TO '/grande/snapshots/file_meta_dump.tsv' 25 | WITH NULL ''; 26 | 27 | Join/cut: 28 | 29 | export LC_ALL=C 30 | join -t$'\t' fatcat_file_partial_sha1hex.tsv /grande/snapshots/file_meta_dump.tsv | uniq -w 40 | cut -f2 | pv -l > fatcat_file_partial.file_meta.json 31 | => 18.1M 0:03:37 [83.2k/s] 32 | 33 | Check counts: 34 | 35 | cat fatcat_file_partial.file_meta.json | jq .sha1hex -r | sort -u -S 4G | wc -l 36 | => 18135313 37 | 38 | zcat fatcat_file_partial.file_meta.json.gz | jq .mimetype -r | sort -S 4G | uniq -c | sort -nr 39 | 18103860 application/pdf 40 | 29977 application/octet-stream 41 | 876 text/html 42 | 199 application/postscript 43 | 171 application/gzip 44 | 84 text/plain 45 | 48 application/xml 46 | 38 application/vnd.ms-powerpoint 47 | 16 application/msword 48 | 8 application/vnd.openxmlformats-officedocument.wordprocessingml.document 49 | 6 image/jpeg 50 | 4 message/rfc822 51 | 4 application/zip 52 | 4 application/vnd.openxmlformats-officedocument.presentationml.presentation 53 | 3 text/x-tex 54 | 3 application/x-dosexec 55 | 2 application/x-tar 56 | 2 application/vnd.ms-tnef 57 | 1 video/mpeg 58 | 1 image/tiff 59 | 1 image/svg+xml 60 | 1 image/png 61 | 1 image/gif 62 | 1 audio/x-ape 63 | 1 application/vnd.ms-office 64 | 1 application/CDFV2-unknown 65 | 66 | TODO: fatcat importer 67 | -------------------------------------------------------------------------------- /scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala: -------------------------------------------------------------------------------- 1 | package sandcrawler 2 | 3 | import java.io.InputStream 4 | 5 | import scala.io.Source 6 | 7 | import org.scalatest._ 8 | 9 | // scalastyle:off null 10 | class ScorableFeaturesTest extends FlatSpec with Matchers { 11 | "toMapFeatures()" should "work with gnarly inputs" in { 12 | ScorableFeatures.create(title = null).toMapFeatures 13 | ScorableFeatures.create(title = "something", doi = null, sha1 = null, year = 123).toMapFeatures 14 | } 15 | 16 | private def titleToSlug(s : String) : Option[String] = ScorableFeatures.create(title = s).toSlug 17 | 18 | "mapToSlug()" should "extract the parts of titles before a colon" in { 19 | titleToSlug("HELLO:there") shouldBe Some("hellothere") 20 | } 21 | 22 | it should "extract an entire colon-less string" in { 23 | titleToSlug("hello THERE") shouldBe Some("hellothere") 24 | } 25 | 26 | it should "return Scorable.NoSlug if given empty string" in { 27 | titleToSlug("") shouldBe (None) 28 | } 29 | 30 | it 
should "return Scorable.NoSlug if given null" in { 31 | titleToSlug(null) shouldBe (None) 32 | } 33 | 34 | it should "strip punctuation" in { 35 | titleToSlug("HELLO!:the:re") shouldBe Some("hellothere") 36 | titleToSlug("a:b:cdefgh") shouldBe Some("abcdefgh") 37 | titleToSlug( 38 | "If you're happy and you know it, clap your hands!") shouldBe Some("ifyourehappyandyouknowitclapyourhands") 39 | titleToSlug(":;\"\'") shouldBe (None) 40 | } 41 | 42 | it should "filter stub titles" in { 43 | titleToSlug("abstract") shouldBe (None) 44 | titleToSlug("title!") shouldBe (None) 45 | titleToSlug("a real title which is not on denylist") shouldBe Some("arealtitlewhichisnotondenylist") 46 | } 47 | 48 | it should "strip special characters" in { 49 | titleToSlug(":;!',|\"\'`.#?!-@*/\\=+~%$^{}()[]<>-_’·“”‘’“”«»「」¿–±§ʿ") shouldBe (None) 50 | // TODO: titleToSlug("©™₨№…") shouldBe (None) 51 | // TODO: titleToSlug("πµΣσ") shouldBe (None) 52 | } 53 | 54 | it should "remove whitespace" in { 55 | titleToSlug("foo bar : baz ::") shouldBe Some("foobarbaz") 56 | titleToSlug("\na\t:b:cdefghi") shouldBe Some("abcdefghi") 57 | titleToSlug("\n \t \r ") shouldBe (None) 58 | } 59 | 60 | it should "skip very short slugs" in { 61 | titleToSlug("short") shouldBe (None) 62 | titleToSlug("a longer, more in depth title") shouldBe Some("alongermoreindepthtitle") 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /python/tests/test_pdfextract.py: -------------------------------------------------------------------------------- 1 | import struct 2 | 3 | import poppler 4 | import pytest 5 | from test_wayback import cdx_client, wayback_client # noqa:F401 6 | 7 | from sandcrawler import BlackholeSink, CdxLinePusher, PdfExtractBlobWorker, PdfExtractWorker 8 | from sandcrawler.pdfextract import process_pdf 9 | 10 | FAKE_PDF_BYTES = b"%PDF SOME JUNK" + struct.pack("!q", 112853843) 11 | 12 | 13 | def test_process_fake_pdf(): 14 | resp = process_pdf(FAKE_PDF_BYTES) 15 | print(resp) 16 | assert resp.status == "not-pdf" 17 | 18 | with open("tests/files/dummy_zip.zip", "rb") as f: 19 | pdf_bytes = f.read() 20 | resp = process_pdf(pdf_bytes) 21 | assert resp.status == "not-pdf" 22 | 23 | 24 | @pytest.mark.skipif( 25 | poppler.version_string() == "0.71.0", reason="unsupported version of poppler" 26 | ) 27 | def test_process_dummy_pdf(): 28 | with open("tests/files/dummy.pdf", "rb") as f: 29 | pdf_bytes = f.read() 30 | resp = process_pdf(pdf_bytes) 31 | assert resp.status == "success" 32 | assert resp.page0_thumbnail is not None 33 | assert len(resp.text) > 10 34 | assert resp.meta_xml is None 35 | assert resp.file_meta["mimetype"] == "application/pdf" 36 | print(resp.pdf_info) 37 | print(resp.pdf_extra) 38 | assert resp.pdf_info["Author"] == "Evangelos Vlachogiannis" 39 | # 595 x 842 40 | assert resp.pdf_extra["page0_height"] == 842 41 | assert resp.pdf_extra["page0_width"] == 595 42 | assert resp.pdf_extra["page_count"] == 1 43 | 44 | 45 | def test_pdfextract_worker_cdx(wayback_client): # noqa: F811 46 | 47 | sink = BlackholeSink() 48 | worker = PdfExtractWorker(wayback_client, sink=sink, thumbnail_sink=sink) 49 | 50 | with open("tests/files/example.cdx", "r") as cdx_file: 51 | pusher = CdxLinePusher( 52 | worker, 53 | cdx_file, 54 | filter_http_statuses=[200, 226], 55 | filter_mimetypes=["application/pdf"], 56 | ) 57 | pusher_counts = pusher.run() 58 | assert pusher_counts["total"] 59 | assert pusher_counts["pushed"] == 7 60 | assert pusher_counts["pushed"] == worker.counts["total"] 61 | 62 | 
63 | def test_pdfextract_blob_worker(): 64 | 65 | sink = BlackholeSink() 66 | worker = PdfExtractBlobWorker(sink=sink, thumbnail_sink=sink) 67 | 68 | with open("tests/files/dummy.pdf", "rb") as f: 69 | pdf_bytes = f.read() 70 | 71 | worker.process(pdf_bytes) 72 | -------------------------------------------------------------------------------- /python/scripts/covid2ingestrequest.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Transform COVID-19 paper metadata (CNKI and Wanfang scrapes, JSON) into ingest requests. 4 | """ 5 | 6 | import argparse 7 | import json 8 | import sys 9 | 10 | import urlcanon 11 | 12 | 13 | def canon(s): 14 | parsed = urlcanon.parse_url(s) 15 | return str(urlcanon.whatwg(parsed)) 16 | 17 | 18 | def transform_cnki(obj): 19 | 20 | requests = [] 21 | assert obj["cnki_id"] 22 | 23 | requests = [] 24 | requests.append( 25 | { 26 | "base_url": canon(obj["info_url"]), 27 | "ingest_type": "pdf", 28 | "link_source": "cnki_covid19", 29 | "link_source_id": obj["cnki_id"], 30 | "ingest_request_source": "scrape-covid19", 31 | } 32 | ) 33 | if "read_url" in obj: 34 | requests.append( 35 | { 36 | "base_url": canon(obj["read_url"]), 37 | "ingest_type": "pdf", # actually HTML 38 | "link_source": "cnki_covid19", 39 | "link_source_id": obj["cnki_id"], 40 | "ingest_request_source": "scrape-covid19", 41 | } 42 | ) 43 | 44 | return requests 45 | 46 | 47 | def transform_wanfang(obj): 48 | 49 | assert obj["wanfang_id"] 50 | return [ 51 | { 52 | "base_url": canon(obj["url"]), 53 | "ingest_type": "pdf", 54 | "link_source": "wanfang_covid19", 55 | "link_source_id": obj["wanfang_id"], 56 | "ingest_request_source": "scrape-covid19", 57 | } 58 | ] 59 | 60 | 61 | def run(args): 62 | for l in args.json_file: 63 | if not l.strip(): 64 | continue 65 | row = json.loads(l) 66 | 67 | if "wanfang_id" in row: 68 | requests = transform_wanfang(row) or [] 69 | elif "cnki_id" in row: 70 | requests = transform_cnki(row) or [] 71 | else: 72 | continue 73 | for r in requests: 74 | print("{}".format(json.dumps(r, sort_keys=True))) 75 | 76 | 77 | def main(): 78 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 79 | parser.add_argument( 80 | "json_file", help="COVID-19 metadata file to use", type=argparse.FileType("r") 81 | ) 82 | subparsers = parser.add_subparsers() 83 | 84 | args = parser.parse_args() 85 | 86 | run(args) 87 | 88 | 89 | if __name__ == "__main__": 90 | main() 91 | -------------------------------------------------------------------------------- /TODO: -------------------------------------------------------------------------------- 1 | 2 | Note: as of 2022 this file is ancient and needs review 3 | 4 | ## Kafka Pipelines 5 | 6 | - after network split, mass restarting import/harvest stuff seemed to 7 | completely reset consumergroups (!). bunch of LeaderNotFoundError 8 | => change/update consumer group config 9 | => ensure we are recording timestamps to allow timestamp-based resets 10 | - refactor python kafka clients (slack convo with kenji+dvd) 11 | => try librdkafka? 12 | => switch to python-kafka? 13 | - monitoring/alerting of consumergroup offsets 14 | => start with crude python script? 15 | - document: need to restart all consumers after brokers restart 16 | - operate on batches, using threads/async, and reduce worker (process) counts 17 | dramatically 18 | 19 | source of kafka-manager weirdness?
20 | Dec 02 01:05:40 wbgrp-svc263.us.archive.org kafka-manager[7032]: org.apache.kafka.common.protocol.types.SchemaException: Error reading field 'user_data': java.nio.BufferUnderflowException 21 | Dec 02 01:05:40 wbgrp-svc263.us.archive.org kafka-manager[7032]: [error] k.m.a.c.KafkaManagedOffsetCache - Failed to get member metadata from group summary and member summary : grobid-hbase-insert : MemberSummary(pykafka-8128e0be-4952-4e79-8644-a52987421259,pykafka,/207.241.225.228,[B@6c368f37,[B@2b007e01) 22 | 23 | ## Other 24 | 25 | - paper match heuristic: include 10.1007%2F978-3-319-49304-6_18 (URL-escaped slash) 26 | - catch EOFFail fetching from wayback 27 | - "author counts match" in scoring 28 | - refactor "scorable" to "matchable" 29 | - look at refactoring to reduce JSON serializations 30 | - QA tool for matches (PDF + Crossref JSON + landing page?) 31 | => python; talks directly to HBase 32 | - author counts should match (+/- one?) 33 | 34 | match strategies (hbase columns): 35 | - legacy_doi 36 | - url_doi 37 | - grobid_crossref (doi) 38 | - grobid_fatcat (fatcat ID) 39 | 40 | scalding: 41 | - better JSON library 42 | - less verbose sbt test output (set log level to WARN) 43 | - auto-formatting: addSbtPlugin("com.geirsson" % "sbt-scalafmt" % "1.6.0-RC3") 44 | 45 | pig: 46 | - potentially want to *not* de-dupe CDX lines by uniq sha1 in all cases; run 47 | this as a second-stage filter? for example, may want many URL links in fatcat 48 | for a single file (different links, different policies) 49 | - fix pig gitlab-ci tests (JAVA_HOME) 50 | 51 | python: 52 | - include input file name (and chunk? and CDX?) in sentry context 53 | - how to get argument (like --hbase-table) into mrjob.conf, or similar? 54 | -------------------------------------------------------------------------------- /extra/blobs/tasks.md: -------------------------------------------------------------------------------- 1 | 2 | ## Backfill GROBID XML to Blob Store 3 | 4 | Initially ran this when spinning up new seaweedfs server to replace minio. At 5 | this time grobid persist worker was in db-only mode, as minio was too slow to 6 | accept uploads. Rough plan is to: 7 | 8 | 1. run grobid persist worker from Kafka with a new temporary consumer group, 9 | from the start of the GROBID output topic 10 | 2. when it gets to end, stop the *regular* consumer group while this one is 11 | still running. with temporary worker still running, at that point in time 12 | entire topic should be in S3 13 | 3. then reconfigure regular worker to db+s3 mode. halt the temporary worker, 14 | restart the regular one with new config, run it indefinitely 15 | 16 | Consumer group isn't an arg, so just edit `persist_worker.py` and set it to 17 | `persist-grobid-seaweedfs`. Also needed to patch a bit so `--s3-only` mode 18 | didn't try to connect to postgresql. 
19 | 20 | Commands: 21 | 22 | ./sandcrawler_worker.py --kafka-hosts wbgrp-svc350.us.archive.org:9092 --env prod --s3-bucket sandcrawler --s3-url wbgrp-svc169.us.archive.org:8333 persist-grobid --s3-only 23 | => Consuming from kafka topic sandcrawler-prod.grobid-output-pg, group persist-grobid-seaweed 24 | => run briefly, then kill 25 | 26 | On kafka-broker worker: 27 | 28 | ./kafka-consumer-groups.sh --bootstrap-server localhost:9092 --reset-offsets --to-earliest --group persist-grobid-seaweed --topic sandcrawler-prod.grobid-output-pg --dry-run 29 | 30 | Then run 2x instances of worker (same command as above): 31 | 32 | ./sandcrawler_worker.py --kafka-hosts wbgrp-svc350.us.archive.org:9092 --env prod --s3-bucket sandcrawler --s3-url wbgrp-svc169.us.archive.org:8333 persist-grobid --s3-only 33 | 34 | At this point CPU-limited on this worker by the python processes (only 4 cores 35 | on this machine). 36 | 37 | Check in weed shell: 38 | 39 | weed shell 40 | 41 | > > fs.meta.cat buckets/sandcrawler/grobid/00/00/000068a76ab125389506e8834483c6ba4c73338a.tei.xml 42 | [...] 43 | "isGzipped": false 44 | [...] 45 | "mime": "application/xml", 46 | [...] 47 | 48 | An open question is if we should have separate buckets per derive type. Eg, a 49 | GROBID XML bucket separate from thumbnails bucket. Or are prefix directories 50 | enough. Basically this comes down to whether we want things mixed together at 51 | the volume level. I think we should keep separate. 52 | 53 | Need to set the mimetype in the upload for gzip on XML? 54 | --------------------------------------------------------------------------------