├── scalding ├── ia_cluster.conf ├── project │ ├── build.properties │ ├── Dependencies.scala │ └── plugins.sbt ├── .gitignore ├── src │ ├── main │ │ └── scala │ │ │ ├── sandcrawler │ │ │ ├── HBaseMimeCountJob.scala │ │ │ ├── HBaseRowCountJob.scala │ │ │ ├── HBaseColCountJob.scala │ │ │ ├── HBaseStatusCountJob.scala │ │ │ ├── HBaseStatusCodeCountJob.scala │ │ │ ├── HBaseCountJob.scala │ │ │ ├── MatchBenchmarkJob.scala │ │ │ ├── DumpGrobidStatusCodeJob.scala │ │ │ ├── DumpFileMetaJob.scala │ │ │ ├── GroupFatcatWorksJob.scala │ │ │ ├── BibjsonScorable.scala │ │ │ ├── DumpGrobidXmlJob.scala │ │ │ ├── ScoreJob.scala │ │ │ ├── DumpGrobidMetaInsertableJob.scala │ │ │ ├── GroupFatcatWorksSubsetJob.scala │ │ │ ├── MissingColumnDumpJob.scala │ │ │ ├── DumpUnGrobidedJob.scala │ │ │ └── GrobidScorableDumpJob.scala │ │ │ └── example │ │ │ ├── WordCountJob.scala │ │ │ └── SimpleHBaseSourceExample.scala │ └── test │ │ └── scala │ │ ├── example │ │ ├── WordCountTest.scala │ │ └── SimpleHBaseSourceExampleTest.scala │ │ └── sandcrawler │ │ ├── HBaseBuilderTest.scala │ │ ├── HBaseRowCountTest.scala │ │ └── ScorableFeaturesTest.scala └── README.md ├── pig ├── .gitignore ├── pytest.ini ├── tests │ ├── pig.properties │ ├── files │ │ ├── example.sha1b32 │ │ ├── papers_url_doi.cdx │ │ ├── tarballs.cdx │ │ ├── sourcecode.cdx │ │ ├── papers_domain_words.cdx │ │ └── papers_edu_tilde.cdx │ ├── test_filter_cdx.py │ ├── log4j.properties │ ├── test_filter_software.py │ ├── test_filter_cdx_paper_pdfs.py │ └── test_join_cdx.py ├── Pipfile ├── hbase-count-rows.pig ├── filter-cdx-pdfs.pig ├── filter-cdx-ps.pig ├── README.md ├── filter-cdx-tarball.pig ├── filter-cdx-source-code-crude.pig ├── filter-cdx-join-urls.pig ├── join-cdx-sha1.pig └── filter-cdx-paper-pdfs.pig ├── notes ├── ingest │ ├── .gitignore │ ├── 2019-10-23_testing.md │ ├── 2020-05_pubmed.md │ ├── 2020-11-04_arxiv.md │ ├── 2020-09_scielo.md │ ├── es_csv_to_json.py │ ├── 2020-03-oa_but_not_marked.md │ ├── 2023-10_dimensions.md │ ├── 2020-01-14_bulk.md │ ├── 2022-03_oaipmh.md │ ├── NEXT.md │ ├── 2020-03_s2.md │ ├── 2020-02-18_ingest_backfills.md │ └── 2022-07-19_dblp.md ├── hbase_table_sizes.txt ├── library_shopping.txt ├── dryad_datasets.md ├── examples │ ├── random_datasets.md │ └── dataset_examples.txt ├── backfill_scalding_rewrite.txt ├── crawl_cdx_merge.md ├── possible_ingest_targets.txt ├── tasks │ ├── 2022-01-07_grobid_platform_pdfs.md │ ├── 2020-01-27_cleanup_cdx.md │ ├── 2020-01-06_heuristic_cdx.txt │ ├── 2021-09-09_pdf_url_lists.md │ └── 2020-08-20_file_meta.md ├── match_filter_enrich.txt ├── old_extract_results.txt └── petabox_ia_metadata.txt ├── sql ├── sandcrawler_schema.sql ├── example.env ├── dump_file_meta.sql ├── migrations │ ├── 2019-12-19-060141_init │ │ └── down.sql │ └── 00000000000000_diesel_initial_setup │ │ ├── down.sql │ │ └── up.sql ├── dump_regrobid_pdf.sql ├── dump_unmatched_glutton_pdf.sql ├── dump_unextracted_pdf_petabox.sql ├── dump_regrobid_pdf_petabox.sql ├── reingest_spn.sh ├── backfill │ ├── petabox_transform.py │ ├── backfill_file_meta.py │ └── backfill_grobid_unpaywall.py ├── dump_ungrobid_pdf_petabox.sql ├── dump_ungrobid_pdf.sql ├── reingest_old.sh ├── reingest_bulk.sh ├── reingest_terminalstatus_forcerecrawl.sh ├── dump_unextracted_pdf.sql ├── table_sizes.md ├── reingest_weekly.sh ├── reingest_quarterly.sh ├── stats │ ├── 2021-11-01_table_sizes.txt │ ├── 2021-12-02_table_sizes.txt │ ├── 2020-01-31_supplement.txt │ ├── 2022-11-23_table_sizes.txt │ └── 2021-04-08_table_sizes.txt ├── Makefile ├── dump_reingest_bulk.sql ├── 
dump_reingest_terminalstatus.sql ├── dump_reingest_spn.sql ├── dump_reingest_old.sql ├── dump_reingest_weekly.sql └── pdftrio_queries.md ├── python ├── title_slug_denylist.txt ├── .coveragerc ├── tests │ ├── files │ │ ├── dummy.pdf │ │ ├── dummy_zip.zip │ │ ├── scielo_article.jats.xml │ │ ├── genders_g58_fairlie.html │ │ └── small.json │ ├── test_html.py │ ├── test_html_ingest.py │ ├── test_ingest_html.py │ ├── test_xml.py │ ├── test_grobid2json.py │ ├── test_pushers.py │ └── test_pdfextract.py ├── pyproject.toml ├── .gitignore ├── sandcrawler │ ├── xml.py │ ├── __init__.py │ └── fileset_types.py ├── example.env ├── .pylintrc ├── .flake8 ├── Makefile ├── pytest.ini ├── scripts │ ├── pdf_thumbnail.py │ ├── enrich_scored_matches.py │ ├── ingestrequest_row2json.py │ ├── manifest_converter.py │ ├── grobid_affiliations.py │ └── covid2ingestrequest.py ├── Pipfile └── README.md ├── CONTRIBUTORS ├── extra ├── blobs │ ├── seaweedfs │ │ └── README.md │ ├── minio │ │ └── minio.conf │ └── tasks.md ├── docker │ ├── README.md │ └── docker-compose.yml ├── nginx │ ├── README.md │ ├── fatcat-blobs │ └── sandcrawler-minio ├── hbase │ └── howto.md └── RUNBOOK.md ├── kafka ├── monitoring_commands.md ├── howto_rebalance.md └── debugging_issues.txt ├── proposals ├── brainstorm │ ├── 2021-debug_web_interface.md │ └── 2022-04-18_automated_heritrix_crawling.md ├── 2021-09-21_spn_accounts.md ├── 2021-09-13_src_ingest.md ├── 20201012_no_capture.md └── schema_changes.sql ├── .gitignore ├── python_hadoop ├── mrjob.conf ├── tests │ ├── test_grobid2json.py │ └── files │ │ └── small.json └── Pipfile ├── match_test_data ├── NOTES.txt └── RESULTS.txt ├── Dockerfile.sandcrawler-pytest ├── fetch_hadoop.sh ├── .gitlab-ci.yml └── TODO /scalding/ia_cluster.conf: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pig/.gitignore: -------------------------------------------------------------------------------- 1 | deps 2 | *.log 3 | -------------------------------------------------------------------------------- /notes/ingest/.gitignore: -------------------------------------------------------------------------------- 1 | *.csv 2 | *.json 3 | -------------------------------------------------------------------------------- /pig/pytest.ini: -------------------------------------------------------------------------------- 1 | 2 | [pytest] 3 | norecursedirs = deps 4 | -------------------------------------------------------------------------------- /scalding/project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=1.1.1 2 | -------------------------------------------------------------------------------- /sql/sandcrawler_schema.sql: -------------------------------------------------------------------------------- 1 | migrations/2019-12-19-060141_init/up.sql -------------------------------------------------------------------------------- /scalding/.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | project/project/ 3 | project/targer/ 4 | -------------------------------------------------------------------------------- /python/title_slug_denylist.txt: -------------------------------------------------------------------------------- 1 | ../scalding/src/main/resources/slug-denylist.txt -------------------------------------------------------------------------------- /python/.coveragerc: 
-------------------------------------------------------------------------------- 1 | [run] 2 | omit = tests/* 3 | source = 4 | sandcrawler 5 | -------------------------------------------------------------------------------- /sql/example.env: -------------------------------------------------------------------------------- 1 | DATABASE_URL="postgres://fatcat:tactaf@localhost/sandcrawler" 2 | -------------------------------------------------------------------------------- /pig/tests/pig.properties: -------------------------------------------------------------------------------- 1 | log4jconf=./tests/log4j.properties 2 | stop.on.failure=true 3 | -------------------------------------------------------------------------------- /python/tests/files/dummy.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/tests/files/dummy.pdf -------------------------------------------------------------------------------- /python/tests/files/dummy_zip.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/tests/files/dummy_zip.zip -------------------------------------------------------------------------------- /python/tests/files/scielo_article.jats.xml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/tests/files/scielo_article.jats.xml -------------------------------------------------------------------------------- /python/tests/files/genders_g58_fairlie.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/sandcrawler/HEAD/python/tests/files/genders_g58_fairlie.html -------------------------------------------------------------------------------- /scalding/project/Dependencies.scala: -------------------------------------------------------------------------------- 1 | import sbt._ 2 | 3 | object Dependencies { 4 | lazy val scalaTest = "org.scalatest" %% "scalatest" % "3.0.5" 5 | } 6 | -------------------------------------------------------------------------------- /CONTRIBUTORS: -------------------------------------------------------------------------------- 1 | Bryan Newbold 2 | 3 | Ellen Spertus transfers copyright of all of her contributions to the 4 | repository in exchange for one Internet Archive Sticker, received. 
-------------------------------------------------------------------------------- /pig/tests/files/example.sha1b32: -------------------------------------------------------------------------------- 1 | EJWYVOPONJRARK7SGG6COFRN7CSTHROY 2 | V32E3CCO7NMI2M4OHLKG73DXD72LR4B2 3 | 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ 4 | E3WSNQ7JAFOW7N3ZJ6GLV27T52T25JDK 5 | -------------------------------------------------------------------------------- /python/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools", "wheel"] 3 | build-backend = "setuptools.build_meta:__legacy__" 4 | 5 | [tool.isort] 6 | profile = "black" 7 | line_length = 96 8 | -------------------------------------------------------------------------------- /python/.gitignore: -------------------------------------------------------------------------------- 1 | *part-000* 2 | *.tar.gz 3 | *.gz 4 | htmlcov/ 5 | samples/ 6 | *.json 7 | TODO* 8 | *.tsv 9 | 10 | !.flake8 11 | !.gitlab-ci.yml 12 | !.pylintrc 13 | !.coveragerc 14 | !.gitignore 15 | -------------------------------------------------------------------------------- /python/tests/test_html.py: -------------------------------------------------------------------------------- 1 | from sandcrawler.html import extract_fulltext_url 2 | 3 | 4 | def test_extract_fulltext_url(): 5 | 6 | resp = extract_fulltext_url("asdf", b"asdf") 7 | assert resp == {} 8 | -------------------------------------------------------------------------------- /scalding/src/main/scala/sandcrawler/HBaseMimeCountJob.scala: -------------------------------------------------------------------------------- 1 | package sandcrawler 2 | 3 | import com.twitter.scalding.Args 4 | 5 | class HBaseMimeCountJob(args: Args) extends HBaseCountJob(args, "file:mime") {} 6 | 7 | -------------------------------------------------------------------------------- /extra/blobs/seaweedfs/README.md: -------------------------------------------------------------------------------- 1 | 2 | ## HOWTO: Create new bucket in SeaweedFS 3 | 4 | Log in to the seaweedfs VM. 
5 | 6 | Run `weed shell` to start a shell, then: 7 | 8 | bucket.create -name 9 | 10 | -------------------------------------------------------------------------------- /scalding/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.6") 2 | addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.9.0") 3 | addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "1.0.0") 4 | -------------------------------------------------------------------------------- /python/sandcrawler/xml.py: -------------------------------------------------------------------------------- 1 | import xml.etree.ElementTree as ET 2 | 3 | 4 | def xml_reserialize(raw: bytes) -> str: 5 | root = ET.fromstring(raw) 6 | return '\n' + ET.tostring(root, encoding="unicode") 7 | -------------------------------------------------------------------------------- /notes/hbase_table_sizes.txt: -------------------------------------------------------------------------------- 1 | 2 | As of 2018-05-29: 3 | - qa rows: 1,246,013 4 | - prod rows: 8,974,188 5 | 6 | As of 2018-06-16: 7 | - qa: 1,246,013 8 | - prod: 18,308,086 9 | 10 | As of 2018-08-01: 11 | - qa: 1,246,013 12 | - prod: 18,308,141 13 | -------------------------------------------------------------------------------- /pig/Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | 3 | url = "https://pypi.python.org/simple" 4 | verify_ssl = true 5 | name = "pypi" 6 | 7 | 8 | [dev-packages] 9 | 10 | 11 | 12 | [packages] 13 | 14 | pytest = "*" 15 | 16 | 17 | [requires] 18 | 19 | python_version = "3.5" 20 | -------------------------------------------------------------------------------- /pig/tests/test_filter_cdx.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | import unittest 4 | from pighelper import PigTestHelper 5 | 6 | class TestFilterCDX(PigTestHelper): 7 | 8 | def test_thing(self): 9 | self.run_pig("filter-cdx-ps.pig", "tests/files/example.cdx") 10 | -------------------------------------------------------------------------------- /notes/ingest/2019-10-23_testing.md: -------------------------------------------------------------------------------- 1 | 2 | exported not-archived DOIs for elife, as well as general list. 
3 | 4 | wc -l recent\ missing\ oa\ releases.csv 5 | 161828 recent missing oa releases.csv 6 | 7 | wc -l missing\ elife\ DOIs.csv 8 | 1779 missing elife DOIs.csv 9 | -------------------------------------------------------------------------------- /python/example.env: -------------------------------------------------------------------------------- 1 | SANDCRAWLER_BLOB_ACCESS_KEY="minioadmin" 2 | SANDCRAWLER_BLOB_SECRET_KEY="minioadmin" 3 | IA_ACCESS_KEY="dummy" 4 | IA_SECRET_KEY="dummy" 5 | CDX_AUTH_TOKEN="dummy" 6 | PETABOX_WEBDATA_SECRET="dummy" 7 | SENTRY_DSN="" 8 | SANDCRAWLER_WORKING_DIR="/tmp/sandcrawler/" 9 | -------------------------------------------------------------------------------- /sql/dump_file_meta.sql: -------------------------------------------------------------------------------- 1 | 2 | BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; 3 | 4 | COPY ( 5 | SELECT sha1hex, row_to_json(file_meta) 6 | FROM file_meta 7 | ORDER BY sha1hex ASC 8 | ) 9 | TO '/srv/sandcrawler/tasks/file_meta_dump.tsv' 10 | WITH NULL ''; 11 | 12 | ROLLBACK; 13 | -------------------------------------------------------------------------------- /sql/migrations/2019-12-19-060141_init/down.sql: -------------------------------------------------------------------------------- 1 | 2 | DROP TABLE IF NOT EXISTS cdx; 3 | DROP TABLE IF NOT EXISTS file_meta; 4 | DROP TABLE IF NOT EXISTS fatcat_file; 5 | DROP TABLE IF NOT EXISTS petabox; 6 | DROP TABLE IF NOT EXISTS grobid; 7 | DROP TABLE IF NOT EXISTS ingest_request; 8 | DROP TABLE IF NOT EXISTS shadow; 9 | -------------------------------------------------------------------------------- /python/tests/test_html_ingest.py: -------------------------------------------------------------------------------- 1 | from sandcrawler.ingest_html import * 2 | 3 | 4 | def test_html_extract_ojs3() -> None: 5 | 6 | with open("tests/files/first_monday_ojs3_fulltext.html", "rb") as f: 7 | ojs3_html = f.read() 8 | 9 | fulltext = html_extract_body_teixml(ojs3_html) 10 | assert fulltext["status"] == "success" 11 | -------------------------------------------------------------------------------- /kafka/monitoring_commands.md: -------------------------------------------------------------------------------- 1 | 2 | kafkacat -C -b wbgrp-svc284.us.archive.org:9092 -t sandcrawler-prod.ingest-file-results -o end | jq '[.status, .base_url]' -c 3 | 4 | kafkacat -C -b wbgrp-svc284.us.archive.org:9092 -t sandcrawler-prod.ingest-file-results -o end | jq '[.request.ingest_request_source, .status, .request.base_url, .terminal.terminal_url]' -c 5 | -------------------------------------------------------------------------------- /sql/migrations/00000000000000_diesel_initial_setup/down.sql: -------------------------------------------------------------------------------- 1 | -- This file was automatically created by Diesel to setup helper functions 2 | -- and other internal bookkeeping. This file is safe to edit, any future 3 | -- changes will be added to existing projects as new migrations. 
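-- diesel_manage_updated_at(_tbl) and diesel_set_updated_at() are the standard
-- `updated_at`-trigger helpers created by the matching up.sql, so dropping them
-- here fully reverses the initial setup.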
4 | 5 | DROP FUNCTION IF EXISTS diesel_manage_updated_at(_tbl regclass); 6 | DROP FUNCTION IF EXISTS diesel_set_updated_at(); 7 | -------------------------------------------------------------------------------- /notes/library_shopping.txt: -------------------------------------------------------------------------------- 1 | 2 | potential helpers: 3 | - https://github.com/martinblech/xmltodict 4 | - https://github.com/trananhkma/fucking-awesome-python#text-processing 5 | - https://github.com/blaze/blaze (for catalog/analytics) 6 | - validation: https://github.com/pyeve/cerberus 7 | - testing (to replace nose): 8 | - https://github.com/CleanCut/green 9 | - pytest 10 | - mamba ("behavior driven") 11 | -------------------------------------------------------------------------------- /proposals/brainstorm/2021-debug_web_interface.md: -------------------------------------------------------------------------------- 1 | 2 | status: brainstorm idea 3 | 4 | Simple internal-only web interface to help debug ingest issues. 5 | 6 | - paste a hash, URL, or identifier and get a display of "everything we know" about it 7 | - enter a URL/SURT prefix and get aggregate stats (?) 8 | - enter a domain/host/prefix and get recent attempts/results 9 | - pre-computed periodic reports on ingest pipeline (?) 10 | -------------------------------------------------------------------------------- /python/tests/test_ingest_html.py: -------------------------------------------------------------------------------- 1 | from sandcrawler.ingest_html import html_guess_platform 2 | 3 | from selectolax.parser import HTMLParser 4 | 5 | def test_html_guess_platform_no_icon_href() -> None: 6 | with open("tests/files/plos_one_article_no_icon_href.html", "r") as f: 7 | plos_html = f.read() 8 | parsed = HTMLParser(plos_html) 9 | result = html_guess_platform("", parsed) 10 | assert result == None 11 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *venv.zip 2 | mapreduce-*.tar.gz 3 | *,cover 4 | htmlcov/ 5 | python/venv-current.tar.gz 6 | *.test 7 | 8 | *.o 9 | *.a 10 | *.pyc 11 | #*# 12 | *~ 13 | *.swp 14 | .* 15 | *.tmp 16 | *.old 17 | *.profile 18 | *.bkp 19 | *.bak 20 | [Tt]humbs.db 21 | *.DS_Store 22 | build/ 23 | _build/ 24 | src/build/ 25 | *.log 26 | 27 | !.coveragerc 28 | !.gitlab-ci.yml 29 | !.pylintrc 30 | 31 | # Don't ignore this file itself 32 | !.gitignore 33 | -------------------------------------------------------------------------------- /scalding/src/main/scala/example/WordCountJob.scala: -------------------------------------------------------------------------------- 1 | package example 2 | 3 | import com.twitter.scalding._ 4 | 5 | class WordCountJob(args: Args) extends Job(args) { 6 | TypedPipe.from(TextLine(args("input"))) 7 | .flatMap { line => line.split("\\s+") } 8 | .map { word => (word, 1L) } 9 | .sumByKey 10 | // The compiler will enforce the type coming out of the sumByKey is the same as the type we have for our sink 11 | .write(TypedTsv[(String, Long)](args("output"))) 12 | } 13 | -------------------------------------------------------------------------------- /pig/tests/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=WARN, stdout 2 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 3 | log4j.appender.stdout.Target=System.out 4 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 5 | 
log4j.appender.stdout.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n 6 | 7 | # With these enabled, get "log4j:ERROR Attempted to append to closed appender named [stdout]" 8 | #log4j.logger.org.apache.pig=WARN, stdout 9 | #log4j.logger.org.apache.hadoop = WARN, stdout 10 | -------------------------------------------------------------------------------- /notes/ingest/2020-05_pubmed.md: -------------------------------------------------------------------------------- 1 | 2 | From ARXIV-PUBMEDCENTRAL-CRAWL-2020-04, on fatcat-prod1. 3 | 4 | Test small batch: 5 | 6 | zcat ingest_file_pmcid_20200424.json.gz | head -n200 | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 7 | 8 | Run the whole batch: 9 | 10 | zcat ingest_file_pmcid_20200424.json.gz | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 11 | -------------------------------------------------------------------------------- /python/tests/test_xml.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from sandcrawler.xml import xml_reserialize 4 | 5 | 6 | def test_xml_reserialize() -> None: 7 | 8 | with open("tests/files/scielo_article.jats.xml", "rb") as f: 9 | raw_xml = f.read() 10 | 11 | assert b'encoding="ISO-8859-1"' in raw_xml 12 | raw_xml.decode("ISO-8859-1") 13 | with pytest.raises(UnicodeDecodeError): 14 | raw_xml.decode("utf-8") 15 | 16 | str_xml = xml_reserialize(raw_xml) 17 | assert 'encoding="UTF-8"' in str_xml 18 | -------------------------------------------------------------------------------- /pig/tests/files/papers_url_doi.cdx: -------------------------------------------------------------------------------- 1 | #http://journals.ametsoc.org/doi/pdf/10.1175/2008BAMS2370.1 2 | #http://www.nejm.org:80/doi/pdf/10.1056/NEJMoa1013607 3 | 4 | # should match 2: 5 | 6 | org,ametsoc,journals)/doi/pdf/10.1175/2008BAMS2370.1 20170706005950 http://mit.edu/file.pdf application/pdf 200 4QHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz 7 | org,nejm,www)/doi/pdf/10.1056/NEJMoa1013607 20170706005950 http://mit.edu/file.pdf application/pdf 200 3QHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz 8 | -------------------------------------------------------------------------------- /pig/tests/test_filter_software.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | import unittest 4 | from pighelper import PigTestHelper, count_lines 5 | 6 | 7 | class TestFilterCDXSoftware(PigTestHelper): 8 | 9 | def test_tarballs(self): 10 | r = self.run_pig("filter-cdx-tarball.pig", "tests/files/tarballs.cdx") 11 | assert count_lines(r) == 2 12 | 13 | def test_source_code(self): 14 | r = self.run_pig("filter-cdx-source-code-crude.pig", "tests/files/sourcecode.cdx") 15 | assert count_lines(r) == 1 16 | 17 | -------------------------------------------------------------------------------- /sql/dump_regrobid_pdf.sql: -------------------------------------------------------------------------------- 1 | 2 | -- Run like: 3 | -- psql sandcrawler < dump_regrobid_pdf.sql | sort -S 4G | uniq -w 40 | cut -f2 > dump_regrobid_pdf.2019-11-12.json 4 | 5 | BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; 6 | 7 | COPY ( 8 | SELECT cdx.sha1hex, row_to_json(cdx) FROM cdx 9 | WHERE cdx.mimetype = 'application/pdf' 10 | AND EXISTS (SELECT grobid.sha1hex FROM grobid WHERE cdx.sha1hex = 
grobid.sha1hex AND grobid.grobid_version IS NULL) 11 | ) 12 | TO STDOUT 13 | WITH NULL ''; 14 | 15 | ROLLBACK; 16 | -------------------------------------------------------------------------------- /python_hadoop/mrjob.conf: -------------------------------------------------------------------------------- 1 | runners: 2 | local: 3 | upload_files: 4 | - common.py 5 | - grobid2json.py 6 | setup: 7 | - export PYTHONPATH=$PYTHONPATH:venv/lib/python3.5/site-packages/ 8 | hadoop: 9 | no_output: true 10 | upload_files: 11 | - common.py 12 | - grobid2json.py 13 | setup: 14 | - export PYTHONPATH=$PYTHONPATH:venv/lib/python3.5/site-packages/ 15 | cmdenv: 16 | SENTRY_DSN: https://6ab6ad080d034280b863f294e07cc5c6:414ebf0b68634f669d2dc00d7c935699@books-sentry.us.archive.org/9 17 | -------------------------------------------------------------------------------- /extra/docker/README.md: -------------------------------------------------------------------------------- 1 | 2 | The docker-compose script in this directory may be helpful for local 3 | development. It starts Kafka, postgrest, and zookeeper. 4 | 5 | PostgreSQL is assumed to be running natively on localhost, not under docker. It 6 | should be possible to add postgresql to the docker-compose file, but some 7 | developers (bnewbold) prefer to run it separately to make things like attaching 8 | with `psql` easier. 9 | 10 | There is no current motivation or plan to deploy sandcrawler services using 11 | docker, so there is no Dockerfile for the system itself. 12 | -------------------------------------------------------------------------------- /sql/dump_unmatched_glutton_pdf.sql: -------------------------------------------------------------------------------- 1 | 2 | -- Run like: 3 | -- psql sandcrawler < THING.sql > THING.2019-09-23.json 4 | 5 | BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; 6 | 7 | COPY ( 8 | SELECT row_to_json(grobid) 9 | FROM grobid 10 | LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex 11 | WHERE fatcat_file.sha1hex IS NULL 12 | AND grobid.fatcat_release IS NOT NULL 13 | LIMIT 1000 14 | ) 15 | TO '/srv/sandcrawler/tasks/dump_unmatched_glutton_pdf.2020-06-30.json'; 16 | --TO STDOUT 17 | --WITH NULL ''; 18 | 19 | ROLLBACK; 20 | -------------------------------------------------------------------------------- /extra/nginx/README.md: -------------------------------------------------------------------------------- 1 | 2 | This folder contains nginx configs for partner access to sandcrawler DB 3 | (postgrest) and GROBID XML blobs (minio). 4 | 5 | `fatcat-blobs` is part of the fatcat.wiki ansible config, but included here to 6 | show how it works. 7 | 8 | ## Let's Encrypt 9 | 10 | As... bnewbold? 11 | 12 | sudo certbot certonly \ 13 | --non-interactive \ 14 | --agree-tos \ 15 | --email bnewbold@archive.org \ 16 | --webroot -w /var/www/letsencrypt \ 17 | -d sandcrawler-minio.fatcat.wiki \ 18 | -d sandcrawler-db.fatcat.wiki 19 | -------------------------------------------------------------------------------- /notes/ingest/2020-11-04_arxiv.md: -------------------------------------------------------------------------------- 1 | 2 | Ran a bulk dump using fatcat ingest tool several months ago, and had Martin run 3 | a crawl. 4 | 5 | Crawl is now done, so going to ingest, hoping to get the majority of the 6 | millions of remaining arxiv.org PDFs. 
7 | 8 | zcat /grande/snapshots/fatcat_missing_arxiv_ingest_request.2020-08-21.json.gz | wc -l 9 | => 1,288,559 10 | 11 | zcat /grande/snapshots/fatcat_missing_arxiv_ingest_request.2020-08-21.json.gz | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 12 | 13 | -------------------------------------------------------------------------------- /sql/dump_unextracted_pdf_petabox.sql: -------------------------------------------------------------------------------- 1 | 2 | -- Run like: 3 | -- psql sandcrawler < dump_unextracted_pdf_petabox.sql 4 | 5 | BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; 6 | 7 | COPY ( 8 | SELECT DISTINCT ON (petabox.sha1hex) row_to_json(petabox) 9 | FROM grobid 10 | LEFT JOIN petabox ON grobid.sha1hex = petabox.sha1hex 11 | LEFT JOIN pdf_meta ON grobid.sha1hex = pdf_meta.sha1hex 12 | WHERE petabox.sha1hex IS NOT NULL 13 | AND pdf_meta.sha1hex IS NULL 14 | ) 15 | TO '/srv/sandcrawler/tasks/dump_unextracted_pdf_petabox.2020-07-22.json' 16 | WITH NULL ''; 17 | 18 | ROLLBACK; 19 | -------------------------------------------------------------------------------- /python/.pylintrc: -------------------------------------------------------------------------------- 1 | [MESSAGES CONTROL] 2 | # TODO: should re-enable some of these 3 | disable=C0323,W0142,C0301,C0103,C0111,E0213,C0302,C0203,W0703,R0201,W0223,bad-continuation,arguments-differ,unidiomatic-typecheck,unused-wildcard-import,no-member,cyclic-import,too-few-public-methods,wildcard-import,too-many-locals,too-many-ancestors,unused-import 4 | 5 | [REPORTS] 6 | output-format=colorized 7 | include-ids=yes 8 | 9 | [MISCELLANEOUS] 10 | # List of note tags to take in consideration, separated by a comma. 11 | notes=FIXME,XXX,DELETEME 12 | 13 | [TYPECHECK] 14 | extension-pkg-whitelist=selectolax,pydantic,responses 15 | -------------------------------------------------------------------------------- /python_hadoop/tests/test_grobid2json.py: -------------------------------------------------------------------------------- 1 | 2 | import xml 3 | import json 4 | import pytest 5 | from grobid2json import * 6 | 7 | 8 | def test_small_xml(): 9 | 10 | with open('tests/files/small.xml', 'r') as f: 11 | tei_xml = f.read() 12 | with open('tests/files/small.json', 'r') as f: 13 | json_form = json.loads(f.read()) 14 | 15 | assert teixml2json(tei_xml) == json_form 16 | 17 | def test_invalid_xml(): 18 | 19 | with pytest.raises(xml.etree.ElementTree.ParseError): 20 | teixml2json("this is not XML") 21 | with pytest.raises(ValueError): 22 | teixml2json("") 23 | -------------------------------------------------------------------------------- /extra/blobs/minio/minio.conf: -------------------------------------------------------------------------------- 1 | 2 | # Volume to be used for MinIO server. 3 | MINIO_VOLUMES="/sandcrawler-minio/data" 4 | # Use if you want to run MinIO on a custom port. 5 | MINIO_OPTS="--address :9000" 6 | # Access Key of the server. 7 | MINIO_ACCESS_KEY=REDACTED 8 | # Secret key of the server. 
9 | MINIO_SECRET_KEY=REDACTED 10 | 11 | # may need to set these manually using `mc admin config get`, edit the JSON, then `set` 12 | MINIO_COMPRESS="on" 13 | MINIO_COMPRESS_EXTENSIONS=".txt,.log,.csv,.json,.tar,.xml,.bin,.pdf,.tsv" 14 | MINIO_COMPRESS_MIME_TYPES="text/*,application/json,application/xml,application/pdf,application/octet-stream" 15 | -------------------------------------------------------------------------------- /sql/dump_regrobid_pdf_petabox.sql: -------------------------------------------------------------------------------- 1 | 2 | -- Run like: 3 | -- psql sandcrawler < dump_regrobid_pdf_petabox.sql 4 | -- cat dump_regrobid_pdf_petabox.2020-02-03.json | sort -S 4G | uniq -w 40 | cut -f2 > dump_regrobid_pdf_petabox.2020-02-03.uniq.json 5 | 6 | BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; 7 | 8 | COPY ( 9 | SELECT petabox.sha1hex, row_to_json(petabox) FROM petabox 10 | WHERE EXISTS (SELECT grobid.sha1hex FROM grobid WHERE petabox.sha1hex = grobid.sha1hex AND grobid.grobid_version IS NULL) 11 | ) 12 | TO '/srv/sandcrawler/tasks/dump_regrobid_pdf_petabox.2020-02-03.json' 13 | WITH NULL ''; 14 | 15 | ROLLBACK; 16 | -------------------------------------------------------------------------------- /sql/reingest_spn.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e # fail on error 4 | set -u # fail if variable not set in substitution 5 | set -o pipefail # fail if part of a '|' command fails 6 | 7 | sudo -u postgres psql sandcrawler < dump_reingest_spn.sql 8 | 9 | cd ../python 10 | sudo -u sandcrawler pipenv run \ 11 | ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_spn.rows.json \ 12 | > /srv/sandcrawler/tasks/reingest_spn.json 13 | 14 | cat /srv/sandcrawler/tasks/reingest_spn.json \ 15 | | shuf \ 16 | | head -n60000 \ 17 | | jq . 
-c \ 18 | | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1 19 | 20 | -------------------------------------------------------------------------------- /python_hadoop/Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | name = "ia" 3 | url = "https://devpi.archive.org/wb/prod" 4 | verify_ssl = true 5 | 6 | [[source]] 7 | name = "pypi" 8 | url = "https://pypi.python.org/simple" 9 | verify_ssl = true 10 | 11 | [dev-packages] 12 | ipython = "*" 13 | happybase-mock = "*" 14 | pytest = "*" 15 | pytest-pythonpath = "*" 16 | responses = "*" 17 | pytest-cov = "*" 18 | pylint = "*" 19 | 20 | [packages] 21 | globalwayback = {version=">=0.3", index="ia"} 22 | happybase = "*" 23 | mrjob = "*" 24 | requests = "*" 25 | wayback = {version=">=0.2.1.2", index="ia"} 26 | xmltodict = "*" 27 | raven = "*" 28 | pykafka = "*" 29 | python-snappy = "*" 30 | boto3 = "*" 31 | 32 | [requires] 33 | python_version = "3.5" 34 | -------------------------------------------------------------------------------- /match_test_data/NOTES.txt: -------------------------------------------------------------------------------- 1 | 2 | Converted older .tsv from pdf-extraction comparison work with: 3 | 4 | cat 1k_random_identified_combined.tsv | jq -c --slurp --raw-input --raw-output 'split("\n") | .[:-1] | map(split("\t")) | map({"doi": .[0], "title": .[1], "authors": ( .[2] | split(";") ), "year": .[3], "journal": .[4], "publisher": .[5], "subject": .[6], "type": .[7], "sha": .[8]}) | .[]' > crossref_sample.bibjson 5 | 6 | Note that neither bibjson file is a superset of the either: 7 | 8 | 944 unique SHA1 which exist in both lists 9 | 964 in crossref_sample.sha1 10 | 979 in grobid_sample.sha1 11 | 12 | So scoring should be on a basis of "out of 944 lines". If this is confusing we 13 | can trim the files down. 
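The overlap numbers above can be recomputed with a short pipeline; a minimal sketch, assuming both bibjson files expose the hash as a top-level .sha field (true for crossref_sample.bibjson per the jq conversion above, assumed for grobid_sample.bibjson):

    jq -r .sha crossref_sample.bibjson | sort -u > crossref_sample.sha1
    jq -r .sha grobid_sample.bibjson | sort -u > grobid_sample.sha1
    wc -l crossref_sample.sha1 grobid_sample.sha1               # unique SHA1 per list
    comm -12 crossref_sample.sha1 grobid_sample.sha1 | wc -l    # SHA1 present in both lists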
14 | -------------------------------------------------------------------------------- /sql/backfill/petabox_transform.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import json, sys, os 4 | 5 | for l in sys.stdin.readlines(): 6 | l = l.strip() 7 | if not l: 8 | continue 9 | r = json.loads(l) 10 | if not r['sha1']: 11 | continue 12 | sha1hex = r['sha1'] 13 | for url in r['urls']: 14 | u = url['url'] 15 | if not '//archive.org/' in u: 16 | continue 17 | u = u.split('/') 18 | if u[2] == 'web.archive.org': 19 | continue 20 | #print(u) 21 | assert u[2] == 'archive.org' and u[3] in ('download', 'serve') 22 | item = u[4] 23 | path = '/'.join(u[5:]) 24 | print("\t".join([item, path, sha1hex])) 25 | -------------------------------------------------------------------------------- /sql/dump_ungrobid_pdf_petabox.sql: -------------------------------------------------------------------------------- 1 | 2 | -- Run like: 3 | -- psql sandcrawler < dump_ungrobid_pdf_petabox.sql 4 | 5 | BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; 6 | 7 | COPY ( 8 | SELECT DISTINCT ON (petabox.sha1hex) row_to_json(petabox) 9 | FROM petabox 10 | WHERE NOT EXISTS (SELECT grobid.sha1hex FROM grobid WHERE petabox.sha1hex = grobid.sha1hex AND grobid.status IS NOT NULL) 11 | -- uncomment/comment this to control whether only fatcat files are included 12 | AND EXISTS (SELECT fatcat_file.sha1hex FROM fatcat_file WHERE petabox.sha1hex = fatcat_file.sha1hex) 13 | ) 14 | TO '/srv/sandcrawler/tasks/dump_ungrobided_pdf_petabox.2020-08-04.json' 15 | WITH NULL ''; 16 | 17 | ROLLBACK; 18 | -------------------------------------------------------------------------------- /sql/dump_ungrobid_pdf.sql: -------------------------------------------------------------------------------- 1 | 2 | -- Run like: 3 | -- psql sandcrawler < dump_ungrobid_pdf.sql 4 | 5 | BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; 6 | 7 | COPY ( 8 | SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx) 9 | FROM cdx 10 | WHERE cdx.mimetype = 'application/pdf' 11 | AND NOT EXISTS (SELECT grobid.sha1hex FROM grobid WHERE cdx.sha1hex = grobid.sha1hex AND grobid.status IS NOT NULL) 12 | -- uncomment/comment this to control whether only fatcat files are included 13 | --AND EXISTS (SELECT fatcat_file.sha1hex FROM fatcat_file WHERE cdx.sha1hex = fatcat_file.sha1hex) 14 | ) 15 | TO '/srv/sandcrawler/tasks/dump_ungrobided_pdf.fatcat.2020-08-04.json' 16 | WITH NULL ''; 17 | 18 | ROLLBACK; 19 | -------------------------------------------------------------------------------- /sql/reingest_old.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e # fail on error 4 | set -u # fail if variable not set in substitution 5 | set -o pipefail # fail if part of a '|' command fails 6 | 7 | sudo -u postgres psql sandcrawler < dump_reingest_old.sql 8 | 9 | cd ../python 10 | sudo -u sandcrawler pipenv run \ 11 | ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_old_current.rows.json \ 12 | > /srv/sandcrawler/tasks/reingest_old_current.json 13 | 14 | cat /srv/sandcrawler/tasks/reingest_old_current.json \ 15 | | shuf \ 16 | | head -n250000 \ 17 | | jq . 
-c \ 18 | | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1 19 | 20 | -------------------------------------------------------------------------------- /pig/hbase-count-rows.pig: -------------------------------------------------------------------------------- 1 | 2 | REGISTER /usr/lib/hbase/lib/hbase-client-0.98.6-cdh5.3.1.jar 3 | REGISTER /usr/lib/hbase/lib/hbase-common-0.98.6-cdh5.3.1.jar 4 | REGISTER /usr/lib/hbase/lib/hbase-hadoop2-compat-0.98.6-cdh5.3.1.jar 5 | REGISTER /usr/lib/hbase/lib/hbase-protocol-0.98.6-cdh5.3.1.jar 6 | 7 | set hbase.zookeeper.quorum 'mtrcs-zk1.us.archive.org,mtrcs-zk2.us.archive.org,mtrcs-zk3.us.archive.org' 8 | 9 | data = LOAD 'hbase://wbgrp-journal-extract-0-qa' 10 | USING org.apache.pig.backend.hadoop.hbase.HBaseStorage('grobid0:status_code', '-loadKey true') 11 | AS (key:CHARARRAY, status:CHARARRAY); 12 | 13 | data_group = GROUP data ALL; 14 | data_count = FOREACH data_group GENERATE COUNT(data); 15 | DUMP data_count; 16 | -------------------------------------------------------------------------------- /proposals/2021-09-21_spn_accounts.md: -------------------------------------------------------------------------------- 1 | 2 | Formalization of SPNv2 API requests from fatcat/sandcrawler 3 | 4 | Create two new system accounts, one for regular/daily ingest requests, one for 5 | priority requests (save-paper-now or as a flag with things like fatcat-ingest; 6 | "interactive"). These accounts should have @archive.org emails. Request the 7 | daily one to have the current rate limit as bnewbold@archive.org account; the 8 | priority queue can have less. 9 | 10 | Create new ingest kafka queues from scratch, one for priority and one for 11 | regular. Chose sizes carefully, probably keep 24x for the regular and do 6x or 12 | so (small) for priority queue. 13 | 14 | Deploy new priority workers; reconfigure/deploy broadly. 15 | -------------------------------------------------------------------------------- /sql/reingest_bulk.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e # fail on error 4 | set -u # fail if variable not set in substitution 5 | set -o pipefail # fail if part of a '|' command fails 6 | 7 | sudo -u postgres psql sandcrawler < dump_reingest_bulk.sql 8 | 9 | cd ../python 10 | sudo -u sandcrawler pipenv run \ 11 | ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_bulk_current.rows.json \ 12 | > /srv/sandcrawler/tasks/reingest_bulk_current.json 13 | 14 | cat /srv/sandcrawler/tasks/reingest_bulk_current.json \ 15 | | shuf \ 16 | | head -n1000000 \ 17 | | jq . -c \ 18 | | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 19 | 20 | -------------------------------------------------------------------------------- /notes/dryad_datasets.md: -------------------------------------------------------------------------------- 1 | 2 | api docs: https://datadryad.org/api/v2/docs 3 | 4 | current search queries return 38,000 hits (December 2020) 5 | 6 | exmaple with multiple versions: 7 | https://datadryad.org/stash/dataset/doi:10.5061/dryad.fbg79cnr0 8 | https://datadryad.org/api/v2/datasets/doi%3A10.5061%2Fdryad.fbg79cnr0 9 | https://datadryad.org/api/v2/datasets/doi%3A10.5061%2Fdryad.fbg79cnr0/versions 10 | 11 | 12 | how to handle versions? DOI doesn't get incremented. 
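the versions endpoint above can be queried directly to see what is recorded per dataset; a quick sketch using the example DOI from this note (response shape not verified here, so it just pretty-prints):

    curl -s 'https://datadryad.org/api/v2/datasets/doi%3A10.5061%2Fdryad.fbg79cnr0/versions' | jq .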
13 | 14 | on archive.org, could have separate item for each version, or sub-directories within item, one for each version 15 | 16 | in fatcat, could have a release for each version, but only one with 17 | the DOI; or could have a separate fileset for each version 18 | -------------------------------------------------------------------------------- /match_test_data/RESULTS.txt: -------------------------------------------------------------------------------- 1 | 2 | "Out of 944 lines"... 3 | 4 | ## Git 92584ec4201ecc27af423cbff7b4bc1573edf175 5 | 6 | 76.27% match. 7 | 8 | time ./please --qa match-benchmark match_test_data/crossref_sample.bibjson match_test_data/grobid_sample.bibjson out.test 9 | 10 | real 0m56.061s 11 | user 1m3.852s 12 | sys 0m3.924s 13 | 14 | 720 lines 15 | 720 uniq DOI 16 | 720 uniq SHA1 17 | 18 | ## Git aa2f905d65713a581c7630ef2f931045059200ef 19 | 20 | real 0m56.347s 21 | user 1m3.328s 22 | sys 0m4.000s 23 | 24 | bnewbold@orithena$ wc -l out.test 25 | 722 out.test 26 | bnewbold@orithena$ cut -f3 out.test | jq .doi -r | sort -u | wc -l 27 | 722 28 | bnewbold@orithena$ cut -f4 out.test | jq .sha1 -r | sort -u | wc -l 29 | 722 30 | 31 | -------------------------------------------------------------------------------- /sql/reingest_terminalstatus_forcerecrawl.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e # fail on error 4 | set -u # fail if variable not set in substitution 5 | set -o pipefail # fail if part of a '|' command fails 6 | 7 | sudo -u postgres psql sandcrawler < dump_reingest_terminalstatus.sql 8 | 9 | cd ../python 10 | sudo -u sandcrawler pipenv run \ 11 | ./scripts/ingestrequest_row2json.py --force-recrawl /srv/sandcrawler/tasks/reingest_terminalstatus_current.rows.json \ 12 | > /srv/sandcrawler/tasks/reingest_terminalstatus_current.json 13 | 14 | cat /srv/sandcrawler/tasks/reingest_terminalstatus_current.json \ 15 | | shuf \ 16 | | head -n100000 \ 17 | | jq . -c \ 18 | | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1 19 | 20 | -------------------------------------------------------------------------------- /notes/examples/random_datasets.md: -------------------------------------------------------------------------------- 1 | 2 | Possible external datasets to ingest (which are not entire platforms): 3 | 4 | - https://research.google/tools/datasets/ 5 | - https://openslr.org/index.html 6 | - https://www.kaggle.com/datasets?sort=votes&tasks=true 7 | - https://archive.ics.uci.edu/ml/datasets.php 8 | 9 | Existing archive.org datasets to ingest: 10 | 11 | - https://archive.org/details/allthemusicllc-datasets 12 | 13 | Papers on archive.org to ingest: 14 | 15 | - 16 | - 17 | - 18 | - 19 | - 20 | -------------------------------------------------------------------------------- /notes/backfill_scalding_rewrite.txt: -------------------------------------------------------------------------------- 1 | 2 | Background context needed: 3 | - CDX text file format 4 | - rough arch outline (what runs where) 5 | - basic hadoop+hbase overview 6 | - hbase schema 7 | - quick look at hadoop and hbase web interfaces 8 | - maybe quick re-profile? 9 | 10 | Plan/Steps: 11 | x together: get *any* JVM map/reduce thing to build and run on cluster 12 | x together: get something to build that talks to hbase 13 | x basic JVM test infra; HBase mockup. 
"shopping" 14 | => scalding and/or cascading 15 | x simple hbase scan report generation (counts/stats) 16 | x CDX parsing 17 | - complete backfill script 18 | 19 | Spec for CDX backfill script: 20 | - input is CDX, output to HBase table 21 | - filter input before anything ("defensive"; only PDF, HTTP 200, size limit) 22 | - reads HBase before insert; don't overwrite 23 | -------------------------------------------------------------------------------- /sql/dump_unextracted_pdf.sql: -------------------------------------------------------------------------------- 1 | 2 | -- Run like: 3 | -- psql sandcrawler < dump_unextracted_pdf.sql 4 | 5 | BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; 6 | 7 | COPY ( 8 | SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx) 9 | FROM grobid 10 | LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex 11 | --LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex 12 | LEFT JOIN ingest_file_result ON grobid.sha1hex = ingest_file_result.terminal_sha1hex 13 | LEFT JOIN pdf_meta ON grobid.sha1hex = pdf_meta.sha1hex 14 | WHERE cdx.sha1hex IS NOT NULL 15 | --AND fatcat_file.sha1hex IS NOT NULL 16 | AND ingest_file_result.terminal_sha1hex IS NOT NULL 17 | AND pdf_meta.sha1hex IS NULL 18 | ) 19 | TO '/srv/sandcrawler/tasks/dump_unextracted_pdf.ingest.2020-10-21.json' 20 | WITH NULL ''; 21 | 22 | ROLLBACK; 23 | -------------------------------------------------------------------------------- /python/tests/test_grobid2json.py: -------------------------------------------------------------------------------- 1 | import json 2 | import xml 3 | 4 | import pytest 5 | from grobid_tei_xml import parse_document_xml 6 | 7 | 8 | def test_small_xml(): 9 | """ 10 | This used to be a test of grobid2json; now it is a compatability test for 11 | the to_legacy_dict() feature of grobid_tei_xml. 12 | """ 13 | 14 | with open("tests/files/small.xml", "r") as f: 15 | tei_xml = f.read() 16 | with open("tests/files/small.json", "r") as f: 17 | json_form = json.loads(f.read()) 18 | 19 | tei_doc = parse_document_xml(tei_xml) 20 | assert tei_doc.to_legacy_dict() == json_form 21 | 22 | 23 | def test_invalid_xml(): 24 | 25 | with pytest.raises(xml.etree.ElementTree.ParseError): 26 | parse_document_xml("this is not XML") 27 | with pytest.raises(ValueError): 28 | parse_document_xml("") 29 | -------------------------------------------------------------------------------- /notes/ingest/2020-09_scielo.md: -------------------------------------------------------------------------------- 1 | 2 | As a follow-up to `SCIELO-CRAWL-2020-07`, going to bulk ingest all existing 3 | fatcat releases with no IA copy and with `publisher_type:scielo`. There are 4 | 200k+ such releases. 5 | 6 | It seems like some of these are HTML or XML, eg: https://doi.org/10.4321/s1132-12962011000300008 7 | 8 | Could try XML ingest of these! 9 | 10 | ## Bulk Ingest 11 | 12 | Dump ingest requests 13 | 14 | ./fatcat_ingest.py --allow-non-oa query "publisher_type:scielo" | pv -l > /srv/fatcat/snapshots/scielo_papers_20200914.ingest_request.json 15 | Expecting 212529 release objects in search queries 16 | 17 | Enqueue 18 | 19 | cat /srv/fatcat/snapshots/scielo_papers_20200914.ingest_request.json | rg -v "\\\\" | jq . 
-c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 20 | => done 2020-09-14 21 | 22 | -------------------------------------------------------------------------------- /notes/crawl_cdx_merge.md: -------------------------------------------------------------------------------- 1 | 2 | ## New Way 3 | 4 | Run script from scratch repo: 5 | 6 | ~/scratch/bin/cdx_collection.py CRAWL-2000 7 | 8 | zcat CRAWL-2000.cdx.gz | wc -l 9 | 10 | # update crawl README/ANALYSIS/whatever 11 | 12 | Assuming we're just looking at PDFs: 13 | 14 | zcat CRAWL-2000.cdx.gz | rg -i pdf | sort -S 4G -u > CRAWL-2000.sorted.cdx 15 | 16 | ## Old Way 17 | 18 | Use metamgr to export an items list. 19 | 20 | Get all the CDX files and merge/sort: 21 | 22 | mkdir CRAWL-2000 && cd CRAWL-2000 23 | cat ../CRAWL-2000.items | shuf | parallel --bar -j6 ia download {} {}.cdx.gz 24 | ls */*.cdx.gz | parallel --bar -j1 zcat {} > CRAWL-2000.unsorted.cdx 25 | sort -S 4G -u CRAWL-2000.unsorted.cdx > CRAWL-2000.cdx 26 | wc -l CRAWL-2000.cdx 27 | rm CRAWL-2000.unsorted.cdx 28 | 29 | # gzip and upload to petabox, or send to HDFS, or whatever 30 | -------------------------------------------------------------------------------- /sql/table_sizes.md: -------------------------------------------------------------------------------- 1 | 2 | ## September 2019 3 | 4 | table_name | table_size | indexes_size | total_size 5 | --------------------------------------------------------------+------------+--------------+------------ 6 | "public"."cdx" | 31 GB | 27 GB | 58 GB 7 | "public"."file_meta" | 13 GB | 6500 MB | 19 GB 8 | "public"."shadow" | 8303 MB | 9216 MB | 17 GB 9 | "public"."grobid" | 4994 MB | 6678 MB | 11 GB 10 | "public"."fatcat_file" | 5206 MB | 2094 MB | 7300 MB 11 | "public"."petabox" | 403 MB | 594 MB | 997 MB 12 | -------------------------------------------------------------------------------- /pig/tests/test_filter_cdx_paper_pdfs.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | import unittest 4 | from pighelper import PigTestHelper, count_lines 5 | 6 | 7 | class TestFilterCDXPaperPdfs(PigTestHelper): 8 | 9 | def test_papers_domain_words(self): 10 | r = self.run_pig("filter-cdx-paper-pdfs.pig", "tests/files/papers_domain_words.cdx") 11 | assert count_lines(r) == 4 12 | 13 | def test_papers_edu_tilde(self): 14 | r = self.run_pig("filter-cdx-paper-pdfs.pig", "tests/files/papers_edu_tilde.cdx") 15 | assert count_lines(r) == 6 16 | 17 | def test_papers_url_doi(self): 18 | r = self.run_pig("filter-cdx-paper-pdfs.pig", "tests/files/papers_url_doi.cdx") 19 | assert count_lines(r) == 2 20 | 21 | def test_papers_url_words(self): 22 | r = self.run_pig("filter-cdx-paper-pdfs.pig", "tests/files/papers_url_words.cdx") 23 | assert count_lines(r) == 12 24 | 25 | -------------------------------------------------------------------------------- /python/.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | select = C,E,F,W,ANN 3 | # ANN003 is annotation on, eg, **kwargs 4 | # ANN101 is annotation on 'self' (why would that be wanted?) 
5 | # ANN204 is annotation on '__init__()' 6 | # ANN401 is 'Any' type 7 | # E265,E266 are restrictions on comments ('#') 8 | # E501 is line-too-long, which we enforce with black 9 | # W503,E203 are allowed by black 10 | # TODO: C901 is complexity, should be re-enabled at some point 11 | ignore = ANN003,ANN101,ANN204,ANN401,E265,E266,E501,C901,W503,E203 12 | per-file-ignores = 13 | sandcrawler/__init__.py: F401 14 | sandcrawler/ia.py: E402 15 | tests/*.py: ANN201,ANN001,F403,F405 16 | # TODO: add more annotations to CLI scripts 17 | *_tool.py,sandcrawler_worker.py: ANN201,ANN001,ANN202,ANN206,ANN205,F403,F405 18 | scripts: ANN201,ANN001,ANN202,ANN206,ANN205 19 | exclude = .git,__pycache__,.venv,scripts/ 20 | max-line-length = 96 21 | max-complexity = 30 22 | -------------------------------------------------------------------------------- /notes/possible_ingest_targets.txt: -------------------------------------------------------------------------------- 1 | 2 | - all releases from small journals, regardless of OA status, if small (eg, less than 200 papers published), and not big5 3 | 4 | more complex crawling/content: 5 | - add video link to alternative content demo ingest: https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0400764 6 | - watermark.silverchair.com: if terminal-bad-status, then do recrawl via heritrix with base_url 7 | - www.morressier.com: interesting site for rich web crawling/preservation (video+slides+data) 8 | - doi.ala.org.au: possible dataset ingest source 9 | - peerj.com, at least reviews, should be HTML ingest? or are some PDF? 10 | - publons.com should be HTML ingest, possibly special case for scope 11 | - frontiersin.org: any 'component' releases with PDF file are probably a metadata bug 12 | 13 | other tasks: 14 | - handle this related withdrawn notice? https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0401512 15 | - push/deploy sandcrawler changes 16 | -------------------------------------------------------------------------------- /extra/docker/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '2' 2 | services: 3 | zookeeper: 4 | image: wurstmeister/zookeeper 5 | ports: 6 | - "2181:2181" 7 | kafka: 8 | image: wurstmeister/kafka:2.11-2.0.0 9 | ports: 10 | - "9092:9092" 11 | environment: 12 | #HOSTNAME_COMMAND: "docker info | grep ^Name: | cut -d' ' -f 2" 13 | KAFKA_BROKER_ID: 1 14 | KAFKA_ADVERTISED_HOST_NAME: 127.0.0.1 15 | KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181 16 | KAFKA_CREATE_TOPICS: "fatcat-dev.changelog:1:1,fatcat-dev.release-updates:3:1:compact" 17 | KAFKA_MESSAGE_MAX_BYTES: 50000000 18 | volumes: 19 | - /var/run/docker.sock:/var/run/docker.sock 20 | depends_on: 21 | - zookeeper 22 | postgrest: 23 | image: postgrest/postgrest 24 | network_mode: "host" 25 | ports: 26 | - "3000:3000" 27 | environment: 28 | PGRST_DB_URI: "postgres://fatcat:tactaf@localhost/sandcrawler" 29 | PGRST_DB_ANON_ROLE: "fatcat" 30 | -------------------------------------------------------------------------------- /notes/tasks/2022-01-07_grobid_platform_pdfs.md: -------------------------------------------------------------------------------- 1 | 2 | Martin crawled more than 10 million new PDFs from various platform domains. We 3 | should get these processed and included in sandcrawler-db. 
4 | 5 | ## Select CDX Rows 6 | 7 | COPY ( 8 | SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx) 9 | FROM cdx 10 | LEFT JOIN grobid ON grobid.sha1hex = cdx.sha1hex 11 | WHERE 12 | grobid.sha1hex IS NULL 13 | AND cdx.sha1hex IS NOT NULL 14 | AND cdx.warc_path LIKE 'PLATFORM-CRAWL-2020%' 15 | -- LIMIT 5; 16 | ) 17 | TO '/srv/sandcrawler/tasks/ungrobided_platform_crawl.2022-01-07.cdx.json' 18 | WITH NULL ''; 19 | => COPY 8801527 20 | 21 | cat /srv/sandcrawler/tasks/ungrobided_platform_crawl.2022-01-07.cdx.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1 22 | 23 | # for pdfextract, would be: sandcrawler-prod.unextracted 24 | -------------------------------------------------------------------------------- /pig/filter-cdx-pdfs.pig: -------------------------------------------------------------------------------- 1 | 2 | -- Tries to filter down a large CDX file (GWB index) to a subset of PDFs, by mimetype. 3 | -- 4 | -- Author: Bryan Newbold 5 | -- Date: May 2018 6 | 7 | %default INPUT '' 8 | %default OUTPUT '' 9 | 10 | set mapreduce.job.queuename default 11 | 12 | cdx = LOAD '$INPUT' AS cdxline:chararray; 13 | cdx = FILTER cdx BY not STARTSWITH (cdxline, 'filedesc'); 14 | cdx = FILTER cdx BY not STARTSWITH (cdxline, ' '); 15 | 16 | cdx = FOREACH cdx GENERATE STRSPLIT(cdxline,'\\s+') as cols, cdxline; 17 | cdx = FOREACH cdx GENERATE (chararray)cols.$0 as url, (chararray)cols.$1 as timestamp, (chararray)cols.$3 as mimetype, (chararray)cols.$4 as httpstatus, cdxline; 18 | cdx = FILTER cdx BY not url matches '-'; 19 | cdx = FILTER cdx BY httpstatus matches '200'; 20 | cdx = FILTER cdx BY mimetype matches '.*pdf.*'; 21 | cdx = ORDER cdx by url, timestamp PARALLEL 50; 22 | cdx = FOREACH cdx GENERATE cdxline; 23 | STORE cdx INTO '$OUTPUT' USING PigStorage(' '); 24 | 25 | -------------------------------------------------------------------------------- /pig/filter-cdx-ps.pig: -------------------------------------------------------------------------------- 1 | -- Tries to filter down a large CDX file (GWB index) to a subset of postscript 2 | -- files, by mimetype. 
3 | -- 4 | -- Author: Bryan Newbold 5 | -- Date: May 2018 6 | 7 | %default INPUT '' 8 | %default OUTPUT '' 9 | 10 | set mapreduce.job.queuename default 11 | 12 | cdx = LOAD '$INPUT' AS cdxline:chararray; 13 | cdx = FILTER cdx BY not STARTSWITH (cdxline, 'filedesc'); 14 | cdx = FILTER cdx BY not STARTSWITH (cdxline, ' '); 15 | 16 | cdx = FOREACH cdx GENERATE STRSPLIT(cdxline,'\\s+') as cols, cdxline; 17 | cdx = FOREACH cdx GENERATE (chararray)cols.$0 as url, (chararray)cols.$1 as timestamp, (chararray)cols.$3 as mimetype, (chararray)cols.$4 as httpstatus, cdxline; 18 | cdx = FILTER cdx BY not url matches '-'; 19 | cdx = FILTER cdx BY httpstatus matches '200'; 20 | cdx = FILTER cdx BY mimetype matches '.*postscript.*'; 21 | cdx = ORDER cdx by url, timestamp PARALLEL 50; 22 | cdx = FOREACH cdx GENERATE cdxline; 23 | STORE cdx INTO '$OUTPUT' USING PigStorage(' '); 24 | 25 | -------------------------------------------------------------------------------- /python/Makefile: -------------------------------------------------------------------------------- 1 | 2 | SHELL = /bin/bash 3 | .SHELLFLAGS = -o pipefail -c 4 | 5 | .PHONY: help 6 | help: ## Print info about all commands 7 | @echo "Commands:" 8 | @echo 9 | @grep -E '^[a-zA-Z0-9_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf " \033[01;32m%-20s\033[0m %s\n", $$1, $$2}' 10 | 11 | .PHONY: deps 12 | deps: ## Install dependencies using pipenv 13 | pipenv install --dev 14 | 15 | .PHONY: lint 16 | lint: ## Run lints (eg, flake8, mypy) 17 | pipenv run flake8 . --exit-zero 18 | pipenv run isort -q -c . || true 19 | pipenv run mypy *.py sandcrawler/ tests/ --ignore-missing-imports 20 | 21 | .PHONY: fmt 22 | fmt: ## Run code formating on all source code 23 | pipenv run isort --atomic . 24 | pipenv run black --line-length 96 sandcrawler/ tests/ scripts/ *.py 25 | 26 | .PHONY: test 27 | test: ## Run all tests and lints 28 | pipenv run pytest 29 | 30 | .PHONY: coverage 31 | coverage: ## Run all tests with coverage 32 | pipenv run pytest --cov --cov-report=term --cov-report=html 33 | -------------------------------------------------------------------------------- /Dockerfile.sandcrawler-pytest: -------------------------------------------------------------------------------- 1 | # syntax=docker/dockerfile:1 2 | 3 | FROM ubuntu:focal 4 | WORKDIR /src 5 | COPY . . 6 | ENV LC_ALL=C.UTF-8 7 | ENV LANG=C.UTF-8 8 | ENV DEBIAN_FRONTEND=noninteractive 9 | # copied and modified from gitlab ci yml file 10 | RUN apt update && apt install -y python3-dev python3-pip python3-wheel libjpeg-dev libpq-dev python-dev python3.8 python3.8-dev python3.8-venv python3.8-distutils pkg-config python3-pytest git libsnappy-dev libsodium-dev libpoppler-cpp-dev cmake libpython3.8-dev build-essential poppler-data libmagic1 pipenv wget && pip install pipenv pytest 11 | #RUN git config --global --add safe.directory /src 12 | WORKDIR python 13 | RUN make deps 14 | CMD make test 15 | 16 | # Build 17 | # NB: requires sshuttle or similar bc build process talks to devpi.us.archive.org 18 | # docker build --network=host -t sandcrawler-pytest -f Dockerfile.sandcrawler-pytest . 
19 | 20 | # Run, adjusting source path as needed 21 | # docker run --network host -v/home/vilmibm/src/sandcrawler:/src sandcrawler-pytest 22 | -------------------------------------------------------------------------------- /python/pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | 3 | # allow imports from files in current directory 4 | python_paths = . 5 | 6 | # search for 'test_*' functions in all python files, not just under tests 7 | python_files = *.py 8 | 9 | addopts = --pylint --pylint-rcfile=.pylintrc --pylint-error-types=EF --pylint-jobs=4 10 | 11 | # ignore various third party warnings (in .venv) 12 | filterwarnings = 13 | ignore:.*common_exception_handling.*StopIteration:PendingDeprecationWarning 14 | ignore:.*deprecated and will be removed in Werkzeug 1.0.*:DeprecationWarning 15 | ignore::DeprecationWarning:.*surt 16 | ignore::DeprecationWarning:.*urllib3 17 | ignore::DeprecationWarning:.*wayback 18 | ignore::DeprecationWarning:.*PIL 19 | ignore::DeprecationWarning:.*justext 20 | ignore::DeprecationWarning:.*internetarchive 21 | ignore::DeprecationWarning:.*minio 22 | ignore::DeprecationWarning:.*base_reporter 23 | ignore::DeprecationWarning:.*loccache 24 | ignore:.*pytz-deprecation-shim 25 | 26 | log_level = INFO 27 | -------------------------------------------------------------------------------- /python/tests/test_pushers.py: -------------------------------------------------------------------------------- 1 | from sandcrawler.workers import BlackholeSink, CdxLinePusher 2 | 3 | 4 | def test_cdx_line_pusher(): 5 | 6 | sink = BlackholeSink() 7 | 8 | # vanilla (only default filters) 9 | with open("tests/files/example.cdx", "r") as cdx_file: 10 | pusher = CdxLinePusher(sink, cdx_file) 11 | counts = pusher.run() 12 | assert counts["total"] == 20 13 | assert counts["skip-parse"] == 1 14 | assert counts["pushed"] == 19 15 | 16 | # HTTP 200 and application/pdf 17 | with open("tests/files/example.cdx", "r") as cdx_file: 18 | pusher = CdxLinePusher( 19 | sink, 20 | cdx_file, 21 | filter_mimetypes=["application/pdf"], 22 | filter_http_statuses=[200, 226], 23 | ) 24 | counts = pusher.run() 25 | assert counts["total"] == 20 26 | assert counts["skip-parse"] == 1 27 | assert counts["skip-http_status"] == 10 28 | assert counts["skip-mimetype"] == 2 29 | assert counts["pushed"] == 7 30 | -------------------------------------------------------------------------------- /sql/reingest_weekly.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e # fail on error 4 | set -u # fail if variable not set in substitution 5 | # can't use pipefail here because under normal operations kafkacat will exit 6 | # code with a 141 (indicating that a pipe ran out of stuff for it to read). 7 | # this will always trigger this file to report failure and thus lead to 8 | # perpetually failing this when used in a systemd service. 9 | #set -o pipefail # fail if part of a '|' command fails 10 | 11 | sudo -u postgres psql sandcrawler < dump_reingest_weekly.sql 12 | 13 | cd ../python 14 | sudo -u sandcrawler pipenv run \ 15 | ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_weekly_current.rows.json \ 16 | > /srv/sandcrawler/tasks/reingest_weekly_current.json 17 | 18 | cat /srv/sandcrawler/tasks/reingest_weekly_current.json \ 19 | | shuf \ 20 | | head -n80000 \ 21 | | jq . 
-c \ 22 | | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1 23 | 24 | -------------------------------------------------------------------------------- /sql/reingest_quarterly.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e # fail on error 4 | set -u # fail if variable not set in substitution 5 | # can't use pipefail here because under normal operations kafkacat will exit 6 | # code with a 141 (indicating that a pipe ran out of stuff for it to read). 7 | # this will always trigger this file to report failure and thus lead to 8 | # perpetually failing this when used in a systemd service. 9 | #set -o pipefail # fail if part of a '|' command fails 10 | 11 | sudo -u postgres psql sandcrawler < dump_reingest_quarterly.sql 12 | 13 | cd ../python 14 | sudo -u sandcrawler pipenv run \ 15 | ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_quarterly_current.rows.json \ 16 | > /srv/sandcrawler/tasks/reingest_quarterly_current.json 17 | 18 | cat /srv/sandcrawler/tasks/reingest_quarterly_current.json \ 19 | | shuf \ 20 | | head -n120000 \ 21 | | jq . -c \ 22 | | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1 23 | 24 | -------------------------------------------------------------------------------- /pig/tests/files/tarballs.cdx: -------------------------------------------------------------------------------- 1 | #http://research.fit.edu/sealevelriselibrary/documents/doc_mgr/448/Florida_Keys_Low_Island_Biodiversity_&_SLR_-_Ross_et_al_2009.pdf 2 | #http://ijs.sgmjournals.org:80/cgi/reprint/54/6/2217.pdf 3 | #http://eprints.ecs.soton.ac.uk/12020/1/mind-the-semantic-gap.pdf 4 | #http://eprint.uq.edu.au/archive/00004120/01/R103_Forrester_pp.pdf 5 | 6 | # should match 2: 7 | 8 | edu,fit,research)/sealevelriselibrary/documents/doc_mgr/448/Florida_Keys_Low_Island_Biodiversity_&_SLR_-_Ross_et_al_2009.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3I - - 123 456 CRAWL/CRAWL.warc.gz 9 | edu,fit,research)/sealevelriselibrary/documents/doc_mgr/448/Florida_Keys_Low_Island_Biodiversity_&_SLR_-_Ross_et_al_2009.tar.gz 20170706005950 http://mit.edu/file.tar.gz application/octet-stream 200 NQHD36X5MNZPWFNMD5LFOYZSFGCHUN3I - - 123 456 CRAWL/CRAWL.warc.gz 10 | org,sgmjournals,ijs)//cgi/reprint/54/6/2217.tar.gz 20170706005950 http://mit.edu/file.tar.gz application/gzip 200 TQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz 11 | -------------------------------------------------------------------------------- /scalding/src/main/scala/sandcrawler/HBaseRowCountJob.scala: -------------------------------------------------------------------------------- 1 | package sandcrawler 2 | 3 | import java.util.Properties 4 | 5 | import cascading.property.AppProps 6 | import cascading.tuple.Fields 7 | import com.twitter.scalding._ 8 | import parallelai.spyglass.base.JobBase 9 | import parallelai.spyglass.hbase.HBaseConstants.SourceMode 10 | import parallelai.spyglass.hbase.HBasePipeConversions 11 | import parallelai.spyglass.hbase.HBaseSource 12 | 13 | class HBaseRowCountJob(args: Args) extends JobBase(args) with HBasePipeConversions { 14 | 15 | val output = args("output") 16 | 17 | HBaseRowCountJob.getHBaseSource( 18 | args("hbase-table"), 19 | args("zookeeper-hosts")) 20 | .read 21 | .debug 22 | .groupAll { _.size('count) } 23 | .write(Tsv(output)) 24 | } 25 | 26 | object HBaseRowCountJob { 27 | 28 | 
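  // Helper that builds an HBaseSource scanning every row, reading only the 'f:c' column.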
// eg, "wbgrp-journal-extract-0-qa",7 "mtrcs-zk1.us.archive.org:2181" 29 | def getHBaseSource(hbaseTable: String, zookeeperHosts: String) : HBaseSource = { 30 | HBaseBuilder.build( 31 | hbaseTable, 32 | zookeeperHosts, 33 | List("f:c"), 34 | SourceMode.SCAN_ALL) 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /python/scripts/pdf_thumbnail.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Quick CLI script to convert a PDF to thumbnail (.png, jpeg, etc). 4 | 5 | Originally used to benchmark and compare file size/quality. 6 | """ 7 | 8 | import sys 9 | 10 | import poppler 11 | from PIL import Image 12 | 13 | 14 | def run(inpath, outpath): 15 | 16 | try: 17 | pdf = poppler.load_from_file(inpath) 18 | page = pdf.create_page(0) 19 | except Exception as e: 20 | print(str(e), file=sys.stderr) 21 | sys.exit(0) 22 | 23 | renderer = poppler.PageRenderer() 24 | full_page = renderer.render_page(page) 25 | img = Image.frombuffer( 26 | "RGBA", (full_page.width, full_page.height), full_page.data, "raw", "BGRA", 0, 1 27 | ) 28 | img.thumbnail((180, 300), Image.BICUBIC) 29 | # img.thumbnail((360,600), Image.BICUBIC) 30 | img.save(outpath) 31 | # img.save(outpath, quality=95) 32 | 33 | 34 | if __name__ == "__main__": 35 | if len(sys.argv) != 3: 36 | print("expect two parameters: INPUT.png OUTPUT.png", file=sys.stderr) 37 | sys.exit(-1) 38 | run(sys.argv[1], sys.argv[2]) 39 | -------------------------------------------------------------------------------- /sql/stats/2021-11-01_table_sizes.txt: -------------------------------------------------------------------------------- 1 | 2 | Size: 832.66G 3 | 4 | table_name | table_size | indexes_size | total_size 5 | -------------------------------+------------+--------------+------------ 6 | "public"."crossref" | 311 GB | 9812 MB | 320 GB 7 | "public"."ingest_request" | 44 GB | 40 GB | 84 GB 8 | "public"."cdx" | 52 GB | 28 GB | 80 GB 9 | "public"."grobid" | 72 GB | 6952 MB | 79 GB 10 | "public"."ingest_file_result" | 38 GB | 40 GB | 78 GB 11 | "public"."grobid_shadow" | 67 GB | 5455 MB | 73 GB 12 | "public"."file_meta" | 34 GB | 21 GB | 54 GB 13 | "public"."pdf_meta" | 20 GB | 5813 MB | 26 GB 14 | "public"."fatcat_file" | 12 GB | 6602 MB | 18 GB 15 | "public"."shadow" | 9517 MB | 8026 MB | 17 GB 16 | "public"."html_meta" | 1200 MB | 8072 kB | 1208 MB 17 | "public"."petabox" | 403 MB | 461 MB | 864 MB 18 | "public"."pdftrio" | 550 MB | 297 MB | 847 MB 19 | (13 rows) 20 | -------------------------------------------------------------------------------- /notes/ingest/es_csv_to_json.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | input like: 5 | 6 | doi,ident,"release_stage" 7 | "10.7554/elife.38904",mxj534diw5gatc26rkif3io5xm,published 8 | "10.7554/elife.41855",kag74qc6dfex7ftpfkf7iaus44,published 9 | "10.7554/elife.41156",ienee5vxcbbbfhs2q54h4455hu,published 10 | "10.7554/elife.43230",52rpllol2rcndjqs3xfwcldeka,published 11 | "10.7554/elife.42591",fpz642gihrc3jd2vibg6gnjrxm,published 12 | 13 | output like: 14 | 15 | { 16 | "base_url": "https://doi.org/10.7554/elife.38904", 17 | "ext_ids": { 18 | "doi": "10.7554/elife.38904" 19 | }, 20 | "fatcat_release": "mxj534diw5gatc26rkif3io5xm", 21 | "release_stage": "published" 22 | } 23 | """ 24 | 25 | import csv, sys, json 26 | 27 | reader = csv.DictReader(sys.stdin) 28 | for row in reader: 29 | d = { 30 | "base_url": 
"https://doi.org/{}".format(row['doi']), 31 | "ext_ids": { 32 | "doi": row['doi'], 33 | }, 34 | "fatcat_release": row['ident'], 35 | "release_stage": row['release_stage'], 36 | } 37 | print(json.dumps(d)) 38 | -------------------------------------------------------------------------------- /notes/ingest/2020-03-oa_but_not_marked.md: -------------------------------------------------------------------------------- 1 | 2 | These are large journals with a high fraction of "in IA", but not marked as OA 3 | so not crawling regularly. 4 | 5 | TODO: add things like list of unpaywall ISSN / OA status to try and find more 6 | "practical" / bronze OA 7 | 8 | ## First Run 9 | 10 | https://fatcat.wiki/container/vmv647omwrhzzgeclyrnpc4him 11 | https://fatcat.wiki/container/waxwzq3cnbet3cmwccpuk4bel4 12 | https://fatcat.wiki/container/hjoli2j6qffdpaalkszryuidk4 13 | https://fatcat.wiki/container/fci57bxfsffvzllbssocnfsr3e 14 | https://fatcat.wiki/container/hd23c57sunhcnar5fbgxsn36lm 15 | https://fatcat.wiki/container/bliguyxhonfb7ghuykxgtg3oqe 16 | 17 | ## TODO 18 | 19 | https://fatcat.wiki/container/kn6dhptylrb77b5atyiom5ysjm no-pdf-link (but accessible) 20 | https://fatcat.wiki/container/s7bticdwizdmhll4taefg57jde no-pdf-link (easy?) 21 | 22 | https://fatcat.wiki/container/zm56axre7rgihh5sznxp65np5i large; no-pdf-link? 23 | https://fatcat.wiki/container/eb2lcnpf2zeezkmfckcvxw2pgi huge (20k+), not all OA? 24 | https://fatcat.wiki/container/adgy773dtra3xmrsynghcednqm broken? 25 | https://fatcat.wiki/container/w3gj5mynrnbtndalcc5jnhymym not OA? link-loop 26 | -------------------------------------------------------------------------------- /scalding/src/main/scala/sandcrawler/HBaseColCountJob.scala: -------------------------------------------------------------------------------- 1 | package sandcrawler 2 | 3 | import java.util.Properties 4 | 5 | import cascading.property.AppProps 6 | import cascading.tuple.Fields 7 | import com.twitter.scalding._ 8 | import parallelai.spyglass.base.JobBase 9 | import parallelai.spyglass.hbase.HBaseConstants.SourceMode 10 | import parallelai.spyglass.hbase.HBasePipeConversions 11 | import parallelai.spyglass.hbase.HBaseSource 12 | 13 | class HBaseColCountJob(args: Args) extends JobBase(args) with HBasePipeConversions { 14 | 15 | val output = args("output") 16 | 17 | HBaseColCountJob.getHBaseSource( 18 | args("hbase-table"), 19 | args("zookeeper-hosts"), 20 | args("column")) 21 | .read 22 | .debug 23 | .groupAll { _.size('count) } 24 | .write(Tsv(output)) 25 | } 26 | 27 | object HBaseColCountJob { 28 | 29 | // eg, "wbgrp-journal-extract-0-qa",7 "mtrcs-zk1.us.archive.org:2181" 30 | def getHBaseSource(hbaseTable: String, zookeeperHosts: String, col: String) : HBaseSource = { 31 | HBaseBuilder.build( 32 | hbaseTable, 33 | zookeeperHosts, 34 | List(col), 35 | SourceMode.SCAN_ALL) 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /pig/tests/files/sourcecode.cdx: -------------------------------------------------------------------------------- 1 | # match 2 | edu,cmu,cs,adm,reports-archive)/anon/usr0/ftp/usr0/anon/2002/cmu-cs-02-119.java 20170706005950 http://reports-archive.adm.cs.cmu.edu/anon/usr0/ftp/usr0/anon/2002/CMU-CS-02-119.java text/plain 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 361006 17120058 CITESEERX-CRAWL-2017-06-20-20170706004100259-00924-00932-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170706005946792-00926-31209~wbgrp-svc284.us.archive.org~8443.warc.gz 3 | # no 4 | 
fi,tkk,lib)/diss/2001/isbn951225459x/isbn951225459x.pyc 20170705074926 http://lib.tkk.fi/Diss/2001/isbn951225459X/isbn951225459X.pyc text/plain 200 KJBCOT7LGBNIAVGEGPUELK5OK6RTFORR - - 344175 255650124 CITESEERX-CRAWL-2017-06-20-20170705074433815-00129-00138-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705074843696-00134-31209~wbgrp-svc284.us.archive.org~8443.warc.gz 5 | # no 6 | org,oxfordjournals,nar)/cgi/reprint/gkl1060v1.pdf 20170706035441 http://nar.oxfordjournals.org/cgi/reprint/gkl1060v1.pdf text/html 301 OX6MLVDFURLT2KSYCXUYW2PZNOVFSEVF - - 697 49346051 CITESEERX-CRAWL-2017-06-20-20170706034741172-00140-00149-wbgrp-svc285/CITESEERX-CRAWL-2017-06-20-20170706035435634-00148-3671~wbgrp-svc285.us.archive.org~8443.warc.gz 7 | -------------------------------------------------------------------------------- /pig/tests/files/papers_domain_words.cdx: -------------------------------------------------------------------------------- 1 | #http://research.fit.edu/sealevelriselibrary/documents/doc_mgr/448/Florida_Keys_Low_Island_Biodiversity_&_SLR_-_Ross_et_al_2009.pdf 2 | #http://ijs.sgmjournals.org:80/cgi/reprint/54/6/2217.pdf 3 | #http://eprints.ecs.soton.ac.uk/12020/1/mind-the-semantic-gap.pdf 4 | #http://eprint.uq.edu.au/archive/00004120/01/R103_Forrester_pp.pdf 5 | 6 | # should match 4: 7 | 8 | edu,fit,research)/sealevelriselibrary/documents/doc_mgr/448/Florida_Keys_Low_Island_Biodiversity_&_SLR_-_Ross_et_al_2009.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3I - - 123 456 CRAWL/CRAWL.warc.gz 9 | org,sgmjournals,ijs)//cgi/reprint/54/6/2217.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 TQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz 10 | uk,ac,soton,ecs,eprints)/12020/1/mind-the-semantic-gap.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 NQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz 11 | au,edu,uq,eprint)/archive/00004120/01/R103_Forrester_pp.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 QQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz 12 | -------------------------------------------------------------------------------- /python/scripts/enrich_scored_matches.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Takes a "joined" TSV input stream: 4 | 5 | - sha1 6 | - dois (JSON list) 7 | - cdx (JSON object) 8 | - url 9 | - dt 10 | (etc) 11 | - mimetype 12 | - size (integer) 13 | 14 | And outputs JSON objects that can be imported into fatcat with the 15 | "matched" script.
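As a purely illustrative example (all values made up), an input row of `sha1:MFYA35DB...<TAB>["10.123/abc"]<TAB>{"url": "http://example.com/paper.pdf", "dt": "20180101000000"}<TAB>application/pdf<TAB>12345` would be emitted as `{"sha1": "<hex>", "dois": ["10.123/abc"], "cdx": [{"url": "http://example.com/paper.pdf", "dt": "20180101000000"}], "size": 12345, "mimetype": "application/pdf"}` (the sha1 is converted from base32 to lowercase hex).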
16 | 17 | No dependencies (only python3 stdlib) 18 | """ 19 | 20 | import base64 21 | import json 22 | import sys 23 | 24 | 25 | def run(): 26 | for line in sys.stdin: 27 | line = line.split("\t") 28 | assert len(line) == 5 29 | raw_sha1 = line[0].replace("sha1:", "") 30 | dois = json.loads(line[1]) 31 | cdx = json.loads(line[2]) 32 | mimetype = line[3] 33 | size = int(line[4]) 34 | 35 | sha1 = base64.b16encode(base64.b32decode(raw_sha1)).decode("ascii").lower() 36 | 37 | obj = dict( 38 | sha1=sha1, 39 | dois=dois, 40 | cdx=[dict(url=cdx["url"], dt=cdx["dt"])], 41 | size=size, 42 | mimetype=mimetype, 43 | ) 44 | print(json.dumps(obj)) 45 | 46 | 47 | if __name__ == "__main__": 48 | run() 49 | -------------------------------------------------------------------------------- /scalding/src/main/scala/sandcrawler/HBaseStatusCountJob.scala: -------------------------------------------------------------------------------- 1 | package sandcrawler 2 | 3 | import java.util.Properties 4 | 5 | import cascading.property.AppProps 6 | import cascading.tuple.Fields 7 | import com.twitter.scalding._ 8 | import com.twitter.scalding.typed.TDsl._ 9 | import org.apache.hadoop.hbase.io.ImmutableBytesWritable 10 | import org.apache.hadoop.hbase.util.Bytes 11 | import parallelai.spyglass.base.JobBase 12 | import parallelai.spyglass.hbase.HBaseConstants.SourceMode 13 | import parallelai.spyglass.hbase.HBasePipeConversions 14 | import parallelai.spyglass.hbase.HBaseSource 15 | 16 | class HBaseStatusCountJob(args: Args) extends JobBase(args) with HBasePipeConversions { 17 | 18 | val source = HBaseCountJob.getHBaseSource( 19 | args("hbase-table"), 20 | args("zookeeper-hosts"), 21 | "grobid0:status") 22 | 23 | val statusPipe : TypedPipe[String] = source 24 | .read 25 | .toTypedPipe[(ImmutableBytesWritable,ImmutableBytesWritable)]('key, 'status) 26 | .map { case (key, raw_status) => Bytes.toString(raw_status.copyBytes()) } 27 | 28 | statusPipe.groupBy { identity } 29 | .size 30 | .debug 31 | .write(TypedTsv[(String,Long)](args("output"))) 32 | } 33 | -------------------------------------------------------------------------------- /scalding/src/main/scala/sandcrawler/HBaseStatusCodeCountJob.scala: -------------------------------------------------------------------------------- 1 | package sandcrawler 2 | 3 | import java.util.Properties 4 | 5 | import cascading.property.AppProps 6 | import cascading.tuple.Fields 7 | import com.twitter.scalding._ 8 | import com.twitter.scalding.typed.TDsl._ 9 | import org.apache.hadoop.hbase.io.ImmutableBytesWritable 10 | import org.apache.hadoop.hbase.util.Bytes 11 | import parallelai.spyglass.base.JobBase 12 | import parallelai.spyglass.hbase.HBaseConstants.SourceMode 13 | import parallelai.spyglass.hbase.HBasePipeConversions 14 | import parallelai.spyglass.hbase.HBaseSource 15 | 16 | class HBaseStatusCodeCountJob(args: Args) extends JobBase(args) with HBasePipeConversions { 17 | 18 | val source = HBaseCountJob.getHBaseSource( 19 | args("hbase-table"), 20 | args("zookeeper-hosts"), 21 | "grobid0:status_code") 22 | 23 | val statusPipe : TypedPipe[Long] = source 24 | .read 25 | .toTypedPipe[(ImmutableBytesWritable,ImmutableBytesWritable)]('key, 'status_code) 26 | .map { case (key, raw_code) => Bytes.toLong(raw_code.copyBytes()) } 27 | 28 | statusPipe.groupBy { identity } 29 | .size 30 | .debug 31 | .write(TypedTsv[(Long,Long)](args("output"))) 32 | } 33 | -------------------------------------------------------------------------------- /pig/README.md: 
-------------------------------------------------------------------------------- 1 | 2 | As of March 2018, the archive runs Pig version 0.12.0, via CDH5.0.1 (Cloudera 3 | Distribution). 4 | 5 | "Local mode" unit tests in this folder run with Pig version 0.17.0 (controlled 6 | by `fetch_deps.sh`) due to [dependency/jar issues][pig-bug] in local mode of 7 | 0.12.0. 8 | 9 | [pig-bug]: https://issues.apache.org/jira/browse/PIG-3530 10 | 11 | ## Development and Testing 12 | 13 | To run tests, you need Java installed and `JAVA_HOME` configured. 14 | 15 | Fetch dependencies (including pig) from top-level directory: 16 | 17 | ./fetch_hadoop.sh 18 | 19 | Write `.pig` scripts in this directory, and add a python wrapper test to 20 | `./tests/` when done. Test vector files (input/output) can go in 21 | `./tests/files/`. 22 | 23 | Run the tests with: 24 | 25 | pipenv run pytest 26 | 27 | Could also, in theory, use a docker image ([local-pig][]), but it's pretty easy 28 | to just download. 29 | 30 | [local-pig]: https://hub.docker.com/r/chalimartines/local-pig 31 | 32 | ## Run in Production 33 | 34 | pig -param INPUT="/user/bnewbold/pdfs/global-20171227034923" \ 35 | -param OUTPUT="/user/bnewbold/pdfs/gwb-pdf-20171227034923-surt-filter" \ 36 | filter-cdx-paper-pdfs.pig 37 | -------------------------------------------------------------------------------- /notes/ingest/2023-10_dimensions.md: -------------------------------------------------------------------------------- 1 | # Dimensions OA list 2 | 3 | In 09/2023 dimensions.ai handed over a list of 11667892 DOI and URL pairs; for 4 | 1613390 of these we found an exact match in the fatcat file entity data. We could 5 | look up 11040477 URLs successfully via the CDX index, and found that 2526822 URLs 6 | were not in GWB as of 2023-10-16. 7 | 8 | Top 20 domains: 9 | 10 | ``` 11 | 613732 doi.org 12 | 150592 europepmc.org 13 | 98725 academic.oup.com 14 | 47932 journals.lww.com 15 | 46191 www.ncbi.nlm.nih.gov 16 | 31808 www.biodiversitylibrary.org 17 | 30290 arxiv.org 18 | 28737 zenodo.org 19 | 28375 hdl.handle.net 20 | 26226 pubs.aip.org 21 | 25667 dergipark.org.tr 22 | 17771 pubs.lib.uiowa.edu 23 | 17220 www.cairn.info 24 | 17134 osf.io 25 | 17035 www.mdpi.com 26 | 15459 archive.org 27 | 12586 www.preprints.org 28 | 11171 ojs.omniscient.sg 29 | 10938 hal.science 30 | 9918 dl.acm.org 31 | ``` 32 | 33 | Out of these 2.5M URLs alone, we could guess about 21350 OAI/OJS endpoints that 34 | we did not know about before. 35 | 36 | More on the comparison: [https://git.archive.org/martin/scratch/-/tree/master/SPECPRJCTS-3102-Dimensions](https://git.archive.org/martin/scratch/-/tree/master/SPECPRJCTS-3102-Dimensions) 37 | 38 | -------------------------------------------------------------------------------- /notes/tasks/2020-01-27_cleanup_cdx.md: -------------------------------------------------------------------------------- 1 | 2 | Accidentally seem to have backfilled many CDX lines with non-PDF content. 3 | Should clear these out!
4 | 5 | Something like: 6 | 7 | mimetype = 'text/html' 8 | not in file_meta 9 | 10 | Or maybe instead: 11 | 12 | mimetype = 'text/html' 13 | not in file_meta 14 | 15 | SQL: 16 | 17 | SELECT * FROM cdx WHERE mimetype = 'text/html' AND row_created < '2019-10-01' LIMIT 5; 18 | SELECT COUNT(1) FROM cdx WHERE mimetype = 'text/html' AND row_created < '2019-10-01'; 19 | => 24841846 20 | 21 | SELECT * FROM cdx LEFT JOIN file_meta ON file_meta.sha1hex = cdx.sha1hex WHERE cdx.mimetype = 'text/html' AND file_meta.sha256hex IS NULL LIMIT 5; 22 | SELECT COUNT(1) FROM cdx LEFT JOIN file_meta ON cdx.sha1hex = file_meta.sha1hex WHERE cdx.mimetype = 'text/html' AND file_meta.sha256hex IS NULL; 23 | => 24547552 24 | 25 | DELETE FROM cdx 26 | WHERE sha1hex IN 27 | (SELECT cdx.sha1hex 28 | FROM cdx 29 | LEFT JOIN file_meta ON file_meta.sha1hex = cdx.sha1hex 30 | WHERE cdx.mimetype = 'text/html' AND file_meta.sha256hex IS NULL); 31 | => DELETE 24553428 32 | 33 | Slightly more... probably should have had a "AND cdx.mimetype = 'text/html'" in 34 | the DELETE WHERE clause. 35 | -------------------------------------------------------------------------------- /scalding/src/main/scala/sandcrawler/HBaseCountJob.scala: -------------------------------------------------------------------------------- 1 | package sandcrawler 2 | 3 | import java.util.Properties 4 | 5 | import cascading.property.AppProps 6 | import cascading.tuple.Fields 7 | import com.twitter.scalding._ 8 | import parallelai.spyglass.base.JobBase 9 | import parallelai.spyglass.hbase.HBaseConstants.SourceMode 10 | import parallelai.spyglass.hbase.HBasePipeConversions 11 | import parallelai.spyglass.hbase.HBaseSource 12 | 13 | class HBaseCountJob(args: Args, colSpec: String) extends JobBase(args) with HBasePipeConversions { 14 | val output = args("output") 15 | HBaseBuilder.parseColSpec(colSpec) 16 | val Col: String = colSpec.split(":")(1) 17 | 18 | HBaseCountJob.getHBaseSource( 19 | args("hbase-table"), 20 | args("zookeeper-hosts"), 21 | colSpec) 22 | .read 23 | .fromBytesWritable(Symbol(Col)) 24 | .debug 25 | .groupBy(Col){group => group.size('count)} 26 | .write(Tsv(output)) 27 | } 28 | 29 | object HBaseCountJob { 30 | def getHBaseSource(hbaseTable: String, zookeeperHosts: String, colSpec: String) : HBaseSource = HBaseBuilder.build( 31 | hbaseTable, // HBase Table Name 32 | zookeeperHosts, // HBase Zookeeper server (to get runtime config info; can be array?) 33 | List(colSpec), 34 | SourceMode.SCAN_ALL) 35 | } 36 | -------------------------------------------------------------------------------- /sql/migrations/00000000000000_diesel_initial_setup/up.sql: -------------------------------------------------------------------------------- 1 | -- This file was automatically created by Diesel to setup helper functions 2 | -- and other internal bookkeeping. This file is safe to edit, any future 3 | -- changes will be added to existing projects as new migrations. 
4 | 5 | 6 | 7 | 8 | -- Sets up a trigger for the given table to automatically set a column called 9 | -- `updated_at` whenever the row is modified (unless `updated_at` was included 10 | -- in the modified columns) 11 | -- 12 | -- # Example 13 | -- 14 | -- ```sql 15 | -- CREATE TABLE users (id SERIAL PRIMARY KEY, updated_at TIMESTAMP NOT NULL DEFAULT NOW()); 16 | -- 17 | -- SELECT diesel_manage_updated_at('users'); 18 | -- ``` 19 | CREATE OR REPLACE FUNCTION diesel_manage_updated_at(_tbl regclass) RETURNS VOID AS $$ 20 | BEGIN 21 | EXECUTE format('CREATE TRIGGER set_updated_at BEFORE UPDATE ON %s 22 | FOR EACH ROW EXECUTE PROCEDURE diesel_set_updated_at()', _tbl); 23 | END; 24 | $$ LANGUAGE plpgsql; 25 | 26 | CREATE OR REPLACE FUNCTION diesel_set_updated_at() RETURNS trigger AS $$ 27 | BEGIN 28 | IF ( 29 | NEW IS DISTINCT FROM OLD AND 30 | NEW.updated_at IS NOT DISTINCT FROM OLD.updated_at 31 | ) THEN 32 | NEW.updated_at := current_timestamp; 33 | END IF; 34 | RETURN NEW; 35 | END; 36 | $$ LANGUAGE plpgsql; 37 | -------------------------------------------------------------------------------- /scalding/src/main/scala/sandcrawler/MatchBenchmarkJob.scala: -------------------------------------------------------------------------------- 1 | package sandcrawler 2 | 3 | import cascading.pipe.Pipe 4 | import com.twitter.scalding.Args 5 | import com.twitter.scalding.TypedPipe 6 | import com.twitter.scalding.TypedTsv 7 | import parallelai.spyglass.base.JobBase 8 | 9 | class MatchBenchmarkJob(args: Args) extends JobBase(args) { 10 | // TODO: Instantiate any subclass of Scorable specified in args. 11 | val sc1 : Scorable = new BibjsonScorable() 12 | val sc2 : Scorable = new BibjsonScorable() 13 | val leftArgs = args + ("bibjson-input" -> List(args("left-bibjson"))) 14 | val rightArgs = args + ("bibjson-input" -> List(args("right-bibjson"))) 15 | val pipe1 : TypedPipe[(String, ReduceFeatures)] = sc1.getInputPipe(leftArgs) 16 | val pipe2 : TypedPipe[(String, ReduceFeatures)] = sc2.getInputPipe(rightArgs) 17 | 18 | pipe1.join(pipe2) 19 | .map { entry => 20 | val (slug : String, (features1 : ReduceFeatures, features2 : ReduceFeatures)) = entry 21 | new ReduceOutput( 22 | slug, 23 | Scorable.computeSimilarity(features1, features2), 24 | features1.json, 25 | features2.json) 26 | } 27 | //TypedTsv doesn't work over case classes. 
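    // so unpack the ReduceOutput case class into a plain tuple of its fields before writing.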
28 | .map { entry => (entry.slug, entry.score, entry.json1, entry.json2) } 29 | .write(TypedTsv[(String, Int, String, String)](args("output"))) 30 | } 31 | -------------------------------------------------------------------------------- /scalding/src/main/scala/example/SimpleHBaseSourceExample.scala: -------------------------------------------------------------------------------- 1 | package example 2 | 3 | import com.twitter.scalding.{Tsv, Args} 4 | import parallelai.spyglass.base.JobBase 5 | import org.apache.log4j.{Level, Logger} 6 | import parallelai.spyglass.hbase.{HBasePipeConversions, HBaseSource} 7 | import parallelai.spyglass.hbase.HBaseConstants.SourceMode 8 | import cascading.tuple.Fields 9 | import cascading.property.AppProps 10 | import java.util.Properties 11 | 12 | /** 13 | * Simple example of HBaseSource usage 14 | */ 15 | class SimpleHBaseSourceExample(args: Args) extends JobBase(args) with HBasePipeConversions { 16 | 17 | val isDebug: Boolean = args("debug").toBoolean 18 | 19 | if (isDebug) Logger.getRootLogger.setLevel(Level.DEBUG) 20 | 21 | val output = args("output") 22 | 23 | val hbs = new HBaseSource( 24 | "table_name", 25 | //"quorum_name:2181", 26 | "mtrcs-zk1.us.archive.org:2181", // HBase Zookeeper server (to get runtime config info; can be array?) 27 | new Fields("key"), 28 | List("column_family"), 29 | List(new Fields("column_name1", "column_name2")), 30 | sourceMode = SourceMode.GET_LIST, keyList = List("1", "2", "3")) 31 | .read 32 | .debug 33 | .fromBytesWritable(new Fields("key", "column_name1", "column_name2")) 34 | .write(Tsv(output format "get_list")) 35 | 36 | } 37 | -------------------------------------------------------------------------------- /scalding/src/test/scala/example/WordCountTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2012 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | package example 17 | 18 | import com.twitter.scalding.{ JobTest, TextLine, TypedTsv } 19 | import org.scalatest.{ Matchers, WordSpec } 20 | 21 | class WordCountTest extends WordSpec with Matchers { 22 | "A WordCount job" should { 23 | JobTest(new example.WordCountJob(_)) 24 | .arg("input", "inputFile") 25 | .arg("output", "outputFile") 26 | .source(TextLine("inputFile"), List((0, "hack hack hack and hack"))) 27 | .sink[(String, Int)](TypedTsv[(String, Long)]("outputFile")){ outputBuffer => 28 | val outMap = outputBuffer.toMap 29 | "count words correctly" in { 30 | outMap("hack") shouldBe 4 31 | outMap("and") shouldBe 1 32 | } 33 | } 34 | .run 35 | .finish() 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /pig/filter-cdx-tarball.pig: -------------------------------------------------------------------------------- 1 | 2 | -- Tries to filter down a large CDX file (GWB index) to a subset of tarballs 3 | -- (.tar.gz). Intention is to find software code that isn't in, eg, git. 
4 | -- 5 | -- Author: Bryan Newbold 6 | -- Date: May 2018 7 | 8 | 9 | %default INPUT '' 10 | %default OUTPUT '' 11 | 12 | set mapreduce.job.queuename default 13 | 14 | cdx = LOAD '$INPUT' AS cdxline:chararray; 15 | cdx = FILTER cdx BY not STARTSWITH (cdxline, 'filedesc'); 16 | cdx = FILTER cdx BY not STARTSWITH (cdxline, ' '); 17 | 18 | cdx = FOREACH cdx GENERATE STRSPLIT(cdxline,'\\s+') as cols, cdxline; 19 | cdx = FOREACH cdx GENERATE (chararray)cols.$0 as surt, (chararray)cols.$1 as timestamp, (chararray)cols.$3 as mimetype, (chararray)cols.$4 as httpstatus, (chararray)cols.$5 as sha1sum, cdxline; 20 | cdx = FILTER cdx BY not surt matches '-'; 21 | cdx = FILTER cdx BY httpstatus matches '200'; 22 | cdx = FILTER cdx BY mimetype matches '.*(octet|gzip|gtar|tgz).*'; 23 | 24 | -- This is the core regex 25 | cdx = FILTER cdx 26 | -- .tar.gz in URL 27 | BY surt matches '(?i).+\\).*\\.tar\\.gz.*'; 28 | 29 | -- DISTINCT by sha1 column 30 | cdx_uniq = FOREACH (GROUP cdx BY sha1sum) { 31 | r = TOP(1, 0, $1); 32 | GENERATE FLATTEN(r); 33 | }; 34 | 35 | cdx_uniq = ORDER cdx_uniq by surt, timestamp PARALLEL 50; 36 | cdx_uniq = FOREACH cdx_uniq GENERATE cdxline; 37 | STORE cdx_uniq INTO '$OUTPUT' USING PigStorage(' '); 38 | 39 | -------------------------------------------------------------------------------- /python/sandcrawler/__init__.py: -------------------------------------------------------------------------------- 1 | from .db import SandcrawlerPostgresClient, SandcrawlerPostgrestClient 2 | from .grobid import GrobidBlobWorker, GrobidClient, GrobidWorker 3 | from .ia import ( 4 | CdxApiClient, 5 | CdxApiError, 6 | CdxPartial, 7 | CdxRow, 8 | PetaboxError, 9 | ResourceResult, 10 | SavePageNowBackoffError, 11 | SavePageNowClient, 12 | SavePageNowError, 13 | WarcResource, 14 | WaybackClient, 15 | WaybackContentError, 16 | WaybackError, 17 | ) 18 | from .ingest_file import IngestFileWorker 19 | from .ingest_fileset import IngestFilesetWorker 20 | from .misc import ( 21 | b32_hex, 22 | clean_url, 23 | gen_file_metadata, 24 | gen_file_metadata_path, 25 | parse_cdx_datetime, 26 | parse_cdx_line, 27 | ) 28 | from .pdfextract import PdfExtractBlobWorker, PdfExtractWorker 29 | from .pdftrio import PdfTrioBlobWorker, PdfTrioClient, PdfTrioWorker 30 | from .persist import ( 31 | PersistCdxWorker, 32 | PersistGrobidDiskWorker, 33 | PersistGrobidWorker, 34 | PersistIngestFileResultWorker, 35 | PersistIngestRequestWorker, 36 | PersistPdfTextWorker, 37 | PersistPdfTrioWorker, 38 | PersistThumbnailWorker, 39 | ) 40 | from .workers import ( 41 | BlackholeSink, 42 | CdxLinePusher, 43 | JsonLinePusher, 44 | KafkaCompressSink, 45 | KafkaJsonPusher, 46 | KafkaSink, 47 | MultiprocessWrapper, 48 | ZipfilePusher, 49 | ) 50 | -------------------------------------------------------------------------------- /sql/stats/2021-12-02_table_sizes.txt: -------------------------------------------------------------------------------- 1 | 2 | Size: 940.66G 3 | 4 | table_name | table_size | indexes_size | total_size 5 | ------------------------------------+------------+--------------+------------ 6 | "public"."crossref" | 394 GB | 10138 MB | 404 GB 7 | "public"."ingest_request" | 44 GB | 41 GB | 85 GB 8 | "public"."cdx" | 52 GB | 28 GB | 80 GB 9 | "public"."grobid" | 72 GB | 6978 MB | 79 GB 10 | "public"."ingest_file_result" | 38 GB | 41 GB | 78 GB 11 | "public"."grobid_shadow" | 67 GB | 5455 MB | 73 GB 12 | "public"."file_meta" | 34 GB | 21 GB | 55 GB 13 | "public"."pdf_meta" | 20 GB | 5930 MB | 26 GB 14 | "public"."grobid_refs" | 
19 GB | 1752 MB | 21 GB 15 | "public"."fatcat_file" | 13 GB | 7314 MB | 20 GB 16 | "public"."shadow" | 9517 MB | 8026 MB | 17 GB 17 | "public"."html_meta" | 1200 MB | 8072 kB | 1208 MB 18 | "public"."petabox" | 403 MB | 461 MB | 864 MB 19 | "public"."pdftrio" | 550 MB | 297 MB | 847 MB 20 | "public"."ingest_fileset_platform" | 8192 bytes | 16 kB | 24 kB 21 | "public"."crossref_with_refs" | 0 bytes | 0 bytes | 0 bytes 22 | (16 rows) 23 | -------------------------------------------------------------------------------- /sql/stats/2020-01-31_supplement.txt: -------------------------------------------------------------------------------- 1 | 2 | How many file_meta still missing core metadata? 3 | 4 | SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL; 5 | => 1,130,915 6 | 7 | Great! Not many. 8 | 9 | And are in petabox? 10 | 11 | SELECT COUNT(*) 12 | FROM file_meta 13 | LEFT JOIN petabox ON file_meta.sha1hex = petabox.sha1hex 14 | WHERE file_meta.sha256hex IS NULL 15 | AND file_meta.sha1hex IS NOT NULL; 16 | => 1,149,194 17 | 18 | Almost all; maybe just some CDX fetch failures or something in there. So, 19 | should run these on, eg, grobid2-vm. 20 | 21 | COPY ( 22 | SELECT row_to_json(petabox.*) 23 | FROM file_meta 24 | LEFT JOIN petabox ON file_meta.sha1hex = petabox.sha1hex 25 | WHERE file_meta.sha256hex IS NULL 26 | AND file_meta.sha1hex IS NOT NULL 27 | ) TO '/grande/snapshots/dump_grobid_petabox_todo.json'; 28 | 29 | Count of PDF files that GROBID processed and matched to a release (via 30 | glutton), but no PDF in `fatcat_file` (note: `fatcat_file` is out of date by a 31 | couple million files): 32 | 33 | SELECT COUNT(*) as total_count, COUNT(DISTINCT grobid.fatcat_release) as release_count 34 | FROM grobid 35 | LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex 36 | WHERE fatcat_file.sha1hex IS NULL 37 | AND grobid.fatcat_release IS NOT NULL; 38 | 39 | total_count | count 40 | -------------+--------- 41 | 5072452 | 4130405 42 | 43 | -------------------------------------------------------------------------------- /sql/stats/2022-11-23_table_sizes.txt: -------------------------------------------------------------------------------- 1 | PostgreSQL 13.2 - wbgrp-svc506.us.archive.org 2 | Size: 1.13T 3 | 4 | table_name | table_size | indexes_size | total_size 5 | ------------------------------------+------------+--------------+------------ 6 | "public"."crossref" | 459 GB | 10 GB | 470 GB 7 | "public"."grobid" | 98 GB | 13 GB | 112 GB 8 | "public"."cdx" | 63 GB | 45 GB | 108 GB 9 | "public"."ingest_request" | 53 GB | 52 GB | 105 GB 10 | "public"."ingest_file_result" | 46 GB | 55 GB | 100 GB 11 | "public"."file_meta" | 39 GB | 40 GB | 79 GB 12 | "public"."grobid_shadow" | 67 GB | 5455 MB | 73 GB 13 | "public"."pdf_meta" | 24 GB | 7466 MB | 31 GB 14 | "public"."grobid_refs" | 28 GB | 3306 MB | 31 GB 15 | "public"."fatcat_file" | 13 GB | 7314 MB | 20 GB 16 | "public"."shadow" | 9517 MB | 8026 MB | 17 GB 17 | "public"."html_meta" | 7879 MB | 68 MB | 7947 MB 18 | "public"."petabox" | 403 MB | 461 MB | 864 MB 19 | "public"."pdftrio" | 550 MB | 297 MB | 847 MB 20 | "public"."ingest_fileset_platform" | 8192 bytes | 16 kB | 24 kB 21 | "public"."crossref_with_refs" | 0 bytes | 0 bytes | 0 bytes 22 | -------------------------------------------------------------------------------- /proposals/brainstorm/2022-04-18_automated_heritrix_crawling.md: -------------------------------------------------------------------------------- 1 | 2 | status: brainstorming 3 | 4 | We continue to see issues 
with heritrix3-based crawling. Would like to have an 5 | option to switch to higher-throughput heritrix-based crawling. 6 | 7 | SPNv2 path would stick around at least for save-paper-now style ingest. 8 | 9 | 10 | ## Sketch 11 | 12 | Ingest requests are created continuously by fatcat, with daily spikes. 13 | 14 | Ingest workers run mostly in "bulk" mode, aka they don't make SPNv2 calls. 15 | `no-capture` responses are recorded in sandcrawler SQL database. 16 | 17 | Periodically (daily?), a script queries for new no-capture results, filtered to 18 | the most recent period. These are processed in a bit in to a URL list, then 19 | converted to a heritrix frontier, and sent to crawlers. This could either be an 20 | h3 instance (?), or simple `scp` to a running crawl directory. 21 | 22 | The crawler crawls, with usual landing page config, and draintasker runs. 23 | 24 | TODO: can we have draintasker/heritrix set a maximum WARC life? Like 6 hours? 25 | or, target a smaller draintasker item size, so they get updated more frequently 26 | 27 | Another SQL script dumps ingest requests from the *previous* period, and 28 | re-submits them for bulk-style ingest (by workers). 29 | 30 | The end result would be things getting crawled and updated within a couple 31 | days. 32 | 33 | 34 | ## Sketch 2 35 | 36 | Upload URL list to petabox item, wait for heritrix derive to run (!) 37 | -------------------------------------------------------------------------------- /pig/filter-cdx-source-code-crude.pig: -------------------------------------------------------------------------------- 1 | 2 | -- Tries to filter down a large CDX file (GWB index) to a subset of source code 3 | -- files by mimetype and file extension. 4 | -- This is pretty crude and requires the URL to end with the file extension. 5 | --- 6 | -- Author: Bryan Newbold 7 | -- Date: October 2019 8 | 9 | 10 | %default INPUT '' 11 | %default OUTPUT '' 12 | 13 | set mapreduce.job.queuename default 14 | 15 | cdx = LOAD '$INPUT' AS cdxline:chararray; 16 | cdx = FILTER cdx BY not STARTSWITH (cdxline, 'filedesc'); 17 | cdx = FILTER cdx BY not STARTSWITH (cdxline, ' '); 18 | 19 | cdx = FOREACH cdx GENERATE STRSPLIT(cdxline,'\\s+') as cols, cdxline; 20 | cdx = FOREACH cdx GENERATE (chararray)cols.$0 as surt, (chararray)cols.$1 as timestamp, (chararray)cols.$3 as mimetype, (chararray)cols.$4 as httpstatus, (chararray)cols.$5 as sha1sum, cdxline; 21 | cdx = FILTER cdx BY not surt matches '-'; 22 | cdx = FILTER cdx BY httpstatus matches '200'; 23 | cdx = FILTER cdx BY mimetype matches '.*text.*'; 24 | 25 | -- This is the core regex 26 | cdx = FILTER cdx 27 | 28 | -- file suffix 29 | BY surt matches '.*\\).*\\.(c|h|py|java)'; 30 | 31 | -- DISTINCT by sha1 column 32 | cdx_uniq = FOREACH (GROUP cdx BY sha1sum) { 33 | r = TOP(1, 0, $1); 34 | GENERATE FLATTEN(r); 35 | }; 36 | 37 | cdx_uniq = ORDER cdx_uniq by surt, timestamp PARALLEL 50; 38 | cdx_uniq = FOREACH cdx_uniq GENERATE cdxline; 39 | STORE cdx_uniq INTO '$OUTPUT' USING PigStorage(' '); 40 | 41 | -------------------------------------------------------------------------------- /notes/match_filter_enrich.txt: -------------------------------------------------------------------------------- 1 | 2 | This could all be a single scalding job eventually. 3 | 4 | First, run matchcrossref and dumpfilemeta, and copy the output down to an SSD 5 | somewhere. 
6 | 7 | bnewbold@ia601101$ zcat 2018-09-14-0559.05-dumpfilemeta.tsv.gz | wc -l 8 | 30728100 9 | 10 | Reduce down the scored matches to just {sha1, dois}, sorted: 11 | 12 | zcat 2018-08-27-2352.17-matchcrossref.tsv.gz | ./filter_scored_matches.py | pv -l | sort -S 8G > 2018-08-27-2352.17-matchcrossref.filtered.tsv 13 | # 5.79M 0:18:54 [5.11k/s] 14 | 15 | Join/merge the output: 16 | 17 | zcat 2018-09-14-0559.05-dumpfilemeta.tsv.gz | LC_ALL=C join -t$'\t' 2018-08-27-2352.17-matchcrossref.filtered.tsv - | pv -l | ./enrich_scored_matches.py | gzip > 2018-08-27-2352.17-matchcrossref.insertable.json.gz 18 | # 5.79M 0:09:09 [10.5k/s] 19 | 20 | ## Fatcat Insertable 21 | 22 | I can't remember now what the plan was for the 'insertable' output mode, which 23 | bundles {key, cdx, mime, and size} info along with the {slug, score, json1, 24 | json2} columns from the regular match script. The filter_scored_matches.py 25 | doesn't know what to do with those columns at the moment, and the output isn't 26 | sorted by slug... need to tweak scripts to fix this. 27 | 28 | In the meanwhile, as a work around just take the columns we want and re-sort: 29 | 30 | export LC_ALL=C 31 | zcat 2018-12-18-2237.09-matchcrossref.insertable.tsv.gz | cut -f2-5 | sort -S 8G -u | gzip > 2018-12-18-2237.09-matchcrossref.tsv.gz 32 | -------------------------------------------------------------------------------- /scalding/src/main/scala/sandcrawler/DumpGrobidStatusCodeJob.scala: -------------------------------------------------------------------------------- 1 | package sandcrawler 2 | 3 | import java.util.Properties 4 | 5 | import cascading.property.AppProps 6 | import cascading.tuple.Fields 7 | import com.twitter.scalding._ 8 | import com.twitter.scalding.typed.TDsl._ 9 | import org.apache.hadoop.hbase.io.ImmutableBytesWritable 10 | import org.apache.hadoop.hbase.util.Bytes 11 | import parallelai.spyglass.base.JobBase 12 | import parallelai.spyglass.hbase.HBaseConstants.SourceMode 13 | import parallelai.spyglass.hbase.HBasePipeConversions 14 | import parallelai.spyglass.hbase.HBaseSource 15 | 16 | // Dumps status code for each GROBID-processed file. Good for crawl/corpus 17 | // analytics, if we consider GROBID status a rough "is this a paper" metric. 
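// Output is a TSV of (row key, GROBID status code).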
18 | class DumpGrobidStatusCodeJob(args: Args) extends JobBase(args) with HBasePipeConversions { 19 | 20 | val metaPipe : TypedPipe[(String, Long)] = HBaseBuilder.build(args("hbase-table"), 21 | args("zookeeper-hosts"), 22 | List("grobid0:status_code"), 23 | SourceMode.SCAN_ALL) 24 | .read 25 | .toTypedPipe[(ImmutableBytesWritable,ImmutableBytesWritable)](new Fields("key", "status_code")) 26 | .filter { case (_, status_code) => status_code != null } 27 | .map { case (key, status_code) => 28 | (Bytes.toString(key.copyBytes()), 29 | Bytes.toLong(status_code.copyBytes())) 30 | }; 31 | 32 | metaPipe.write(TypedTsv[(String,Long)](args("output"))) 33 | 34 | } 35 | -------------------------------------------------------------------------------- /pig/tests/files/papers_edu_tilde.cdx: -------------------------------------------------------------------------------- 1 | #http://www.stanford.edu:80/~johntayl/Papers/taylor2.pdf 2 | #http://met.nps.edu/~mtmontgo/papers/isabel_part2.pdf 3 | #http://www.pitt.edu:80/~druzdzel/psfiles/ecai06.pdf 4 | #http://www.comp.hkbu.edu.hk/~ymc/papers/conference/ijcnn03_710.pdf 5 | 6 | # should be 6 matches: 7 | hk,edu,hkbu,comp)/~ymc/papers/conference/ijcnn03_710.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 LQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz 8 | edu,stanford,www)/~johntayl/Papers/taylor2.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 XQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz 9 | edu,nps,met)/~mtmontgo/papers/isabel_part2.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 PQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz 10 | edu,pitt,www)/~druzdzel/psfiles/ecai06.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 9QHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz 11 | jp,ac,pitt,www)/~druzdzel/psfiles/ecai06.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 8QHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz 12 | co,edu,pitt,www)/~druzdzel/psfiles/ecai06.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 7QHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz 13 | 14 | # NOT: 15 | com,corp,edu,pitt,www)/~druzdzel/psfiles/ecai06.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 6QHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz 16 | -------------------------------------------------------------------------------- /fetch_hadoop.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # This script was originally only for pig scripts; now it can also be used to 4 | # run scalding code locally (via please) 5 | 6 | set -euo pipefail 7 | 8 | #PIG_VERSION="0.12.0-cdh5.2.0" 9 | # Using more recent version to work around snappy classpath problem 10 | PIG_VERSION="0.17.0" 11 | HADOOP_VERSION="2.3.0-cdh5.0.1" 12 | 13 | mkdir -p pig/deps/ 14 | cd pig/deps/ 15 | 16 | # Fetch Hadoop Command 17 | echo https://archive.cloudera.com/cdh5/cdh/5/hadoop-${HADOOP_VERSION}.tar.gz 18 | #wget -c https://archive.cloudera.com/cdh5/cdh/5/pig-${HADOOP_VERSION}.tar.gz 19 | #wget -c https://archive.cloudera.com/cdh5/cdh/5/pig-${HADOOP_VERSION}.tar.gz 20 | wget -c https://archive.org/serve/hadoop_pig_mirror/hadoop-${HADOOP_VERSION}.tar.gz 21 | echo "Extracting Hadoop (takes a minute)..." 
22 | tar xvf hadoop-${HADOOP_VERSION}.tar.gz > /dev/null 23 | ln -fs hadoop-${HADOOP_VERSION} hadoop 24 | 25 | # Fetch Pig 26 | #wget -c https://archive.cloudera.com/cdh5/cdh/5/pig-${PIG_VERSION}.tar.gz 27 | #wget -c http://mirror.metrocast.net/apache/pig/pig-${PIG_VERSION}/pig-${PIG_VERSION}.tar.gz 28 | wget -c https://archive.org/serve/hadoop_pig_mirror/pig-${PIG_VERSION}.tar.gz 29 | echo "Extracting Pig (takes a minute)..." 30 | tar xvf pig-${PIG_VERSION}.tar.gz > /dev/null 31 | ln -fs pig-${PIG_VERSION} pig 32 | 33 | # No 'readlink -f' on macOS 34 | # https://stackoverflow.com/a/24572274/4682349 35 | JAVA_HOME=$(perl -MCwd -e 'print Cwd::abs_path shift' /usr/bin/java | sed "s:bin/java::") 36 | ./pig/bin/pig -x local -version 37 | ./hadoop/bin/hadoop version 38 | 39 | -------------------------------------------------------------------------------- /python/Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | name = "ia" 3 | url = "https://devpi.us.archive.org/wb/prod" 4 | verify_ssl = true 5 | 6 | [[source]] 7 | name = "pypi" 8 | url = "https://pypi.python.org/simple" 9 | verify_ssl = true 10 | 11 | [dev-packages] 12 | pytest = ">=4" 13 | pytest-pythonpath = "*" 14 | pytest-pylint = "*" 15 | responses = ">=0.10" 16 | pytest-cov = "*" 17 | pytest-mock = "*" 18 | pylint = "*" 19 | ipython = "*" 20 | mypy = "*" 21 | flake8 = "*" 22 | flake8-annotations = "*" 23 | isort = "*" 24 | types-requests = "*" 25 | types-beautifulsoup4 = "*" 26 | types-dateparser = "*" 27 | types-psycopg2 = "*" 28 | types-Pillow = "*" 29 | black = "*" 30 | 31 | [packages] 32 | requests = ">=2" 33 | confluent-kafka = "*" 34 | python-snappy = "*" 35 | boto3 = "*" 36 | minio = "<7.0.0" 37 | psycopg2 = "*" 38 | bs4 = "*" 39 | python-magic = "*" 40 | ftfy = "*" 41 | internetarchive = "*" 42 | urlcanon = "*" 43 | Pillow = ">=3" 44 | python-poppler = ">=0.2.1" 45 | selectolax = ">=0.2" 46 | # constraining trafilatura to prevent a version conflict with 47 | # `charset_normalizer`, between htmldate and requests 48 | trafilatura = ">=1,<1.4" 49 | htmldate= ">=1,<1.4" 50 | pydantic = ">=1.7" 51 | dateparser = "*" 52 | braveblock = "*" 53 | dynaconf = ">=3" 54 | sentry-sdk = { version = ">=0.14.0", extras = [] } 55 | zstandard = "*" 56 | grobid_tei_xml = ">=0.1.2,<0.2.0" 57 | PyMuPDF = ">=1.19.0,<1.20.0" 58 | 59 | [requires] 60 | python_version = "3.8" 61 | 62 | [packages.globalwayback] 63 | version = ">=0.6.5" 64 | index = "ia" 65 | 66 | [packages.wayback] 67 | version = ">=0.6.3" 68 | index = "ia" 69 | -------------------------------------------------------------------------------- /notes/old_extract_results.txt: -------------------------------------------------------------------------------- 1 | 2 | command: 3 | 4 | ./extraction_cdx_grobid.py --hbase-table wbgrp-journal-extract-0-qa --hbase-host bnewbold-dev.us.archive.org --grobid-uri http://wbgrp-svc096.us.archive.org:8070 -r hadoop -c mrjob.conf --archive $VENVSHORT.tar.gz#venv hdfs:///user/bnewbold/journal_crawl_cdx/citeseerx_crawl_2017.cdx --jobconf mapred.line.input.format.linespermap=8000 --jobconf mapreduce.job.queuename=extraction 5 | 6 | Started: Wed Apr 11 05:54:54 UTC 2018 7 | Finished: Sun Apr 15 20:42:37 UTC 2018 8 | (late saturday night PST fixed grobid parallelism) 9 | 10 | Elapsed: 110hrs, 47mins, 42sec 11 | 12 | line counts: 13 | error 3896 14 | existing 311209 15 | invalid 2311343 16 | skip 195641 17 | success 1143094 18 | total 3,965,183 19 | 20 | ## Against prod table 21 | 22 | Started: Sun Apr 15 
21:38:24 UTC 2018 23 | Finished: Wed Apr 18 17:36:44 UTC 2018 24 | Elapsed: 67hrs, 58mins, 20sec 25 | 26 | lines 27 | error 143 28 | existing 213292 29 | invalid 2311343 30 | skip 195641 31 | success 1,244,764 32 | total 3,965,183 33 | 34 | ## TARGETED 35 | 36 | Job job_1513499322977_358533 failed with state FAILED due to: Task failed task_1513499322977_358533_m_000323 37 | 38 | Started: Thu Apr 19 05:21:25 UTC 2018 39 | Finished: Sat Apr 21 11:01:58 UTC 2018 40 | Elapsed: 53hrs, 40mins, 33sec 41 | 42 | lines 43 | error=4093 44 | existing=55448 45 | invalid=688873 46 | skip=257533 47 | success=1,282,053 48 | total=2,288,000 49 | 50 | 51 | -------------------------------------------------------------------------------- /python_hadoop/tests/files/small.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Dummy Example File", 3 | "authors": [ 4 | {"name": "Brewster Kahle", "given_name": "Brewster", "surname": "Kahle"}, 5 | {"name": "J Doe", "given_name": "J", "surname": "Doe"} 6 | ], 7 | "journal": { 8 | "name": "Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678", 9 | "eissn": null, 10 | "issn": null, 11 | "issue": null, 12 | "publisher": null, 13 | "volume": null 14 | }, 15 | "date": "2000", 16 | "doi": null, 17 | "citations": [ 18 | { "authors": [{"name": "A Seaperson", "given_name": "A", "surname": "Seaperson"}], 19 | "date": "2001", 20 | "id": "b0", 21 | "index": 0, 22 | "issue": null, 23 | "journal": "Letters in the Alphabet", 24 | "publisher": null, 25 | "title": "Everything is Wonderful", 26 | "url": null, 27 | "volume": "20"}, 28 | { "authors": [], 29 | "date": "2011-03-28", 30 | "id": "b1", 31 | "index": 1, 32 | "issue": null, 33 | "journal": "The Dictionary", 34 | "publisher": null, 35 | "title": "All about Facts", 36 | "url": null, 37 | "volume": "14"} 38 | ], 39 | "abstract": "Everything you ever wanted to know about nothing", 40 | "body": "Introduction \nEverything starts somewhere, as somebody [1] once said. \n\n In Depth \n Meat \nYou know, for kids. 
\n Potatos \nQED.", 41 | "acknowledgement": null, 42 | "annex": null, 43 | "fatcat_release": null, 44 | "grobid_timestamp": "2018-04-02T00:31+0000", 45 | "grobid_version": "0.5.1-SNAPSHOT" 46 | } 47 | -------------------------------------------------------------------------------- /sql/Makefile: -------------------------------------------------------------------------------- 1 | 2 | SHELL=/bin/bash -euo pipefail 3 | TODAY ?= $(shell date --iso --utc) 4 | DATADIR ?= /srv/sandcrawler/tasks/$(TODAY) 5 | DATESLUG ?= $(shell date +%Y-%m-%d.%H%M%S) 6 | DATABASE_URL ?= sandcrawler 7 | 8 | .PHONY: help 9 | help: ## Print info about all commands 10 | @echo "Commands:" 11 | @echo 12 | @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf " \033[01;32m%-20s\033[0m %s\n", $$1, $$2}' 13 | 14 | .PHONY: create_datadir 15 | create_datadir: 16 | mkdir -p $(DATADIR)/ 17 | sudo chmod a+rw $(DATADIR)/ 18 | 19 | $(DATADIR)/.DB_DUMP: 20 | sudo -u postgres pg_dump --verbose --format=custom --exclude-table-data=crossref sandcrawler > $(DATADIR)/sandcrawler_${DATESLUG}.pgdump.wip 21 | mv $(DATADIR)/sandcrawler_${DATESLUG}.pgdump.wip $(DATADIR)/sandcrawler_${DATESLUG}.pgdump 22 | touch $@ 23 | 24 | .PHONY: database-snapshot 25 | database-snapshot: create_datadir $(DATADIR)/.DB_DUMP ## Create SQL database snapshot 26 | @echo 27 | 28 | $(DATADIR)/.DB_UPLOADED: $(DATADIR)/.DB_DUMP 29 | ia upload --checksum sandcrawler_sqldump_$(TODAY) ia_sqldump_item_readme.md --remote-name=README.md -m collection:webgroup-internal-backups -m mediatype:data -m creator:"Internet Archive Web Group" -m date:$(TODAY) -m title:"Sandcrawler SQL Database Snapshot ($(TODAY))" 30 | ia upload --checksum sandcrawler_sqldump_$(TODAY) $(DATADIR)/sandcrawler_*.pgdump 31 | touch $@ 32 | 33 | .PHONY: upload-database-snapshot 34 | upload-database-snapshot: create_datadir database-snapshot $(DATADIR)/.DB_UPLOADED ## Upload database snapshot to archive.org 35 | @echo 36 | -------------------------------------------------------------------------------- /extra/hbase/howto.md: -------------------------------------------------------------------------------- 1 | 2 | Commands can be run from any cluster machine with hadoop environment config 3 | set up. Most of these commands are run from the shell (start with `hbase 4 | shell`). There is only one AIT/Webgroup HBase instance/namespace; there may be 5 | QA/prod tables, but there are not QA/prod clusters. 6 | 7 | ## Create Table 8 | 9 | Create column families (note: not all individual columns) with something like: 10 | 11 | create 'wbgrp-journal-extract-0-qa', 'f', 'file', {NAME => 'grobid0', COMPRESSION => 'snappy'} 12 | 13 | ## Run Thrift Server Informally 14 | 15 | The Thrift server can technically be run from any old cluster machine that has 16 | Hadoop client stuff set up, using: 17 | 18 | hbase thrift start -nonblocking -c 19 | 20 | Note that this will run version 0.96, while the actual HBase service seems to 21 | be running 0.98. 
22 | 23 | To interact with this config, use happybase (python) config: 24 | 25 | conn = happybase.Connection("bnewbold-dev.us.archive.org", transport="framed", protocol="compact") 26 | # Test connection 27 | conn.tables() 28 | 29 | ## Queries From Shell 30 | 31 | Fetch all columns for a single row: 32 | 33 | hbase> get 'wbgrp-journal-extract-0-qa', 'sha1:3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ' 34 | 35 | Fetch multiple columns for a single row, using column families: 36 | 37 | hbase> get 'wbgrp-journal-extract-0-qa', 'sha1:3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ', 'f', 'file' 38 | 39 | Scan a fixed number of rows (here 5) starting at a specific key prefix, all 40 | columns: 41 | 42 | hbase> scan 'wbgrp-journal-extract-0-qa',{LIMIT=>5,STARTROW=>'sha1:A'} 43 | -------------------------------------------------------------------------------- /pig/filter-cdx-join-urls.pig: -------------------------------------------------------------------------------- 1 | 2 | -- 3 | -- Author: Bryan Newbold 4 | -- Date: May 2018 5 | 6 | %default INPUT_CDX '' 7 | %default INPUT_URLS '' 8 | %default OUTPUT '' 9 | 10 | REGISTER /home/webcrawl/pig-scripts/jars/ia-web-commons-jar-with-dependencies-CDH3.jar; 11 | REGISTER /home/webcrawl/pig-scripts/jars/pigtools.jar; 12 | DEFINE SURTURL pigtools.SurtUrlKey(); 13 | 14 | set mapreduce.job.queuename default 15 | 16 | urls = LOAD '$INPUT_URLS' USING PigStorage() AS url:chararray; 17 | surts = FOREACH urls GENERATE SURTURL(url) AS url_surt; 18 | surts = ORDER surts by url_surt ASC PARALLEL 10; 19 | surts = DISTINCT surts; 20 | 21 | cdx = LOAD '$INPUT_CDX' AS cdxline:chararray; 22 | cdx = FILTER cdx BY not STARTSWITH (cdxline, 'filedesc'); 23 | cdx = FILTER cdx BY not STARTSWITH (cdxline, ' '); 24 | 25 | cdx = FOREACH cdx GENERATE STRSPLIT(cdxline,'\\s+') as cols, cdxline; 26 | cdx = FOREACH cdx GENERATE (chararray)cols.$0 as cdx_surt, (chararray)cols.$1 as timestamp, (chararray)cols.$3 as mimetype, (chararray)cols.$4 as httpstatus, (chararray)cols.$5 as sha1sum, cdxline; 27 | cdx = FILTER cdx BY not cdx_surt matches '-'; 28 | cdx = FILTER cdx BY httpstatus matches '200'; 29 | cdx = FILTER cdx BY mimetype matches '.*pdf.*'; 30 | 31 | -- Core JOIN 32 | full_join = JOIN cdx BY cdx_surt, surts BY url_surt; 33 | 34 | -- DISTINCT by sha1 column 35 | full_uniq = FOREACH (GROUP full_join BY sha1sum) { 36 | r = TOP(1, 0, $1); 37 | GENERATE FLATTEN(r); 38 | }; 39 | 40 | result = FOREACH full_uniq GENERATE cdxline; 41 | result = DISTINCT result; 42 | 43 | STORE result INTO '$OUTPUT' USING PigStorage(); 44 | -------------------------------------------------------------------------------- /sql/dump_reingest_bulk.sql: -------------------------------------------------------------------------------- 1 | 2 | BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; 3 | 4 | COPY ( 5 | SELECT row_to_json(ingest_request.*) FROM ingest_request 6 | LEFT JOIN ingest_file_result ON 7 | ingest_file_result.base_url = ingest_request.base_url 8 | AND ingest_file_result.ingest_type = ingest_request.ingest_type 9 | WHERE 10 | (ingest_request.ingest_type = 'pdf' 11 | OR ingest_request.ingest_type = 'html') 12 | AND ingest_file_result.hit = false 13 | AND ingest_request.created < NOW() - '24 hour'::INTERVAL 14 | AND ingest_request.created > NOW() - '181 day'::INTERVAL 15 | AND (ingest_request.ingest_request_source = 'fatcat-changelog' 16 | OR ingest_request.ingest_request_source = 'fatcat-ingest') 17 | AND ( 18 | ingest_file_result.status like 'spn2-%' 19 | OR ingest_file_result.status like 'cdx-error' 20 
| OR ingest_file_result.status like 'petabox-error' 21 | ) 22 | AND ingest_file_result.status != 'spn2-error:invalid-url-syntax' 23 | AND ingest_file_result.status != 'spn2-error:filesize-limit' 24 | AND ingest_file_result.status != 'spn2-error:not-found' 25 | AND ingest_file_result.status != 'spn2-error:blocked-url' 26 | AND ingest_file_result.status != 'spn2-error:too-many-redirects' 27 | AND ingest_file_result.status != 'spn2-error:network-authentication-required' 28 | AND ingest_file_result.status != 'spn2-error:unknown' 29 | ) TO '/srv/sandcrawler/tasks/reingest_bulk_current.rows.json'; 30 | 31 | ROLLBACK; 32 | -------------------------------------------------------------------------------- /sql/dump_reingest_terminalstatus.sql: -------------------------------------------------------------------------------- 1 | 2 | BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; 3 | 4 | COPY ( 5 | SELECT row_to_json(ingest_request.*) FROM ingest_request 6 | LEFT JOIN ingest_file_result ON 7 | ingest_file_result.base_url = ingest_request.base_url 8 | AND ingest_file_result.ingest_type = ingest_request.ingest_type 9 | WHERE 10 | ingest_file_result.hit = false 11 | AND ingest_request.created < NOW() - '72 hour'::INTERVAL 12 | AND ingest_request.created > NOW() - '10 day'::INTERVAL 13 | AND (ingest_request.ingest_request_source = 'fatcat-changelog' 14 | OR ingest_request.ingest_request_source = 'fatcat-ingest') 15 | AND ingest_file_result.status = 'terminal-bad-status' 16 | AND ( 17 | ingest_file_result.terminal_status_code = 500 18 | OR ingest_file_result.terminal_status_code = 502 19 | OR ingest_file_result.terminal_status_code = 503 20 | OR ingest_file_result.terminal_status_code = 429 21 | OR ingest_file_result.terminal_status_code = 404 22 | ) 23 | AND ( 24 | ingest_request.base_url LIKE 'https://doi.org/10.3390/%' 25 | OR ingest_request.base_url LIKE 'https://doi.org/10.1103/%' 26 | OR ingest_request.base_url LIKE 'https://doi.org/10.1155/%' 27 | ) 28 | ) TO '/srv/sandcrawler/tasks/reingest_terminalstatus_current.rows.json'; 29 | 30 | -- bulk re-tries would be: 31 | -- AND (ingest_request.ingest_request_source != 'fatcat-changelog' 32 | -- AND ingest_request.ingest_request_source != 'fatcat-ingest') 33 | 34 | ROLLBACK; 35 | -------------------------------------------------------------------------------- /scalding/src/main/scala/sandcrawler/DumpFileMetaJob.scala: -------------------------------------------------------------------------------- 1 | package sandcrawler 2 | 3 | import java.util.Properties 4 | 5 | import cascading.property.AppProps 6 | import cascading.tuple.Fields 7 | import com.twitter.scalding._ 8 | import com.twitter.scalding.typed.TDsl._ 9 | import org.apache.hadoop.hbase.io.ImmutableBytesWritable 10 | import org.apache.hadoop.hbase.util.Bytes 11 | import parallelai.spyglass.base.JobBase 12 | import parallelai.spyglass.hbase.HBaseConstants.SourceMode 13 | import parallelai.spyglass.hbase.HBasePipeConversions 14 | import parallelai.spyglass.hbase.HBaseSource 15 | 16 | // Dumps all the info needed to insert a file entity in Fatcat. Useful for 17 | // joining. 
18 | class DumpFileMetaJob(args: Args) extends JobBase(args) with HBasePipeConversions { 19 | 20 | val metaPipe : TypedPipe[(String, String, String, Long)] = HBaseBuilder.build(args("hbase-table"), 21 | args("zookeeper-hosts"), 22 | List("file:cdx", "file:mime", "file:size"), 23 | SourceMode.SCAN_ALL) 24 | .read 25 | .toTypedPipe[(ImmutableBytesWritable,ImmutableBytesWritable,ImmutableBytesWritable,ImmutableBytesWritable)](new Fields("key", "cdx", "mime", "size")) 26 | .filter { case (_, cdx, mime, size) => cdx != null && mime != null && size != null } 27 | .map { case (key, cdx, mime, size) => 28 | (Bytes.toString(key.copyBytes()), 29 | Bytes.toString(cdx.copyBytes()), 30 | Bytes.toString(mime.copyBytes()), 31 | Bytes.toLong(size.copyBytes())) 32 | }; 33 | 34 | metaPipe.write(TypedTsv[(String,String,String,Long)](args("output"))) 35 | 36 | } 37 | -------------------------------------------------------------------------------- /scalding/src/test/scala/sandcrawler/HBaseBuilderTest.scala: -------------------------------------------------------------------------------- 1 | package sandcrawler 2 | 3 | import cascading.tuple.Fields 4 | import org.scalatest._ 5 | 6 | class HBaseBuilderTest extends FlatSpec with Matchers { 7 | "parseColSpecs()" should "work on legal nontrivial input" in { 8 | val (fams, fields) = HBaseBuilder.parseColSpecs(List("file:size", "file:cdx", "match0:status")) 9 | fams should have length 2 10 | fields should have length 2 11 | val fileIndex = fams.indexOf("file") 12 | fileIndex should not be -1 13 | fields(fileIndex) should be (new Fields("size", "cdx")) 14 | val match0Index = fams.indexOf("match0") 15 | match0Index should not be -1 16 | fields(match0Index) should be (new Fields("status")) 17 | } 18 | 19 | it should "work on empty input" in { 20 | val (fams, fields) = HBaseBuilder.parseColSpecs(List()) 21 | fams should have length 0 22 | fields should have length 0 23 | } 24 | 25 | //scalastyle:off no.whitespace.before.left.bracket 26 | it should "throw IllegalArgumentException on malformed input" in { 27 | a [IllegalArgumentException] should be thrownBy { 28 | HBaseBuilder.parseColSpecs(List("file_size")) 29 | } 30 | } 31 | 32 | it should "throw IllegalArgumentException on nonexistent family" in { 33 | a [IllegalArgumentException] should be thrownBy { 34 | HBaseBuilder.parseColSpecs(List("foo:bar")) 35 | } 36 | } 37 | 38 | it should "throw IllegalArgumentException on nonexistent column" in { 39 | a [IllegalArgumentException] should be thrownBy { 40 | HBaseBuilder.parseColSpecs(List("file:bar")) 41 | } 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /notes/ingest/2020-01-14_bulk.md: -------------------------------------------------------------------------------- 1 | 2 | Generate ingest requests from arabesque: 3 | 4 | zcat /data/arabesque/ARXIV-CRAWL-2019-10.arabesque.json.gz | ./arabesque2ingestrequest.py --link-source arxiv --extid-type arxiv --release-stage submitted - | shuf > /data/arabesque/ARXIV-CRAWL-2019-10.arabesque.ingest_request.json 5 | 6 | zcat /data/arabesque/PUBMEDCENTRAL-CRAWL-2019-10.arabesque.json.gz | ./arabesque2ingestrequest.py --link-source pmc --extid-type pmcid - | shuf > /data/arabesque/PUBMEDCENTRAL-CRAWL-2019-10.arabesque.ingest_request.json 7 | 8 | 9 | Quick tests locally: 10 | 11 | time head -n100 /data/arabesque/ARXIV-CRAWL-2019-10.arabesque.ingest_request.json |./ingest_file.py requests - > sample_arxiv.json 12 | time head -n100 
/data/arabesque/PUBMEDCENTRAL-CRAWL-2019-10.arabesque.ingest_request.json |./ingest_file.py requests - > sample_pubmed.json 13 | 14 | These are all wayback success; looking good! Single threaded, from home laptop 15 | (over tunnel), took about 9 minutes, or 5.5sec/pdf. That's pretty slow even 16 | with 30x parallelism. Should re-test on actual server. GROBID pre-check should 17 | help? 18 | 19 | With new bulk topic: 20 | 21 | head PUBMEDCENTRAL-CRAWL-2019-10.arabesque.ingest_request.json -n1000 | kafkacat -P -b localhost -t sandcrawler-prod.ingest-file-requests-bulk -p -1 22 | 23 | Ok, let them rip: 24 | 25 | cat PUBMEDCENTRAL-CRAWL-2019-10.arabesque.ingest_request.json -n1000 | kafkacat -P -b localhost -t sandcrawler-prod.ingest-file-requests-bulk -p -1 26 | cat ARXIV-CRAWL-2019-10.arabesque.ingest_request.json | kafkacat -P -b localhost -t sandcrawler-prod.ingest-file-requests-bulk -p -1 27 | -------------------------------------------------------------------------------- /scalding/src/main/scala/sandcrawler/GroupFatcatWorksJob.scala: -------------------------------------------------------------------------------- 1 | package sandcrawler 2 | 3 | import cascading.pipe.Pipe 4 | import com.twitter.scalding.Args 5 | import com.twitter.scalding.Stat 6 | import com.twitter.scalding.TypedPipe 7 | import com.twitter.scalding.TypedTsv 8 | import parallelai.spyglass.base.JobBase 9 | 10 | class GroupFatcatWorksJob(args: Args) extends JobBase(args) { 11 | 12 | val fatcatRowCount = Stat("fatcat-rows-filtered", "sandcrawler") 13 | val joinedRowCount = Stat("joined-rows", "sandcrawler") 14 | 15 | val fatcatScorable : Scorable = new FatcatScorable() 16 | val fatcatPipe : TypedPipe[(String, ReduceFeatures)] = fatcatScorable 17 | .getInputPipe(args) 18 | .map { r => 19 | fatcatRowCount.inc 20 | r 21 | } 22 | 23 | val joinedPipe = fatcatPipe 24 | .addTrap(TypedTsv(args("output") + ".trapped")) 25 | .join(fatcatPipe) 26 | 27 | // TypedTsv doesn't work over case classes. 28 | joinedPipe 29 | // filter out trivial self-matches (releases are identical) 30 | .filter { case (slug, (fatcatFeaturesLeft, fatcatFeaturesRight)) => 31 | Scorable.selfMatchable(fatcatFeaturesLeft, fatcatFeaturesRight) 32 | } 33 | .map { case (slug, (fatcatFeaturesLeft, fatcatFeaturesRight)) => 34 | joinedRowCount.inc 35 | new ReduceOutput( 36 | slug, 37 | Scorable.computeSimilarity(fatcatFeaturesLeft, fatcatFeaturesRight), 38 | fatcatFeaturesLeft.json, 39 | fatcatFeaturesRight.json) 40 | } 41 | .map { entry => (entry.slug, entry.score, entry.json1, entry.json2) } 42 | .write(TypedTsv[(String, Int, String, String)](args("output"))) 43 | } 44 | -------------------------------------------------------------------------------- /proposals/2021-09-13_src_ingest.md: -------------------------------------------------------------------------------- 1 | 2 | File Ingest Mode: 'src' 3 | ======================= 4 | 5 | Ingest type for "source" of works in document form. For example, tarballs of 6 | LaTeX source and figures, as published on arxiv.org and Pubmed Central. 7 | 8 | For now, presumption is that this would be a single file (`file` entity in 9 | fatcat). 
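As a concrete sketch (not part of the proposal itself), an ingest request for this mode might look like the following; the field names mirror existing `ingest_request` rows, and the URL and release identifier are taken from the examples at the end of this document:

    {
        "ingest_type": "src",
        "base_url": "https://arxiv.org/e-print/2109.00954v1",
        "ingest_request_source": "fatcat-ingest",
        "fatcat": {"release_ident": "akzp2lgqjbcbhpoeoitsj5k5hy"}
    }
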
10 | 11 | Initial mimetypes to allow: 12 | 13 | - text/x-tex 14 | - application/xml 15 | - application/gzip 16 | - application/x-bzip 17 | - application/x-bzip2 18 | - application/zip 19 | - application/x-tar 20 | - application/msword 21 | - application/vnd.openxmlformats-officedocument.wordprocessingml.document 22 | 23 | 24 | ## Fatcat Changes 25 | 26 | In the file importer, allow the additional mimetypes for 'src' ingest. 27 | 28 | Might keep ingest disabled on the fatcat side, at least initially. Eg, until 29 | there is some scope of "file scope", or other ways of treating 'src' tarballs 30 | separate from PDFs or other fulltext formats. 31 | 32 | 33 | ## Ingest Changes 34 | 35 | Allow additional terminal mimetypes for 'src' crawls. 36 | 37 | 38 | ## Examples 39 | 40 | arxiv:2109.00954v1 41 | fatcat:release_akzp2lgqjbcbhpoeoitsj5k5hy 42 | https://arxiv.org/format/2109.00954v1 43 | https://arxiv.org/e-print/2109.00954v1 44 | 45 | arxiv:1912.03397v2 46 | https://arxiv.org/format/1912.03397v2 47 | https://arxiv.org/e-print/1912.03397v2 48 | NOT: https://arxiv.org/pdf/1912.03397v2 49 | 50 | pmcid:PMC3767916 51 | https://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/08/03/PMC3767916.tar.gz 52 | 53 | For PMC, will need to use one of the .csv file lists to get the digit prefixes. 54 | -------------------------------------------------------------------------------- /python/tests/files/small.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Dummy Example File", 3 | "authors": [ 4 | { 5 | "name": "Brewster Kahle", 6 | "given_name": "Brewster", 7 | "surname": "Kahle", 8 | "affiliation": { 9 | "department": "Faculty ofAgricultrial Engineering", 10 | "laboratory": "Plant Physiology Laboratory", 11 | "institution": "Technion-Israel Institute of Technology", 12 | "address": { 13 | "postCode": "32000", 14 | "settlement": "Haifa", 15 | "country": "Israel" 16 | } 17 | } 18 | }, 19 | {"name": "J Doe", "given_name": "J", "surname": "Doe"} 20 | ], 21 | "journal": { 22 | "name": "Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678" 23 | }, 24 | "date": "2000", 25 | "citations": [ 26 | { "authors": [{"name": "A Seaperson", "given_name": "A", "surname": "Seaperson"}], 27 | "date": "2001", 28 | "id": "b0", 29 | "index": 0, 30 | "journal": "Letters in the Alphabet", 31 | "pages": "1-11", 32 | "title": "Everything is Wonderful", 33 | "volume": "20"}, 34 | { "authors": [], 35 | "date": "2011-03-28", 36 | "id": "b1", 37 | "index": 1, 38 | "journal": "The Dictionary", 39 | "title": "All about Facts", 40 | "volume": "14"} 41 | ], 42 | "abstract": "Everything you ever wanted to know about nothing", 43 | "body": "Introduction \nEverything starts somewhere, as somebody [1] once said. \n\n In Depth \n Meat \nYou know, for kids. 
\n Potatos \nQED.", 44 | "grobid_timestamp": "2018-04-02T00:31+0000", 45 | "grobid_version": "0.5.1-SNAPSHOT", 46 | "language_code": "en" 47 | } 48 | -------------------------------------------------------------------------------- /scalding/src/main/scala/sandcrawler/BibjsonScorable.scala: -------------------------------------------------------------------------------- 1 | package sandcrawler 2 | 3 | import scala.math 4 | import scala.util.parsing.json.JSON 5 | import scala.util.parsing.json.JSONObject 6 | 7 | import cascading.flow.FlowDef 8 | import cascading.tuple.Fields 9 | import com.twitter.scalding._ 10 | import com.twitter.scalding.typed.TDsl._ 11 | 12 | class BibjsonScorable extends Scorable { 13 | 14 | def getSource(args : Args) : Source = { 15 | TextLine(args("bibjson-input")) 16 | } 17 | 18 | def getFeaturesPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[Option[MapFeatures]] = { 19 | getSource(args).read 20 | .toTypedPipe[String](new Fields("line")) 21 | .map { BibjsonScorable.bibjsonToMapFeatures(_) } 22 | } 23 | } 24 | 25 | object BibjsonScorable { 26 | def bibjsonToMapFeatures(json : String) : Option[MapFeatures] = { 27 | Scorable.jsonToMap(json) match { 28 | case None => None 29 | case Some(map) => { 30 | if (map contains "title") { 31 | val title = Scorable.getString(map, "title") 32 | val doi = Scorable.getString(map, "doi") 33 | val sha1 = Scorable.getString(map, "sha") 34 | // TODO: year, authors (if available) 35 | if (title == null || title.isEmpty) { 36 | None 37 | } else { 38 | val sf : ScorableFeatures = ScorableFeatures.create(title=title, doi=doi, sha1=sha1) 39 | sf.toSlug match { 40 | case None => None 41 | case Some(slug) => Some(MapFeatures(slug, sf.toString)) 42 | } 43 | } 44 | } else { 45 | None 46 | } 47 | } 48 | } 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /sql/backfill/backfill_file_meta.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | This is a "one-time" tranform helper script for file_meta backfill into 4 | sandcrawler postgresql. 5 | 6 | Most of this file was copied from '../python/common.py'. 
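Input is expected on stdin as tab-separated rows with exactly five columns
(enforced by the assert below), in the same column order as the file_meta
table; the final column may be empty. A hypothetical invocation:

    ./backfill_file_meta.py < file_meta_dump.tsv
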
7 | """ 8 | 9 | import json, os, sys, collections 10 | import psycopg2 11 | import psycopg2.extras 12 | 13 | 14 | def insert(cur, batch): 15 | sql = """ 16 | INSERT INTO 17 | file_meta 18 | VALUES %s 19 | ON CONFLICT DO NOTHING; 20 | """ 21 | res = psycopg2.extras.execute_values(cur, sql, batch) 22 | 23 | def stdin_to_pg(): 24 | # no host means it will use local domain socket by default 25 | conn = psycopg2.connect(database="sandcrawler", user="postgres") 26 | cur = conn.cursor() 27 | counts = collections.Counter({'total': 0}) 28 | batch = [] 29 | for l in sys.stdin: 30 | if counts['raw_lines'] > 0 and counts['raw_lines'] % 10000 == 0: 31 | print("Progress: {}...".format(counts)) 32 | counts['raw_lines'] += 1 33 | if not l.strip(): 34 | continue 35 | info = l.split("\t") 36 | if not info: 37 | continue 38 | assert len(info) == 5 39 | info[-1] = info[-1].strip() or None 40 | batch.append(info) 41 | counts['total'] += 1 42 | if len(batch) >= 1000: 43 | insert(cur, batch) 44 | conn.commit() 45 | batch = [] 46 | counts['batches'] += 1 47 | if batch: 48 | insert(cur, batch) 49 | batch = [] 50 | conn.commit() 51 | cur.close() 52 | print("Done: {}".format(counts)) 53 | 54 | if __name__=='__main__': 55 | stdin_to_pg() 56 | -------------------------------------------------------------------------------- /notes/ingest/2022-03_oaipmh.md: -------------------------------------------------------------------------------- 1 | 2 | Martin did a fresh scrape of many OAI-PMH endpoints, and we should ingest/crawl. 3 | 4 | Note that Martin excluded many Indonesian endpoints, will need to follow-up on 5 | those. 6 | 7 | ## Prep 8 | 9 | Fetch metadata snapshot: 10 | 11 | wget https://archive.org/download/oai_pmh_partial_dump_2022_03_01/oai_pmh_partial_dump_2022_03_01.ndj.zst 12 | 13 | wget https://archive.org/download/oai_pmh_partial_dump_2022_03_01/oai_pmh_partial_dump_2022_03_01_urls.txt.zst 14 | 15 | Pre-filter out a bunch of prefixes we won't crawl (out of scope, and large): 16 | 17 | zstdcat /srv/sandcrawler/tasks/oai-pmh/oai_pmh_partial_dump_2022_03_01.ndj.zst \ 18 | | rg -v 'oai:kb.dk:' \ 19 | | rg -v 'oai:bdr.oai.bsb-muenchen.de:' \ 20 | | rg -v 'oai:hispana.mcu.es:' \ 21 | | rg -v 'oai:bnf.fr:' \ 22 | | rg -v 'oai:ukm.si:' \ 23 | | rg -v 'oai:biodiversitylibrary.org:' \ 24 | | rg -v 'oai:hsp.org:' \ 25 | | rg -v 'oai:repec:' \ 26 | | rg -v 'oai:n/a:' \ 27 | | rg -v 'oai:quod.lib.umich.edu:' \ 28 | | rg -v 'oai:americanae.aecid.es:' \ 29 | | rg -v 'oai:www.irgrid.ac.cn:' \ 30 | | rg -v 'oai:espace.library.uq.edu:' \ 31 | | rg -v 'oai:edoc.mpg.de:' \ 32 | | rg -v 'oai:bibliotecadigital.jcyl.es:' \ 33 | | rg -v 'oai:repository.erciyes.edu.tr:' \ 34 | | rg -v 'oai:krm.or.kr:' \ 35 | | ./scripts/oai2ingestrequest.py - \ 36 | | pv -l \ 37 | | gzip \ 38 | > /srv/sandcrawler/tasks/oai-pmh/oai_pmh_partial_dump_2022_03_01.requests.json.gz 39 | 40 | These failed to transform in the expected way; a change in JSON schema from last time? 41 | -------------------------------------------------------------------------------- /pig/join-cdx-sha1.pig: -------------------------------------------------------------------------------- 1 | 2 | -- 3 | -- Author: Bryan Newbold 4 | -- Date: December 2020 5 | -- 6 | -- This pig script is intended to run agains the full (many TByte) GWB CDX, and 7 | -- catch captures that match exact SHA1 (b32 encoded), regardless of mimetype. 
8 | -- 9 | -- The process is to filter the CDX for non-revisit HTTP 200s, sort this by 10 | -- SHA1 digest, then join with the (pre-sorted) SHA1 -- b32 input list, and dump 11 | -- output. 12 | 13 | %default INPUT_CDX '' 14 | %default INPUT_DIGEST '' 15 | %default OUTPUT '' 16 | 17 | set mapreduce.job.queuename default 18 | 19 | digests = LOAD '$INPUT_DIGEST' AS sha1b32:chararray; 20 | digests = ORDER digests by sha1b32 ASC PARALLEL 20; 21 | digests = DISTINCT digests; 22 | 23 | cdx = LOAD '$INPUT_CDX' AS cdxline:chararray; 24 | cdx = FILTER cdx BY not STARTSWITH (cdxline, 'filedesc'); 25 | cdx = FILTER cdx BY not STARTSWITH (cdxline, ' '); 26 | 27 | cdx = FOREACH cdx GENERATE STRSPLIT(cdxline,'\\s+') as cols, cdxline; 28 | cdx = FOREACH cdx GENERATE (chararray)cols.$0 as cdx_surt, (chararray)cols.$1 as timestamp, (chararray)cols.$3 as mimetype, (chararray)cols.$4 as httpstatus, (chararray)cols.$5 as sha1b32, cdxline; 29 | cdx = FILTER cdx BY not cdx_surt matches '-'; 30 | cdx = FILTER cdx BY httpstatus matches '200'; 31 | cdx = FILTER cdx BY not mimetype matches 'warc/revisit'; 32 | cdx = ORDER cdx BY sha1b32 ASC PARALLEL 40; 33 | 34 | -- TODO: DISTINCT by (sha1b32, cdx_surt) for efficiency 35 | 36 | -- Core JOIN 37 | full_join = JOIN cdx BY sha1b32, digests BY sha1b32; 38 | 39 | -- TODO: at most, say 5 CDX lines per sha1b32? 40 | 41 | result = FOREACH full_join GENERATE cdxline; 42 | 43 | STORE result INTO '$OUTPUT' USING PigStorage(); 44 | -------------------------------------------------------------------------------- /pig/tests/test_join_cdx.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | import unittest 4 | import tempfile 5 | import subprocess 6 | from pighelper import PigTestHelper, count_lines 7 | 8 | class TestJoinCDXSha1(PigTestHelper): 9 | 10 | def run_pig_join(self, script_path, cdx_file, digest_file, **kwargs): 11 | """Convenience helper around run_pig(). 12 | 13 | INPUT parameter is set to in_file. 14 | OUTPUT parameter is set to a random file. 15 | Any keyword args are passed as parameters. 16 | """ 17 | 18 | pargs = [] 19 | for key, value in kwargs.items(): 20 | pargs.append('-p') 21 | pargs.append('{}={}'.format(key, value)) 22 | 23 | out_file = tempfile.mktemp(dir=self._tmpdir) 24 | params = [ 25 | '-f', script_path, 26 | '-p', 'INPUT_CDX={}'.format(cdx_file), 27 | '-p', 'INPUT_DIGEST={}'.format(digest_file), 28 | '-p', 'OUTPUT={}'.format(out_file), 29 | ] + pargs 30 | status = self.run_pig_raw(params) 31 | assert status.returncode == 0 32 | # Capture all the part-r-* files 33 | print("out_file: {}".format(out_file)) 34 | subprocess.run("/bin/ls -la {}/part-*".format(out_file), shell=True) 35 | sub = subprocess.run("/bin/cat {}/part-*".format(out_file), stdout=subprocess.PIPE, shell=True) 36 | out = sub.stdout.decode('utf-8') 37 | print(out) 38 | return out 39 | 40 | # TODO: helper to verify that output matches an expected file 41 | 42 | def test_thing(self): 43 | r = self.run_pig_join("join-cdx-sha1.pig", "tests/files/example.cdx", "tests/files/example.sha1b32") 44 | assert count_lines(r) == 4 45 | -------------------------------------------------------------------------------- /proposals/20201012_no_capture.md: -------------------------------------------------------------------------------- 1 | 2 | status: work-in-progress 3 | 4 | NOTE: as of December 2022, bnewbold can't remember if this was fully 5 | implemented or not. 
6 | 7 | Storing no-capture missing URLs in `terminal_url` 8 | ================================================= 9 | 10 | Currently, when the bulk-mode ingest code terminates with a `no-capture` 11 | status, the missing URL (which is not in GWB CDX) is not stored in 12 | sandcrawler-db. This proposed change is to include it in the existing 13 | `terminal_url` database column, with the `terminal_status_code` and 14 | `terminal_dt` columns empty. 15 | 16 | The implementation is rather simple: 17 | 18 | - CDX lookup code path should save the *actual* final missing URL (`next_url` 19 | after redirects) in the result object's `terminal_url` field 20 | - ensure that this field gets passed through all the way to the database on the 21 | `no-capture` code path 22 | 23 | This change does change the semantics of the `terminal_url` field somewhat, and 24 | could break existing assumptions, so it is being documented in this proposal 25 | document. 26 | 27 | 28 | ## Alternatives 29 | 30 | The current status quo is to store the missing URL as the last element in the 31 | "hops" field of the JSON structure. We could keep this and have a convoluted 32 | pipeline that would read from the Kafka feed and extract them, but this would 33 | be messy. Eg, re-ingesting would not update the old kafka messages, so we could 34 | need some accounting of consumer group offsets after which missing URLs are 35 | truly missing. 36 | 37 | We could add a new `missing_url` database column and field to the JSON schema, 38 | for this specific use case. This seems like unnecessary extra work. 39 | 40 | -------------------------------------------------------------------------------- /notes/examples/dataset_examples.txt: -------------------------------------------------------------------------------- 1 | 2 | ### ArchiveOrg: CAT dataset 3 | 4 | 5 | 6 | `release_36vy7s5gtba67fmyxlmijpsaui` 7 | 8 | ### 9 | 10 | 11 | 12 | doi:10.1371/journal.pone.0120448 13 | 14 | Single .rar file 15 | 16 | ### Dataverse 17 | 18 | 19 | 20 | Single excel file 21 | 22 | ### Dataverse 23 | 24 | 25 | 26 | doi:10.7910/DVN/CLSFKX 27 | 28 | Mulitple files; multiple versions? 29 | 30 | API fetch: 31 | 32 | .data.id 33 | .data.latestVersion.datasetPersistentId 34 | .data.latestVersion.versionNumber, .versionMinorNumber 35 | .data.latestVersion.files[] 36 | .dataFile 37 | .contentType (mimetype) 38 | .filename 39 | .filesize (int, bytes) 40 | .md5 41 | .persistendId 42 | .description 43 | .label (filename?) 44 | .version 45 | 46 | Single file inside: 47 | 48 | Download single file: (redirects to AWS S3) 49 | 50 | Dataverse refs: 51 | - 'doi' and 'hdl' are the two persistentId styles 52 | - file-level persistentIds are optional, on a per-instance basis: https://guides.dataverse.org/en/latest/installation/config.html#filepidsenabled 53 | -------------------------------------------------------------------------------- /python/README.md: -------------------------------------------------------------------------------- 1 | 2 | This directory contains `sandcrawler` python code for ingest pipelines, batch 3 | processing, PDF extraction, etc. 
4 | 5 | 6 | ## Development Quickstart 7 | 8 | As of December 2022, working with this code requires: 9 | 10 | - Python 3.8 (specifically, due to version specification in `pipenv`) 11 | - `pipenv` for python dependency management 12 | - generic and python-specific build tools (`pkg-config`, `python-dev`, etc) 13 | - poppler (PDF processing library) 14 | - libmagic 15 | - libsodium 16 | - access to IA internal packages (`devpi.us.archive.org`), specifically for 17 | globalwayback and related packages 18 | 19 | In production and CI we use Ubuntu Focal (20.04). The CI script for this 20 | repository (`../.gitlab-ci.yml`) is the best place to look for a complete list 21 | of dependencies for both development and deployment. Note that our CI system 22 | runs from our cluster, which resolves the devpi access issue. For developer 23 | laptops, you may need `sshuttle` or something similar set up to do initial 24 | package pulls. 25 | 26 | It is recommended to set the env variable `PIPENV_VENV_IN_PROJECT=true` when 27 | working with pipenv. You can include this in a `.env` file. 28 | 29 | There is a Makefile which helps with the basics. Eg: 30 | 31 | # install deps using pipenv 32 | make deps 33 | 34 | # run python tests 35 | make test 36 | 37 | # run code formatting and lint checks 38 | make fmt lint 39 | 40 | Sometimes when developing it is helpful to enter a shell with pipenv, eg: 41 | 42 | pipenv shell 43 | 44 | Often when developing it is helpful (or necessary) to set environment 45 | variables. `pipenv shell` will read from `.env`, so you can copy and edit 46 | `example.env`, and it will be used in tests, `pipenv shell`, etc. 47 | -------------------------------------------------------------------------------- /python/scripts/ingestrequest_row2json.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | This script is used to turn ingest request postgres rows (in JSON export 4 | format) back in to regular ingest request JSON. 5 | 6 | The only difference is the name and location of some optional keys. 
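For example (file path illustrative; it matches the output location used by
the dump_reingest_*.sql queries in ../sql/):

    ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_bulk_current.rows.json > reingest_requests.json

Pass --force-recrawl to set the force_recrawl (SPNv2) flag on every request.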
7 | """ 8 | 9 | import argparse 10 | import json 11 | import sys 12 | 13 | 14 | def transform(row): 15 | """ 16 | dict-to-dict 17 | """ 18 | row.pop("created", None) 19 | extra = row.pop("request", None) or {} 20 | for k in ("ext_ids", "edit_extra"): 21 | if k in extra: 22 | row[k] = extra[k] 23 | if "release_ident" in extra: 24 | row["fatcat"] = dict(release_ident=extra["release_ident"]) 25 | return row 26 | 27 | 28 | def run(args): 29 | for l in args.json_file: 30 | if not l.strip(): 31 | continue 32 | try: 33 | req = transform(json.loads(l)) 34 | except Exception as e: 35 | print(e, file=sys.stderr) 36 | print(l, file=sys.stderr) 37 | continue 38 | if args.force_recrawl: 39 | req["force_recrawl"] = True 40 | print(json.dumps(req, sort_keys=True)) 41 | 42 | 43 | def main(): 44 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 45 | parser.add_argument( 46 | "json_file", help="SQL output JSON file to process", type=argparse.FileType("r") 47 | ) 48 | parser.add_argument( 49 | "--force-recrawl", 50 | action="store_true", 51 | help="whether to add recrawl (SPNv2) flag to request", 52 | ) 53 | subparsers = parser.add_subparsers() 54 | 55 | args = parser.parse_args() 56 | 57 | run(args) 58 | 59 | 60 | if __name__ == "__main__": 61 | main() 62 | -------------------------------------------------------------------------------- /scalding/src/main/scala/sandcrawler/DumpGrobidXmlJob.scala: -------------------------------------------------------------------------------- 1 | package sandcrawler 2 | 3 | import java.util.Properties 4 | 5 | import cascading.property.AppProps 6 | import cascading.tuple.Fields 7 | import com.twitter.scalding._ 8 | import com.twitter.scalding.typed.TDsl._ 9 | import org.apache.hadoop.hbase.io.ImmutableBytesWritable 10 | import org.apache.hadoop.hbase.util.Bytes 11 | import parallelai.spyglass.base.JobBase 12 | import parallelai.spyglass.hbase.HBaseConstants.SourceMode 13 | import parallelai.spyglass.hbase.HBasePipeConversions 14 | import parallelai.spyglass.hbase.HBaseSource 15 | import scala.util.parsing.json.JSONObject 16 | 17 | // Dumps the SHA1 key and grobid0:tei_xml columns, as TSV/JSON (two TSV 18 | // columns: one is key, second is JSON). 
Used for partner delivery/sharing 19 | class DumpGrobidXmlJob(args: Args) extends JobBase(args) with HBasePipeConversions { 20 | 21 | val metaPipe : TypedPipe[(String, String)] = HBaseBuilder.build(args("hbase-table"), 22 | args("zookeeper-hosts"), 23 | List("file:cdx", "grobid0:tei_xml"), 24 | SourceMode.SCAN_ALL) 25 | .read 26 | .toTypedPipe[(ImmutableBytesWritable,ImmutableBytesWritable,ImmutableBytesWritable)](new Fields("key", "cdx", "tei_xml")) 27 | .filter { case (_, cdx, tei_xml) => cdx != null && tei_xml != null } 28 | .map { case (key, cdx, tei_xml) => 29 | (Bytes.toString(key.copyBytes()), 30 | JSONObject( 31 | Map( 32 | "pdf_hash" -> Bytes.toString(key.copyBytes()), 33 | "cdx_metadata" -> Bytes.toString(cdx.copyBytes()), 34 | "tei_xml" -> Bytes.toString(tei_xml.copyBytes()) 35 | )).toString 36 | ) 37 | }; 38 | 39 | metaPipe.write(TypedTsv[(String,String)](args("output"))) 40 | 41 | } 42 | -------------------------------------------------------------------------------- /.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | 2 | image: ubuntu:focal 3 | 4 | variables: 5 | LC_ALL: "C.UTF-8" 6 | LANG: "C.UTF-8" 7 | DEBIAN_FRONTEND: "noninteractive" 8 | 9 | before_script: 10 | - apt update -qy 11 | - apt install -y --no-install-recommends apt-transport-https software-properties-common curl dirmngr gpg-agent 12 | # scala-sbt.org APT signing key 13 | - apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 0x2EE0EA64E40A89B84B2DF73499E82A75642AC823 14 | - apt-add-repository -y "deb https://repo.scala-sbt.org/scalasbt/debian all main" 15 | - apt install -y --no-install-recommends python3-dev python3-pip python3-wheel libjpeg-dev openjdk-8-jdk-headless sbt libpq-dev python-dev python3.8 python3.8-dev python3.8-venv python3.8-distutils pkg-config python3-pytest git libsnappy-dev libsodium-dev libpoppler-cpp-dev cmake libpython3.8-dev build-essential poppler-data libmagic1 pipenv wget 16 | - pipenv --version 17 | 18 | test_python: 19 | script: 20 | - cd python 21 | - cp example.env .env 22 | - pipenv install --dev --deploy 23 | - make coverage 24 | - make lint 25 | 26 | test_python_hadoop: 27 | when: manual 28 | script: 29 | - cd python_hadoop 30 | - pipenv install --dev --deploy 31 | - pipenv run pytest --cov 32 | 33 | # needs fixing; some upstream com.hadoop.gplcompression#hadoop-lzo;0.4.16: java.lang.NullPointerException 34 | # change happened 35 | test_scalding: 36 | when: manual 37 | script: 38 | - ./please -h 39 | - cd scalding 40 | - sbt -mem 1024 test 41 | - sbt -mem 1024 assembly 42 | 43 | # Needs fixing 44 | test_pig: 45 | when: manual 46 | script: 47 | - ./fetch_hadoop.sh 48 | - cd pig 49 | - pipenv install --dev --deploy 50 | - JAVA_HOME=$(readlink -f /usr/bin/java | sed "s:bin/java::") pipenv run pytest 51 | -------------------------------------------------------------------------------- /notes/ingest/NEXT.md: -------------------------------------------------------------------------------- 1 | 2 | biorxiv 3 | medrxiv 4 | doi:10.1101\/20* 5 | 6 | persee.fr 147k 7 | publisher:persee in_ia:false is_oa:true 8 | https://www.persee.fr/doc/pumus_1164-5385_1992_num_2_1_1013 9 | 10 | cairn.info: 161k 11 | doi_prefix:10.3917 in_ia:false is_oa:true 12 | https://www.cairn.info/revue-afrique-contemporaine-2011-3-page-161.htm 13 | https://www.cairn.info/revue-cahiers-de-psychologie-clinique-2014-1-page-209.htm 14 | 15 | IOP OA: 169k 16 | doi_prefix:10.1088 is_oa:true in_ia:false 17 | 18 | indian journals platform? 
124k 19 | doi_prefix:10.4103 in_ia:false is_oa:true 20 | http://www.urologyannals.com/article.asp?issn=0974-7796;year=2011;volume=3;issue=3;spage=138;epage=140;aulast=Ahmad 21 | http://www.neurologyindia.com/article.asp?issn=0028-3886;year=2011;volume=59;issue=4;spage=612;epage=615;aulast=Utsuki 22 | 23 | openedition? 48k 24 | doi_prefix:10.4000 is_oa:true in_ia:false 25 | 26 | german medical science (GMS) 28k 27 | doi_prefix:10.3205 in_ia:false is_oa:true 28 | https://www.egms.de/static/en/journals/zma/2015-32/zma000965.shtml 29 | 30 | siberian chemistry 28k 31 | doi_prefix:10.2298 in_ia:false is_oa:true 32 | http://www.doiserbia.nb.rs/Article.aspx?ID=0352-51391000105H 33 | 34 | jalc oa doi: 82k 35 | doi_registrar:jalc in_ia:false is_oa:true 36 | 37 | sage OA papers 38 | https://journals.sagepub.com/doi/10.1177/034003529802400510 39 | 40 | Scientific Reports: 25k 41 | in_ia:false container_id:"tnqhc2x2aneavcd3gx5h7mswhm" 42 | 43 | U Toronto press: 23k 44 | publisher:"Toronto Press" in_ia:false is_oa:true 45 | has an annoying bounce page 46 | 47 | ASHA (speech-language-hearing association): 7k 48 | publisher:Speech-Language-Hearing in_ia:false is_oa:true 49 | 50 | MIT press journals 51 | 52 | 53 | -------------------------------------------------------------------------------- /scalding/src/main/scala/sandcrawler/ScoreJob.scala: -------------------------------------------------------------------------------- 1 | package sandcrawler 2 | 3 | import cascading.pipe.Pipe 4 | import com.twitter.scalding.Args 5 | import com.twitter.scalding.Stat 6 | import com.twitter.scalding.TypedPipe 7 | import com.twitter.scalding.TypedTsv 8 | import parallelai.spyglass.base.JobBase 9 | 10 | class ScoreJob(args: Args) extends JobBase(args) { 11 | 12 | val grobidRowCount = Stat("grobid-rows-filtered", "sandcrawler") 13 | val crossrefRowCount = Stat("crossref-rows-filtered", "sandcrawler") 14 | val joinedRowCount = Stat("joined-rows", "sandcrawler") 15 | 16 | val grobidScorable : Scorable = new GrobidScorable() 17 | val crossrefScorable : Scorable = new CrossrefScorable() 18 | val grobidPipe : TypedPipe[(String, ReduceFeatures)] = grobidScorable 19 | .getInputPipe(args) 20 | .map { r => 21 | grobidRowCount.inc 22 | r 23 | } 24 | val crossrefPipe : TypedPipe[(String, ReduceFeatures)] = crossrefScorable 25 | .getInputPipe(args) 26 | .map { r => 27 | crossrefRowCount.inc 28 | r 29 | } 30 | 31 | val joinedPipe = grobidPipe 32 | .addTrap(TypedTsv(args("output") + ".trapped")) 33 | .join(crossrefPipe) 34 | 35 | // TypedTsv doesn't work over case classes. 36 | joinedPipe 37 | .map { case (slug, (grobidFeatures, crossrefFeatures)) => 38 | joinedRowCount.inc 39 | //val (slug : String, (grobidFeatures: ReduceFeatures, crossrefFeatures: ReduceFeatures)) = entry 40 | new ReduceOutput( 41 | slug, 42 | Scorable.computeSimilarity(grobidFeatures, crossrefFeatures), 43 | grobidFeatures.json, 44 | crossrefFeatures.json) 45 | } 46 | .map { entry => (entry.slug, entry.score, entry.json1, entry.json2) } 47 | .write(TypedTsv[(String, Int, String, String)](args("output"))) 48 | } 49 | -------------------------------------------------------------------------------- /notes/tasks/2020-01-06_heuristic_cdx.txt: -------------------------------------------------------------------------------- 1 | 2 | Wanted to include a large number of additional CDX lines based on regex 3 | pattern. These are primarily .edu domains with things that look like user 4 | accounts *and* .pdf file extensions in the path. 
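The exact regex used for that Hadoop job isn't recorded here; the closest
pattern in this repo is the "academic domain + tilde directory" SURT rule in
pig/filter-cdx-paper-pdfs.pig (there combined with HTTP 200 and pdf-mimetype
filters). A rough Python rendering, just to illustrate what the heuristic
matches (example SURTs are made up):

    import re

    # SURT-form URL keys, e.g. "edu,mit,web)/~someuser/pubs/paper.pdf"
    EDU_TILDE = re.compile(r'(edu,|..,edu|..,ac,).*\).*/~.*')

    print(bool(EDU_TILDE.fullmatch("edu,mit,web)/~someuser/pubs/paper.pdf")))  # True
    print(bool(EDU_TILDE.fullmatch("com,example)/about.html")))                # False
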
5 | 6 | ## Commands 7 | 8 | aitio:/fast/gwb_pdfs 9 | 10 | pdfs/gwb-pdf-20191005172329-url-heuristics-edu 11 | pdfs/gwb-pdf-20191005172329-url-heuristics 12 | 13 | 14 | to filter as url/sha1 uniq: 15 | 16 | cat raw.cdx | sort -u -t' ' -k3,6 -S 4G > uniq.cdx 17 | 18 | cat gwb-pdf-20191005172329-url-heuristics-edu/part-r-000* | sort -u -t' ' -k3,6 -S 4G > gwb-pdf-20191005172329-url-heuristics-edu.uniq_url_sha1.cdx 19 | cat gwb-pdf-20191005172329-url-heuristics/part-r-000* | sort -u -t' ' -k3,6 -S 4G > gwb-pdf-20191005172329-url-heuristics.uniq_url_sha1.cdx 20 | 21 | 7241795 gwb-pdf-20191005172329-url-heuristics-edu.uniq_url_sha1.cdx 22 | 41137888 gwb-pdf-20191005172329-url-heuristics.uniq_url_sha1.cdx 23 | 24 | cut -d' ' -f6 gwb-pdf-20191005172329-url-heuristics-edu.uniq_url_sha1.cdx | sort -u -S 4G | wc -l 25 | 7241795 26 | 27 | cut -d' ' -f6 gwb-pdf-20191005172329-url-heuristics.uniq_url_sha1.cdx | sort -u -S 4G | wc -l 28 | 41137888 29 | 30 | ./persist_tool.py cdx /fast/gwb_pdf/gwb-pdf-20191005172329-url-heuristics-edu.uniq_url_sha1.cdx 31 | Worker: Counter({'total': 7239153, 'insert-cdx': 6845283, 'update-cdx': 0}) 32 | CDX lines pushed: Counter({'total': 7241795, 'pushed': 7239153, 'skip-parse': 2603, 'skip-mimetype': 39}) 33 | 34 | ./persist_tool.py cdx /fast/gwb_pdf/gwb-pdf-20191005172329-url-heuristics.uniq_url_sha1.cdx 35 | Worker: Counter({'total': 41030360, 'insert-cdx': 22430064, 'update-cdx': 0}) 36 | CDX lines pushed: Counter({'total': 41137888, 'pushed': 41030360, 'skip-mimetype': 87341, 'skip-parse': 20187}) 37 | 38 | -------------------------------------------------------------------------------- /pig/filter-cdx-paper-pdfs.pig: -------------------------------------------------------------------------------- 1 | 2 | -- Tries to filter down a large CDX file to a subset that is likely to be 3 | -- journal article content, based on SURT regex patterns. 
4 | --- 5 | -- Author: Bryan Newbold 6 | -- Date: May 2018 7 | 8 | 9 | %default INPUT '' 10 | %default OUTPUT '' 11 | 12 | set mapreduce.job.queuename default 13 | 14 | cdx = LOAD '$INPUT' AS cdxline:chararray; 15 | cdx = FILTER cdx BY not STARTSWITH (cdxline, 'filedesc'); 16 | cdx = FILTER cdx BY not STARTSWITH (cdxline, ' '); 17 | 18 | cdx = FOREACH cdx GENERATE STRSPLIT(cdxline,'\\s+') as cols, cdxline; 19 | cdx = FOREACH cdx GENERATE (chararray)cols.$0 as surt, (chararray)cols.$1 as timestamp, (chararray)cols.$3 as mimetype, (chararray)cols.$4 as httpstatus, (chararray)cols.$5 as sha1sum, cdxline; 20 | cdx = FILTER cdx BY not surt matches '-'; 21 | cdx = FILTER cdx BY httpstatus matches '200'; 22 | cdx = FILTER cdx BY mimetype matches '.*pdf.*'; 23 | 24 | -- This is the core regex 25 | cdx = FILTER cdx 26 | -- academic domains; personal (tilde) directories 27 | BY surt matches '(edu,|..,edu|..,ac,).*\\).*\\/~.*' 28 | 29 | -- words in URL 30 | OR surt matches '(?i).+\\).*/(pubs|research|publications?|articles?|proceedings?|papers?|fulltext)/.*' 31 | 32 | -- words in domains 33 | OR surt matches '.*(,hal|,eprint|,ojs|,dspace|scielo|redalyc|revues|revistas|research|journal).*\\).*' 34 | 35 | -- DOI-like pattern in URL 36 | OR surt matches '.*\\).*/10\\.\\d{3,5}/.*'; 37 | 38 | -- DISTINCT by sha1 column 39 | cdx_uniq = FOREACH (GROUP cdx BY sha1sum) { 40 | r = TOP(1, 0, $1); 41 | GENERATE FLATTEN(r); 42 | }; 43 | 44 | cdx_uniq = ORDER cdx_uniq by surt, timestamp PARALLEL 50; 45 | cdx_uniq = FOREACH cdx_uniq GENERATE cdxline; 46 | STORE cdx_uniq INTO '$OUTPUT' USING PigStorage(' '); 47 | 48 | -------------------------------------------------------------------------------- /python/scripts/manifest_converter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Reads a sqlite3 manifest database (IA 2017 style) and outputs a stream of 4 | "match" JSON objects which can be imported into fatcat with matched_import.py 5 | 6 | This was used to convert this manifest: 7 | 8 | https://archive.org/details/ia_papers_manifest_2018-01-25/ 9 | 10 | to JSON format for fast fatcat importing. 11 | """ 12 | 13 | import json 14 | import sqlite3 15 | import sys 16 | 17 | # iterate over rows in files metadata... 18 | # 1. select all identified DOIs 19 | # => filter based on count 20 | # 2. select all file metadata 21 | # 3. 
output object 22 | 23 | 24 | def or_none(s): 25 | if s is None: 26 | return None 27 | elif type(s) == str and ((not s) or s == "\\N" or s == "-"): 28 | return None 29 | return s 30 | 31 | 32 | def process_db(db_path): 33 | 34 | db = sqlite3.connect(db_path) 35 | 36 | for row in db.execute("SELECT sha1, mimetype, size_bytes, md5 FROM files_metadata"): 37 | sha1 = row[0] 38 | dois = db.execute("SELECT doi FROM files_id_doi WHERE sha1=?", [sha1]).fetchall() 39 | dois = [d[0] for d in dois] 40 | if not dois: 41 | continue 42 | urls = db.execute("SELECT url, datetime FROM urls WHERE sha1=?", [sha1]).fetchall() 43 | if not urls: 44 | continue 45 | cdx = [dict(url=row[0], dt=row[1]) for row in urls] 46 | obj = dict( 47 | sha1=sha1, 48 | mimetype=or_none(row[1]), 49 | size=(or_none(row[2]) and int(row[2])), 50 | md5=or_none(row[3]), 51 | dois=dois, 52 | cdx=cdx, 53 | ) 54 | dois = db.execute("SELECT doi FROM files_id_doi WHERE sha1=?", [sha1]) 55 | print(json.dumps(obj)) 56 | 57 | 58 | if __name__ == "__main__": 59 | process_db(sys.argv[1]) 60 | -------------------------------------------------------------------------------- /kafka/howto_rebalance.md: -------------------------------------------------------------------------------- 1 | 2 | ## Rebalance Storage Between Brokers (kafka-manager web) 3 | 4 | For each topic you want to rebalance (eg, the large or high-throughput ones), 5 | go to the topic page and do the blue "reassign partitions" button (or 6 | potentially "generate" or "manual"). 7 | 8 | Monitor progress with the "Reassign Partitions" link at top of the page. 9 | 10 | Finally, run a preferred replica election after partition movement is complete. 11 | 12 | ## Rebalance Storage Between Brokers (CLI) 13 | 14 | For example, after adding or removing brokers from the cluster. 15 | 16 | Create a list of topics to move, and put it in `/tmp/topics_to_move.json`: 17 | 18 | { 19 | "version": 1, 20 | "topics": [ 21 | {"topic": "sandcrawler-shadow.grobid-output"}, 22 | {"topic": "fatcat-prod.api-crossref"} 23 | ] 24 | } 25 | 26 | On a kafka broker, go to `/srv/kafka-broker/kafka-*/bin`, generate a plan, then 27 | inspect the output: 28 | 29 | ./kafka-reassign-partitions.sh --zookeeper localhost:2181 --broker-list "280,281,284,285,263" --topics-to-move-json-file /tmp/topics_to_move.json --generate > /tmp/reassignment-plan.json 30 | cat /tmp/reassignment-plan.json | rg '^\{' | head -n1 | jq . > /tmp/old-plan.json 31 | cat /tmp/reassignment-plan.json | rg '^\{' | tail -n1 | jq . > /tmp/new-plan.json 32 | cat /tmp/reassignment-plan.json | rg '^\{' | jq . 33 | 34 | If that looks good, start the rebalance: 35 | 36 | ./kafka-reassign-partitions.sh --zookeeper localhost:2181 --reassignment-json-file /tmp/new-plan.json --execute 37 | 38 | Then monitor progress: 39 | 40 | ./kafka-reassign-partitions.sh --zookeeper localhost:2181 --reassignment-json-file /tmp/new-plan.json --verify 41 | 42 | Finally, run a preferred replica election after partition movement is complete. 43 | Currently do this through the web interface (linked above). 
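
As an alternative to the web interface, older ZooKeeper-based Kafka releases
(matching the `--zookeeper` usage above) ship a CLI tool for this in the same
`bin/` directory; a sketch, assuming `kafka-preferred-replica-election.sh` is
present:

    ./kafka-preferred-replica-election.sh --zookeeper localhost:2181

Newer Kafka releases replace this tool with `kafka-leader-election.sh`.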
44 | -------------------------------------------------------------------------------- /scalding/src/main/scala/sandcrawler/DumpGrobidMetaInsertableJob.scala: -------------------------------------------------------------------------------- 1 | package sandcrawler 2 | 3 | import java.util.Properties 4 | 5 | import cascading.property.AppProps 6 | import cascading.tuple.Fields 7 | import com.twitter.scalding._ 8 | import com.twitter.scalding.typed.TDsl._ 9 | import org.apache.hadoop.hbase.io.ImmutableBytesWritable 10 | import org.apache.hadoop.hbase.util.Bytes 11 | import parallelai.spyglass.base.JobBase 12 | import parallelai.spyglass.hbase.HBaseConstants.SourceMode 13 | import parallelai.spyglass.hbase.HBasePipeConversions 14 | import parallelai.spyglass.hbase.HBaseSource 15 | 16 | // Dumps the SHA1 key and grobid0:metadata columns, plus file metadata needed 17 | // to insert into fatcat. Used, eg, as part of long-tail mellon pipeline. 18 | class DumpGrobidMetaInsertableJob(args: Args) extends JobBase(args) with HBasePipeConversions { 19 | 20 | val metaPipe : TypedPipe[(String, String, String, Long, String)] = HBaseBuilder.build(args("hbase-table"), 21 | args("zookeeper-hosts"), 22 | List("file:cdx", "file:mime", "file:size", "grobid0:metadata"), 23 | SourceMode.SCAN_ALL) 24 | .read 25 | .toTypedPipe[(ImmutableBytesWritable,ImmutableBytesWritable,ImmutableBytesWritable,ImmutableBytesWritable,ImmutableBytesWritable)](new Fields("key", "cdx", "mime", "size", "metadata")) 26 | .filter { case (_, cdx, mime, size, metadata) => cdx != null && mime != null && size != null && metadata != null } 27 | .map { case (key, cdx, mime, size, metadata) => 28 | (Bytes.toString(key.copyBytes()), 29 | Bytes.toString(cdx.copyBytes()), 30 | Bytes.toString(mime.copyBytes()), 31 | Bytes.toLong(size.copyBytes()), 32 | Bytes.toString(metadata.copyBytes()) 33 | ) 34 | }; 35 | 36 | metaPipe.write(TypedTsv[(String,String,String,Long,String)](args("output"))) 37 | 38 | } 39 | -------------------------------------------------------------------------------- /python/scripts/grobid_affiliations.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Takes old (HBase) or new (pg) style JSON wrappers of GROBID XML extraction 4 | output, converts the XML to JSON, filters out raw affiliation strings, and 5 | dumps these as JSON subset. 
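Output is TSV: one line per input document that has any affiliations, with the
SHA-1 hex digest in the first column and a JSON list of the unique raw
affiliation objects in the second (field names depend on what GROBID
extracted), e.g. (schematic):

    <sha1hex>\t[{"institution": "...", "address": {...}}, ...]
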
6 | 7 | Run in bulk like: 8 | 9 | ls /bigger/unpaywall-transfer/2019-07-17-1741.30-dumpgrobidxml/part*gz | parallel --progress -j8 'zcat {} | ./grobid_affiliations.py > {}.affiliations' 10 | """ 11 | 12 | import json 13 | import sys 14 | 15 | from grobid_tei_xml import parse_document_xml 16 | 17 | 18 | def parse_hbase(line): 19 | line = line.split("\t") 20 | assert len(line) == 2 21 | sha1hex = line[0] 22 | obj = json.loads(line[1]) 23 | tei_xml = obj["tei_xml"] 24 | return sha1hex, tei_xml 25 | 26 | 27 | def parse_pg(line): 28 | obj = json.loads(line) 29 | return obj["sha1hex"], obj["tei_xml"] 30 | 31 | 32 | def run(mode="hbase"): 33 | for line in sys.stdin: 34 | if mode == "hbase": 35 | sha1hex, tei_xml = parse_hbase(line) 36 | elif mode == "pg": 37 | sha1hex, tei_xml = parse_pg(line) 38 | else: 39 | raise NotImplementedError("parse mode: {}".format(mode)) 40 | 41 | tei_doc = parse_document_xml(tei_xml) 42 | tei_doc.remove_encumbered() 43 | obj = tei_doc.to_legacy_dict() 44 | 45 | affiliations = [] 46 | for author in obj["authors"]: 47 | if author.get("affiliation"): 48 | affiliations.append(author["affiliation"]) 49 | if affiliations: 50 | # don't duplicate affiliations; only the unique ones 51 | affiliations = list(set([json.dumps(a) for a in affiliations])) 52 | affiliations = [json.loads(a) for a in affiliations] 53 | print("\t".join([sha1hex, json.dumps(affiliations)])) 54 | 55 | 56 | if __name__ == "__main__": 57 | run() 58 | -------------------------------------------------------------------------------- /sql/dump_reingest_spn.sql: -------------------------------------------------------------------------------- 1 | 2 | BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; 3 | 4 | COPY ( 5 | SELECT row_to_json(ingest_request.*) FROM ingest_request 6 | LEFT JOIN ingest_file_result ON 7 | ingest_file_result.base_url = ingest_request.base_url 8 | AND ingest_file_result.ingest_type = ingest_request.ingest_type 9 | WHERE 10 | (ingest_request.ingest_type = 'pdf' 11 | OR ingest_request.ingest_type = 'html' 12 | OR ingest_request.ingest_type = 'xml' 13 | OR ingest_request.ingest_type = 'component') 14 | AND ingest_file_result.hit = false 15 | AND ingest_request.created < NOW() - '6 hour'::INTERVAL 16 | AND ingest_request.created > NOW() - '180 day'::INTERVAL 17 | AND ingest_request.ingest_request_source = 'savepapernow-web' 18 | AND ( 19 | ingest_file_result.status like 'spn2-%' 20 | -- OR ingest_file_result.status = 'cdx-error' 21 | -- OR ingest_file_result.status = 'wayback-error' 22 | -- OR ingest_file_result.status = 'wayback-content-error' 23 | OR ingest_file_result.status = 'petabox-error' 24 | -- OR ingest_file_result.status = 'gateway-timeout' 25 | OR ingest_file_result.status = 'no-capture' 26 | ) 27 | AND ingest_file_result.status != 'spn2-error:invalid-url-syntax' 28 | AND ingest_file_result.status != 'spn2-error:filesize-limit' 29 | AND ingest_file_result.status != 'spn2-error:not-found' 30 | AND ingest_file_result.status != 'spn2-error:blocked-url' 31 | AND ingest_file_result.status != 'spn2-error:too-many-redirects' 32 | AND ingest_file_result.status != 'spn2-error:network-authentication-required' 33 | AND ingest_file_result.status != 'spn2-error:unknown' 34 | ) TO '/srv/sandcrawler/tasks/reingest_spn.rows.json'; 35 | 36 | ROLLBACK; 37 | -------------------------------------------------------------------------------- /extra/nginx/fatcat-blobs: -------------------------------------------------------------------------------- 1 | 2 | server { 3 | listen 80; 4 | 
listen [::]:80; 5 | listen 443 ssl http2; 6 | listen [::]:443 ssl http2; 7 | server_name blobs.fatcat.wiki; 8 | 9 | ssl_certificate /etc/letsencrypt/live/fatcat.wiki/fullchain.pem; 10 | ssl_certificate_key /etc/letsencrypt/live/fatcat.wiki/privkey.pem; 11 | 12 | #add_header Content-Security-Policy "default-src 'self' 'unsafe-inline' 'unsafe-eval'; style-src 'self' 'unsafe-inline'"; 13 | add_header X-Frame-Options "SAMEORIGIN"; # 'always' if nginx > 1.7.5 14 | add_header X-Content-Type-Options "nosniff"; # 'always' if nginx > 1.7.5 15 | add_header X-Xss-Protection "1"; 16 | # Enable STS with one year period (breaks http; optional) 17 | #add_header Strict-Transport-Security "max-age=31557600; includeSubDomains"; 18 | 19 | error_log /var/log/nginx/fatcat-errors.log; 20 | access_log /dev/null; 21 | 22 | if ($scheme = http) { 23 | return 301 https://$server_name$request_uri; 24 | } 25 | 26 | location /unpaywall/ { 27 | if ($request_method !~ "GET") { 28 | return 403; 29 | break; 30 | } 31 | 32 | #proxy_pass http://sandcrawler-minio.fatcat.wiki:9000$uri$is_args$args; 33 | proxy_pass http://207.241.227.141:9000$uri$is_args$args; 34 | proxy_redirect off; 35 | 36 | proxy_set_header X-Real-IP $remote_addr; 37 | proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; 38 | proxy_set_header Host $http_host; 39 | } 40 | 41 | location / { 42 | default_type text/plain; 43 | return 504 'blobs.fatcat.wiki hosts many files; full URLs are required!\nyou probably want https://fatcat.wiki/ instead'; 44 | } 45 | 46 | # Let's Encrypt SSL Certs 47 | location /.well-known/acme-challenge/ { 48 | root /var/www/letsencrypt; 49 | autoindex off; 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /notes/ingest/2020-03_s2.md: -------------------------------------------------------------------------------- 1 | 2 | Crawled some 6 million new PDFs from pdfs.semanticscholar.org. Should get these 3 | ingested, as well as any previous existing content. 4 | 5 | Also, there are a bunch of PDF outlinks to the web; should do S2-specific 6 | matching and ingest of those. 7 | 8 | There are a few categories of paper from pdfs.s.o: 9 | 10 | 1. we had previous GWB crawl, didn't re-crawl 11 | 2. we had PDF from elsewhere on the web, didn't re-crawl 12 | 3. crawled successfully 13 | 4. crawl failed 14 | 15 | In this ingest, want to get all of categories 1 and 3. Could try to do this by 16 | dumping sandcrawler CDX table matching pdfs.s.o (which includes recent crawl), 17 | and join that against the ingest request list. 18 | 19 | For other random web URLs, can do the usual persist/backfill/recrawl pipeline. 
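The sandcrawler-db side of that CDX dump might look roughly like the sketch
below (column names as used by other queries in this repo; the URL pattern and
output path are illustrative):

    COPY (
        SELECT cdx.url, cdx.datetime, cdx.sha1hex
        FROM cdx
        WHERE cdx.url LIKE '%//pdfs.semanticscholar.org/%'
    )
    TO '/srv/sandcrawler/tasks/s2_pdfs_cdx.tsv'
    WITH NULL '';
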
20 | 21 | ## Create Seedlist 22 | 23 | zcat s2-corpus-pdfUrls.json.gz | parallel -j5 --linebuffer --round-robin --pipe ./s2_ingestrequest.py - | pv -l | gzip > s2-corpus-pdfUrls.2019.ingest_request.json.gz 24 | zcat s2-corpus-s2PdfUrl.json.gz | parallel -j5 --linebuffer --round-robin --pipe ./s2_ingestrequest.py - | pv -l | gzip > s2-corpus-s2PdfUrl.2019.ingest_request.json.gz 25 | 26 | zcat s2-corpus-s2PdfUrl.json.gz | jq .id -r | sort -u -S 2G > s2-corpus-s2PdfUrl.id_list 27 | zcat s2-corpus-pdfUrls.json.gz | jq .id -r | sort -u -S 2G > s2-corpus-pdfUrls.id_list 28 | 29 | zcat s2-corpus-pdfUrls.2019.ingest_request.json.gz s2-corpus-s2PdfUrl.2019.ingest_request.json.gz | rg pdfs.semanticscholar.org | sort -u -S 3G | gzip > s2_hosted_ingestrequest.json.gz 30 | zcat s2-corpus-pdfUrls.2019.ingest_request.json.gz s2-corpus-s2PdfUrl.2019.ingest_request.json.gz | rg -v pdfs.semanticscholar.org | sort -u -S 3G | gzip > s2_external_ingestrequest.json.gz 31 | 32 | zcat s2_external_ingestrequest.json.gz | wc -l 33 | 41201427 34 | zcat s2_hosted_ingestrequest.json.gz | wc -l 35 | 23345761 36 | -------------------------------------------------------------------------------- /scalding/README.md: -------------------------------------------------------------------------------- 1 | This directory contains Hadoop map/reduce jobs written in Scala (compiled to 2 | the JVM) using the Scalding framework. Scalding builds on the Java Cascading 3 | library, which itself builds on the Java Hadoop libraries. 4 | 5 | See the other markdown files in this directory for more background and tips. 6 | 7 | ## Dependencies 8 | 9 | To develop locally, you need to have the JVM (eg, OpenJDK 1.8), `sbt` build 10 | tool, and might need (exactly) Scala version 2.11.8. 11 | 12 | On a debian/ubuntu machine: 13 | 14 | echo "deb https://dl.bintray.com/sbt/debian /" | sudo tee -a /etc/apt/sources.list.d/sbt.list 15 | sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 2EE0EA64E40A89B84B2DF73499E82A75642AC823 16 | sudo apt-get update 17 | sudo apt install scala sbt 18 | 19 | It's also helpful to have a local copy of the `hadoop` binary for running 20 | benchmarks. The `fetch_hadoop.sh` script in the top level directory will fetch 21 | an appropriate version. 22 | 23 | ## Building and Running 24 | 25 | You can run `sbt` commands individually: 26 | 27 | # run all test 28 | sbt test 29 | 30 | # build a jar (also runs tests) 31 | sbt assembly 32 | 33 | Or you can start a session and run commands within that, which is *much* 34 | faster: 35 | 36 | sbt -mem 2048 37 | 38 | sbt> test 39 | sbt> assembly 40 | sbt> testOnly sandcrawler.SomeTestClassName 41 | 42 | On the cluster, you usually use the `please` script to kick off jobs. Be sure 43 | to build the jars first, or pass `--rebuild` to do it automatically. You need 44 | `hadoop` on your path for this. 45 | 46 | ## Troubleshooting 47 | 48 | If your `sbt` task fails with this error: 49 | 50 | java.util.concurrent.ExecutionException: java.lang.OutOfMemoryError: Metaspace 51 | 52 | try restarting `sbt` with more memory (e.g., `sbt -mem 2048`). 53 | 54 | See `scalding-debugging.md` or maybe `../notes/` for more. 
55 | -------------------------------------------------------------------------------- /sql/stats/2021-04-08_table_sizes.txt: -------------------------------------------------------------------------------- 1 | 2 | ## SQL Table Sizes 3 | 4 | Size: 467.23G 5 | 6 | SELECT 7 | table_name, 8 | pg_size_pretty(table_size) AS table_size, 9 | pg_size_pretty(indexes_size) AS indexes_size, 10 | pg_size_pretty(total_size) AS total_size 11 | FROM ( 12 | SELECT 13 | table_name, 14 | pg_table_size(table_name) AS table_size, 15 | pg_indexes_size(table_name) AS indexes_size, 16 | pg_total_relation_size(table_name) AS total_size 17 | FROM ( 18 | SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name 19 | FROM information_schema.tables 20 | WHERE table_schema = 'public' 21 | ) AS all_tables 22 | ORDER BY total_size DESC 23 | ) AS pretty_sizes; 24 | 25 | table_name | table_size | indexes_size | total_size 26 | -------------------------------+------------+--------------+------------ 27 | "public"."cdx" | 49 GB | 26 GB | 76 GB 28 | "public"."grobid" | 69 GB | 6834 MB | 75 GB 29 | "public"."grobid_shadow" | 67 GB | 5455 MB | 73 GB 30 | "public"."ingest_request" | 39 GB | 32 GB | 70 GB 31 | "public"."ingest_file_result" | 32 GB | 29 GB | 60 GB 32 | "public"."file_meta" | 32 GB | 21 GB | 53 GB 33 | "public"."pdf_meta" | 18 GB | 3733 MB | 22 GB 34 | "public"."fatcat_file" | 12 GB | 6602 MB | 18 GB 35 | "public"."shadow" | 9517 MB | 8026 MB | 17 GB 36 | "public"."html_meta" | 1196 MB | 8072 kB | 1204 MB 37 | "public"."petabox" | 403 MB | 461 MB | 864 MB 38 | "public"."pdftrio" | 550 MB | 297 MB | 847 MB 39 | (12 rows) 40 | 41 | -------------------------------------------------------------------------------- /notes/tasks/2021-09-09_pdf_url_lists.md: -------------------------------------------------------------------------------- 1 | 2 | Want to dump a URL list to share with partners, filtered to content we think is 3 | likely to be scholarly. 4 | 5 | Columns to include: 6 | 7 | - original URL 8 | - capture timestamp 9 | - SHA1 10 | 11 | ## Stats Overview 12 | 13 | file_meta table, mimetype=application/pdf: 173,816,433 14 | 15 | cdx table, mimetype=application/pdf: 131,346,703 16 | 17 | ingest_file_result table, pdf, success: 66,487,928 18 | 19 | ## Ingested PDF URLs 20 | 21 | "Ingested" URLs: ingest_file_result table, pdf and hit=true; include base URL also? 
22 | 23 | COPY ( 24 | SELECT 25 | base_url as start_url, 26 | terminal_url as pdf_url, 27 | terminal_dt as pdf_url_timestamp, 28 | terminal_sha1hex as pdf_sha1hex 29 | FROM ingest_file_result 30 | WHERE 31 | ingest_type = 'pdf' 32 | AND status = 'success' 33 | ) 34 | TO '/srv/sandcrawler/tasks/wayback_pdf_targeted.2021-09-09.tsv' 35 | WITH NULL ''; 36 | => 77,892,849 37 | 38 | ## CDX PDFs 39 | 40 | "All web PDFs": CDX query; left join file_meta, but don't require 41 | 42 | COPY ( 43 | SELECT 44 | cdx.url as pdf_url, 45 | cdx.datetime as pdf_url_timestamp, 46 | cdx.sha1hex as pdf_sha1hex 47 | FROM cdx 48 | LEFT JOIN file_meta 49 | ON 50 | cdx.sha1hex = file_meta.sha1hex 51 | WHERE 52 | file_meta.mimetype = 'application/pdf' 53 | OR ( 54 | file_meta.mimetype IS NULL 55 | AND cdx.mimetype = 'application/pdf' 56 | ) 57 | ) 58 | TO '/srv/sandcrawler/tasks/wayback_pdf_speculative.2021-09-09.tsv' 59 | WITH NULL ''; 60 | => 147,837,935 61 | 62 | ## Processed web PDFs 63 | 64 | "Parsed web PDFs": `file_meta`, left join CDX 65 | 66 | (didn't do this one) 67 | 68 | --- 69 | 70 | Uploaded all these to 71 | -------------------------------------------------------------------------------- /sql/dump_reingest_old.sql: -------------------------------------------------------------------------------- 1 | 2 | BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; 3 | 4 | COPY ( 5 | SELECT row_to_json(ingest_request.*) FROM ingest_request 6 | LEFT JOIN ingest_file_result ON 7 | ingest_file_result.base_url = ingest_request.base_url 8 | AND ingest_file_result.ingest_type = ingest_request.ingest_type 9 | WHERE 10 | ingest_file_result.hit = false 11 | AND ingest_request.created < NOW() - '6 day'::INTERVAL 12 | -- AND ingest_request.created > NOW() - '181 day'::INTERVAL 13 | AND (ingest_request.ingest_request_source = 'fatcat-changelog' 14 | OR ingest_request.ingest_request_source = 'fatcat-ingest' 15 | OR ingest_request.ingest_request_source = 'fatcat-ingest-container' 16 | OR ingest_request.ingest_request_source = 'unpaywall' 17 | OR ingest_request.ingest_request_source = 'arxiv' 18 | OR ingest_request.ingest_request_source = 'pmc' 19 | OR ingest_request.ingest_request_source = 'doaj' 20 | OR ingest_request.ingest_request_source = 'dblp') 21 | AND ( 22 | ingest_file_result.status like 'spn2-%' 23 | -- OR ingest_file_result.status like 'no-capture' 24 | -- OR ingest_file_result.status like 'cdx-error' 25 | -- OR ingest_file_result.status like 'petabox-error' 26 | ) 27 | AND ingest_file_result.status != 'spn2-error:invalid-url-syntax' 28 | AND ingest_file_result.status != 'spn2-error:filesize-limit' 29 | AND ingest_file_result.status != 'spn2-error:not-found' 30 | AND ingest_file_result.status != 'spn2-error:blocked-url' 31 | AND ingest_file_result.status != 'spn2-error:too-many-redirects' 32 | AND ingest_file_result.status != 'spn2-error:network-authentication-required' 33 | AND ingest_file_result.status != 'spn2-error:unknown' 34 | ) TO '/srv/sandcrawler/tasks/reingest_old_current.rows.json'; 35 | 36 | ROLLBACK; 37 | -------------------------------------------------------------------------------- /sql/backfill/backfill_grobid_unpaywall.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | This is a "one-time" transform helper script for GROBID backfill into 4 | sandcrawler minio and postgresql.
5 | 6 | This variant of backfill_grobid.py pushes into the unpaywall bucket of 7 | sandcrawler-minio and doesn't push anything to sandcrawler table in general. 8 | """ 9 | 10 | import json, os, sys, collections, io 11 | import base64 12 | import requests 13 | from minio import Minio 14 | import psycopg2 15 | import psycopg2.extras 16 | 17 | 18 | def b32_hex(s): 19 | s = s.strip().split()[0].lower() 20 | if s.startswith("sha1:"): 21 | s = s[5:] 22 | if len(s) != 32: 23 | return s 24 | return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8') 25 | 26 | def stdin_to_minio(): 27 | mc = Minio('localhost:9000', 28 | access_key=os.environ['MINIO_ACCESS_KEY'], 29 | secret_key=os.environ['MINIO_SECRET_KEY'], 30 | secure=False) 31 | counts = collections.Counter({'total': 0}) 32 | for l in sys.stdin: 33 | if counts['raw_lines'] > 0 and counts['raw_lines'] % 10000 == 0: 34 | print("Progress: {}...".format(counts)) 35 | counts['raw_lines'] += 1 36 | l = l.strip() 37 | if not l: 38 | continue 39 | row = json.loads(l) 40 | if not row: 41 | continue 42 | sha1hex = b32_hex(row['pdf_hash']) 43 | grobid_xml = row['tei_xml'].encode('utf-8') 44 | grobid_xml_len = len(grobid_xml) 45 | grobid_xml = io.BytesIO(grobid_xml) 46 | 47 | key = "grobid/{}/{}/{}.tei.xml".format( 48 | sha1hex[0:2], 49 | sha1hex[2:4], 50 | sha1hex) 51 | mc.put_object("unpaywall", key, grobid_xml, grobid_xml_len, 52 | content_type="application/tei+xml", 53 | metadata=None) 54 | counts['minio-success'] += 1 55 | 56 | print("Done: {}".format(counts)) 57 | 58 | if __name__=='__main__': 59 | stdin_to_minio() 60 | -------------------------------------------------------------------------------- /scalding/src/main/scala/sandcrawler/GroupFatcatWorksSubsetJob.scala: -------------------------------------------------------------------------------- 1 | package sandcrawler 2 | 3 | import cascading.pipe.Pipe 4 | import com.twitter.scalding.Args 5 | import com.twitter.scalding.Stat 6 | import com.twitter.scalding.TypedPipe 7 | import com.twitter.scalding.TypedTsv 8 | import parallelai.spyglass.base.JobBase 9 | 10 | class GroupFatcatWorksSubsetJob(args: Args) extends JobBase(args) { 11 | 12 | val fatcatLhsRowCount = Stat("fatcat-rows-filtered-left", "sandcrawler") 13 | val fatcatRhsRowCount = Stat("fatcat-rows-filtered-right", "sandcrawler") 14 | val joinedRowCount = Stat("joined-rows", "sandcrawler") 15 | 16 | val fatcatScorableLhs : Scorable = new FatcatScorable() 17 | val fatcatPipeLhs : TypedPipe[(String, ReduceFeatures)] = fatcatScorableLhs 18 | .getInputPipe(args) 19 | .map { r => 20 | fatcatLhsRowCount.inc 21 | r 22 | } 23 | 24 | val fatcatScorableRhs : Scorable = new FatcatScorableRight() 25 | val fatcatPipeRhs : TypedPipe[(String, ReduceFeatures)] = fatcatScorableRhs 26 | .getInputPipe(args) 27 | .map { r => 28 | fatcatRhsRowCount.inc 29 | r 30 | } 31 | 32 | val joinedPipe = fatcatPipeLhs 33 | .addTrap(TypedTsv(args("output") + ".trapped")) 34 | .join(fatcatPipeRhs) 35 | 36 | // TypedTsv doesn't work over case classes. 
37 | joinedPipe 38 | // filter out trivial self-matches (releases are identical) 39 | .filter { case (slug, (fatcatFeaturesLeft, fatcatFeaturesRight)) => 40 | Scorable.selfMatchable(fatcatFeaturesLeft, fatcatFeaturesRight) 41 | } 42 | .map { case (slug, (fatcatFeaturesLeft, fatcatFeaturesRight)) => 43 | joinedRowCount.inc 44 | new ReduceOutput( 45 | slug, 46 | Scorable.computeSimilarity(fatcatFeaturesLeft, fatcatFeaturesRight), 47 | fatcatFeaturesLeft.json, 48 | fatcatFeaturesRight.json) 49 | } 50 | .map { entry => (entry.slug, entry.score, entry.json1, entry.json2) } 51 | .write(TypedTsv[(String, Int, String, String)](args("output"))) 52 | } 53 | -------------------------------------------------------------------------------- /scalding/src/test/scala/example/SimpleHBaseSourceExampleTest.scala: -------------------------------------------------------------------------------- 1 | package example 2 | 3 | import org.junit.runner.RunWith 4 | import com.twitter.scalding.{JobTest, TupleConversions} 5 | import org.scalatest.FunSpec 6 | import org.scalatest.junit.JUnitRunner 7 | import org.slf4j.LoggerFactory 8 | import org.apache.hadoop.hbase.io.ImmutableBytesWritable 9 | import cascading.tuple.{Tuple, Fields} 10 | import org.apache.hadoop.hbase.util.Bytes 11 | import scala._ 12 | import com.twitter.scalding.Tsv 13 | import parallelai.spyglass.hbase.HBaseSource 14 | import parallelai.spyglass.hbase.HBaseConstants.SourceMode 15 | 16 | /** 17 | * Example of how to define tests for HBaseSource 18 | */ 19 | @RunWith(classOf[JUnitRunner]) 20 | class SimpleHBaseSourceExampleTest extends FunSpec with TupleConversions { 21 | 22 | val output = "/tmp/testOutput" 23 | 24 | val log = LoggerFactory.getLogger(this.getClass.getName) 25 | 26 | val sampleData = List( 27 | List("1", "kk1", "pp1"), 28 | List("2", "kk2", "pp2"), 29 | List("3", "kk3", "pp3") 30 | ) 31 | 32 | JobTest("example.SimpleHBaseSourceExample") 33 | .arg("test", "") 34 | .arg("app.conf.path", "app.conf") 35 | .arg("output", output) 36 | .arg("debug", "true") 37 | .source[Tuple]( 38 | new HBaseSource( 39 | "table_name", 40 | "mtrcs-zk1.us.archive.org:2181", 41 | new Fields("key"), 42 | List("column_family"), 43 | List(new Fields("column_name1", "column_name2")), 44 | sourceMode = SourceMode.GET_LIST, keyList = List("1", "2", "3")), 45 | sampleData.map(l => new Tuple(l.map(s => {new ImmutableBytesWritable(Bytes.toBytes(s))}):_*))) 46 | .sink[Tuple](Tsv(output format "get_list")) { 47 | outputBuffer => 48 | log.debug("Output => " + outputBuffer) 49 | 50 | it("should return the test data provided.") { 51 | println("outputBuffer.size => " + outputBuffer.size) 52 | assert(outputBuffer.size === 3) 53 | } 54 | } 55 | .run 56 | .finish 57 | 58 | } 59 | -------------------------------------------------------------------------------- /extra/nginx/sandcrawler-minio: -------------------------------------------------------------------------------- 1 | 2 | server { 3 | listen 80; 4 | listen [::]:80; 5 | listen 443 ssl http2; 6 | listen [::]:443 ssl http2; 7 | server_name sandcrawler-minio.fatcat.wiki minio.sandcrawler.org; 8 | 9 | ssl_certificate /etc/letsencrypt/live/sandcrawler-minio.fatcat.wiki/fullchain.pem; 10 | ssl_certificate_key /etc/letsencrypt/live/sandcrawler-minio.fatcat.wiki/privkey.pem; 11 | 12 | #add_header Content-Security-Policy "default-src 'self' 'unsafe-inline' 'unsafe-eval'; style-src 'self' 'unsafe-inline'"; 13 | add_header X-Frame-Options "SAMEORIGIN"; # 'always' if nginx > 1.7.5 14 | add_header X-Content-Type-Options "nosniff"; # 
'always' if nginx > 1.7.5 15 | add_header X-Xss-Protection "1"; 16 | # Enable STS with one year period (breaks http; optional) 17 | #add_header Strict-Transport-Security "max-age=31557600; includeSubDomains"; 18 | 19 | error_log /var/log/nginx/sandcrawler-errors.log; 20 | access_log /dev/null; 21 | 22 | if ($scheme = http) { 23 | return 301 https://$server_name$request_uri; 24 | } 25 | 26 | location /minio/ { 27 | 28 | # allows all HTTP verbs 29 | 30 | proxy_pass http://localhost:9000; 31 | proxy_redirect off; 32 | 33 | proxy_set_header X-Real-IP $remote_addr; 34 | proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; 35 | proxy_set_header Host $http_host; 36 | } 37 | 38 | location / { 39 | if ($request_method !~ "GET") { 40 | return 403; 41 | break; 42 | } 43 | 44 | proxy_pass http://localhost:9000; 45 | proxy_redirect off; 46 | 47 | proxy_set_header X-Real-IP $remote_addr; 48 | proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; 49 | proxy_set_header Host $http_host; 50 | } 51 | 52 | # Let's Encrypt SSL Certs 53 | location /.well-known/acme-challenge/ { 54 | root /var/www/letsencrypt; 55 | autoindex off; 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /notes/ingest/2020-02-18_ingest_backfills.md: -------------------------------------------------------------------------------- 1 | 2 | Select: 3 | 4 | COPY ( 5 | SELECT row_to_json(ingest_request.*) FROM ingest_request 6 | LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url 7 | WHERE ingest_request.ingest_type = 'pdf' 8 | AND ingest_file_result.ingest_type = 'pdf' 9 | AND ingest_file_result.updated < NOW() - '2 day'::INTERVAL 10 | AND ingest_file_result.hit = false 11 | AND ingest_file_result.status like 'spn2-error%' 12 | ) TO '/grande/snapshots/reingest_spn2err_20200218.rows.json'; 13 | => COPY 6537 14 | 15 | COPY ( 16 | SELECT row_to_json(ingest_request.*) FROM ingest_request 17 | LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url 18 | WHERE ingest_request.ingest_type = 'pdf' 19 | AND ingest_file_result.ingest_type = 'pdf' 20 | AND ingest_file_result.hit = false 21 | AND ingest_file_result.status like 'wayback-error' 22 | ) TO '/grande/snapshots/reingest_waybackerr_20200218.rows.json'; 23 | => COPY 33022 24 | 25 | Transform: 26 | 27 | ./scripts/ingestrequest_row2json.py reingest_spn2err_20200218.rows.json > reingest_spn2err_20200218.json 28 | ./scripts/ingestrequest_row2json.py reingest_waybackerr_20200218.rows.json > reingest_waybackerr_20200218.json 29 | 30 | Push to kafka: 31 | 32 | cat reingest_spn2err_20200218.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1 33 | cat reingest_waybackerr_20200218.json | jq . 
-c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1 34 | 35 | Many had null `ingest_request_source`, so won't actually import into fatcat: 36 | 37 | bnewbold@ia601101$ cat reingest_waybackerr_20200218.json | jq .ingest_request_source | sort | uniq -c | sort -n 38 | 1 "savepapernow-web" 39 | 112 "fatcat-ingest-container" 40 | 11750 "fatcat-changelog" 41 | 21159 null 42 | 43 | -------------------------------------------------------------------------------- /proposals/schema_changes.sql: -------------------------------------------------------------------------------- 1 | 2 | -- file_meta: more NOT NULL 3 | CREATE TABLE IF NOT EXISTS file_meta ( 4 | sha1hex TEXT NOT NULL PRIMARY KEY CHECK (octet_length(sha1hex) = 40), 5 | sha256hex TEXT NOT NULL CHECK (octet_length(sha256hex) = 64), 6 | md5hex TEXT NOT NULL CHECK (octet_length(md5hex) = 32), 7 | size_bytes BIGINT NOT NULL, 8 | mimetype TEXT CHECK (octet_length(mimetype) >= 1) 9 | ); 10 | 11 | -- CDX: add domain/host columns? 12 | CREATE TABLE IF NOT EXISTS cdx ( 13 | url TEXT NOT NULL CHECK (octet_length(url) >= 1), 14 | datetime TEXT NOT NULL CHECK (octet_length(datetime) = 14), 15 | sha1hex TEXT NOT NULL CHECK (octet_length(sha1hex) = 40), 16 | cdx_sha1hex TEXT CHECK (octet_length(cdx_sha1hex) = 40), 17 | mimetype TEXT CHECK (octet_length(mimetype) >= 1), 18 | warc_path TEXT CHECK (octet_length(warc_path) >= 1), 19 | warc_csize BIGINT, 20 | warc_offset BIGINT, 21 | row_created TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL, 22 | domain TEXT NOT NULL CHECK (octet_length(domain) >= 1), 23 | host TEXT NOT NULL CHECK (octet_length(host) >= 1), 24 | PRIMARY KEY(url, datetime) 25 | ); 26 | CREATE INDEX IF NOT EXISTS cdx_sha1hex_idx ON cdx(sha1hex); 27 | CREATE INDEX IF NOT EXISTS cdx_row_created_idx ON cdx(row_created); 28 | 29 | -- direct fast import with just md5hex; big UPDATE via join with file_meta 30 | CREATE TABLE IF NOT EXISTS shadow ( 31 | shadow_corpus TEXT NOT NULL CHECK (octet_length(shadow_corpus) >= 1), 32 | shadow_id TEXT NOT NULL CHECK (octet_length(shadow_id) >= 1), 33 | sha1hex TEXT CHECK (octet_length(sha1hex) = 40), 34 | md5hex TEXT CHECK (octet_length(md5hex) = 32), 35 | doi TEXT CHECK (octet_length(doi) >= 1), 36 | pmid TEXT CHECK (octet_length(pmid) >= 1), 37 | isbn13 TEXT CHECK (octet_length(isbn13) >= 1), 38 | PRIMARY KEY(shadow_corpus, shadow_id) 39 | ); 40 | CREATE INDEX shadow_sha1hex_idx ON shadow(sha1hex); 41 | -------------------------------------------------------------------------------- /notes/ingest/2022-07-19_dblp.md: -------------------------------------------------------------------------------- 1 | 2 | Cross-posting from fatcat bulk metadata update/ingest. 3 | 4 | zcat dblp_sandcrawler_ingest_requests.json.gz | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 5 | # 631k 0:00:11 [54.0k/s] 6 | 7 | 8 | ## Post-Crawl Stats 9 | 10 | This is after bulk ingest, crawl, and a bit of "live" re-ingest. 
Query run 11 | 2022-09-06: 12 | 13 | 14 | SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*) 15 | FROM ingest_request 16 | LEFT JOIN ingest_file_result 17 | ON ingest_file_result.ingest_type = ingest_request.ingest_type 18 | AND ingest_file_result.base_url = ingest_request.base_url 19 | WHERE 20 | ingest_request.link_source = 'dblp' 21 | GROUP BY ingest_request.ingest_type, status 22 | -- ORDER BY ingest_request.ingest_type, COUNT DESC 23 | ORDER BY COUNT DESC 24 | LIMIT 30; 25 | 26 | 27 | ingest_type | status | count 28 | -------------+-----------------------+-------- 29 | pdf | success | 305142 30 | pdf | no-pdf-link | 192683 31 | pdf | no-capture | 42634 32 | pdf | terminal-bad-status | 38041 33 | pdf | skip-url-blocklist | 31055 34 | pdf | link-loop | 9263 35 | pdf | wrong-mimetype | 4545 36 | pdf | redirect-loop | 3952 37 | pdf | empty-blob | 2705 38 | pdf | wayback-content-error | 834 39 | pdf | wayback-error | 294 40 | pdf | petabox-error | 202 41 | pdf | blocked-cookie | 155 42 | pdf | cdx-error | 115 43 | pdf | body-too-large | 66 44 | pdf | bad-redirect | 19 45 | pdf | timeout | 7 46 | pdf | bad-gzip-encoding | 4 47 | (18 rows) 48 | 49 | That is quite a lot of `no-pdf-link`, might be worth doing a random sample 50 | and/or re-ingest. And a chunk of `no-capture` to retry. 51 | -------------------------------------------------------------------------------- /notes/petabox_ia_metadata.txt: -------------------------------------------------------------------------------- 1 | 2 | Ran in aitio:/schnell/iamine-journals in December 2018. 3 | 4 | Output uploaded to https://archive.org/details/ia-petabox-journal-metadata-2018 5 | 6 | Commands: 7 | 8 | # didn't work! 9 | #ia-mine --search collection:journals --itemlist > journals.20181218.itemlist 10 | 11 | # fetched manually via metamgr, using prefix matches 12 | cat metamgr-* > metamgr-journals-loose.20181218.items 13 | 14 | ia-mine metamgr-journals-loose.20181218.items > journals.20181218.json 15 | 16 | export LC_ALL=C 17 | cat journals-ia.20181218.json | jq 'select(.files) | .files[] | select(.format == "Text PDF") | .sha1' -r | sort -S 4G -u > journals-ia.20181218.pdf-sha1.tsv 18 | 19 | Size/results: 20 | 21 | bnewbold@ia601101$ wc -l journals-ia.20181218.json metamgr-journals-loose.20181218.items 22 | 2043877 journals-ia.20181218.json 23 | 2044362 metamgr-journals-loose.20181218.items 24 | 25 | # missed about 500; meh 26 | 27 | -rw-rw-r-- 1 bnewbold bnewbold 9.5G Dec 19 23:26 journals-ia.20181218.json 28 | 29 | bnewbold@ia601101$ wc -l journals-ia.20181218.pdf-sha1.tsv 30 | 1748645 journals-ia.20181218.pdf-sha1.tsv 31 | 32 | ## June 2019 Ingest 33 | 34 | bnewbold@ia601101$ pwd 35 | /schnell/iamine-journals 36 | 37 | zcat journals-ia.20181218.json.gz | rg '"identifier": "arxiv-' > arxiv.json 38 | zcat journals-ia.20181218.json.gz | rg '"identifier": "jstor-' > jstor.json 39 | zcat journals-ia.20181218.json.gz | rg '"identifier": "paper-doi-10_' > paper-doi.json 40 | zcat journals-ia.20181218.json.gz | rg '"identifier": "pubmed-PMC' > pmc.json 41 | 42 | cat arxiv.json | ./ia_pdf_match.py > arxiv.match.json 43 | cat jstor.json | ./ia_pdf_match.py > jstor.match.json 44 | cat paper-doi.json | ./ia_pdf_match.py > paper-doi.match.json 45 | cat pmc.json | ./ia_pdf_match.py > pmc.match.json 46 | 47 | bnewbold@ia601101$ wc -l arxiv.*json jstor.*json paper-doi.*json pmc.*json 48 | 1076012 arxiv.json 49 | 740970 arxiv.match.json 50 | 451204 jstor.json 51 | 451204 jstor.match.json 52 | 77838 paper-doi.json 53 | 23736 
paper-doi.match.json 54 | 209787 pmc.json 55 | 189093 pmc.match.json 56 | 57 | -------------------------------------------------------------------------------- /python/sandcrawler/fileset_types.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | from typing import Any, Dict, List, Optional 3 | 4 | from pydantic import BaseModel 5 | 6 | 7 | class IngestStrategy(str, Enum): 8 | WebFile = "web-file" 9 | WebFileset = "web-fileset" 10 | WebFilesetBundled = "web-fileset-bundled" 11 | ArchiveorgFile = "archiveorg-file" 12 | ArchiveorgFileset = "archiveorg-fileset" 13 | ArchiveorgFilesetBundled = "archiveorg-fileset-bundled" 14 | 15 | 16 | class FilesetManifestFile(BaseModel): 17 | path: str 18 | size: Optional[int] 19 | md5: Optional[str] 20 | sha1: Optional[str] 21 | sha256: Optional[str] 22 | mimetype: Optional[str] 23 | extra: Optional[Dict[str, Any]] 24 | 25 | status: Optional[str] 26 | platform_url: Optional[str] 27 | terminal_url: Optional[str] 28 | terminal_dt: Optional[str] 29 | 30 | 31 | class FilesetPlatformItem(BaseModel): 32 | platform_name: str 33 | platform_status: str 34 | platform_domain: Optional[str] 35 | platform_id: Optional[str] 36 | manifest: Optional[List[FilesetManifestFile]] 37 | 38 | archiveorg_item_name: Optional[str] 39 | archiveorg_item_meta: Optional[dict] 40 | web_base_url: Optional[str] 41 | web_bundle_url: Optional[str] 42 | 43 | 44 | class ArchiveStrategyResult(BaseModel): 45 | ingest_strategy: str 46 | status: str 47 | manifest: List[FilesetManifestFile] 48 | file_file_meta: Optional[Dict[str, Any]] 49 | file_resource: Optional[Any] 50 | bundle_file_meta: Optional[Dict[str, Any]] 51 | bundle_resource: Optional[Any] 52 | bundle_archiveorg_path: Optional[str] 53 | 54 | 55 | class PlatformScopeError(Exception): 56 | """ 57 | For incidents where the platform helper discovers that the fileset/dataset is 58 | out-of-scope after already starting to process it.
59 | 60 | For example, attempting to ingest: 61 | 62 | - a 'latest version' record, when the platform has version-specific records 63 | - a single file within a dataset for a platform which has file-level identifiers 64 | """ 65 | 66 | pass 67 | 68 | 69 | class PlatformRestrictedError(Exception): 70 | """ 71 | When datasets are not publicly available on a platform (yet) 72 | """ 73 | 74 | pass 75 | -------------------------------------------------------------------------------- /sql/dump_reingest_weekly.sql: -------------------------------------------------------------------------------- 1 | 2 | BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; 3 | 4 | COPY ( 5 | SELECT row_to_json(ingest_request.*) FROM ingest_request 6 | LEFT JOIN ingest_file_result ON 7 | ingest_file_result.base_url = ingest_request.base_url 8 | AND ingest_file_result.ingest_type = ingest_request.ingest_type 9 | WHERE 10 | (ingest_request.ingest_type = 'pdf' 11 | OR ingest_request.ingest_type = 'html' 12 | OR ingest_request.ingest_type = 'xml' 13 | OR ingest_request.ingest_type = 'component') 14 | AND ingest_file_result.hit = false 15 | AND ingest_request.created < NOW() - '8 hour'::INTERVAL 16 | AND ingest_request.created > NOW() - '8 day'::INTERVAL 17 | AND (ingest_request.ingest_request_source = 'fatcat-changelog' 18 | OR ingest_request.ingest_request_source = 'fatcat-ingest' 19 | OR ingest_request.ingest_request_source = 'fatcat-ingest-container') 20 | AND ( 21 | ingest_file_result.status like 'spn2-%' 22 | -- OR ingest_file_result.status = 'cdx-error' 23 | -- OR ingest_file_result.status = 'wayback-error' 24 | -- OR ingest_file_result.status = 'wayback-content-error' 25 | OR ingest_file_result.status = 'petabox-error' 26 | -- OR ingest_file_result.status = 'gateway-timeout' 27 | OR ingest_file_result.status = 'no-capture' 28 | ) 29 | AND ingest_file_result.status != 'spn2-error:invalid-url-syntax' 30 | AND ingest_file_result.status != 'spn2-error:filesize-limit' 31 | AND ingest_file_result.status != 'spn2-error:not-found' 32 | AND ingest_file_result.status != 'spn2-error:blocked-url' 33 | AND ingest_file_result.status != 'spn2-error:too-many-redirects' 34 | AND ingest_file_result.status != 'spn2-error:network-authentication-required' 35 | AND ingest_file_result.status != 'spn2-error:unknown' 36 | ) TO '/srv/sandcrawler/tasks/reingest_weekly_current.rows.json'; 37 | 38 | -- bulk re-tries would be: 39 | -- AND (ingest_request.ingest_request_source != 'fatcat-changelog' 40 | -- AND ingest_request.ingest_request_source != 'fatcat-ingest') 41 | 42 | ROLLBACK; 43 | -------------------------------------------------------------------------------- /scalding/src/main/scala/sandcrawler/MissingColumnDumpJob.scala: -------------------------------------------------------------------------------- 1 | package sandcrawler 2 | 3 | import java.util.Properties 4 | 5 | import cascading.property.AppProps 6 | import cascading.tuple.Fields 7 | import com.twitter.scalding._ 8 | import com.twitter.scalding.typed.TDsl._ 9 | import parallelai.spyglass.base.JobBase 10 | import parallelai.spyglass.hbase.HBaseConstants.SourceMode 11 | import parallelai.spyglass.hbase.HBasePipeConversions 12 | import parallelai.spyglass.hbase.HBaseSource 13 | 14 | // This nasty, no-good, horrible Job outputs a list of keys ("sha1:A234...") 15 | // for which the given "column" does not have a value set. 16 | // It does this using a self-join because SpyGlass's HBase SCAN support seems 17 | // to be extremely limited. 
18 | class MissingColumnDumpJob(args: Args) extends JobBase(args) with HBasePipeConversions { 19 | 20 | val output = args("output") 21 | 22 | val allKeys : TypedPipe[String] = MissingColumnDumpJob.getHBaseKeySource( 23 | args("hbase-table"), 24 | args("zookeeper-hosts")) 25 | .read 26 | .fromBytesWritable('key) 27 | .toTypedPipe[String]('key) 28 | 29 | val existingKeys : TypedPipe[(String,Boolean)] = MissingColumnDumpJob.getHBaseColSource( 30 | args("hbase-table"), 31 | args("zookeeper-hosts"), 32 | args("column")) 33 | .read 34 | .fromBytesWritable('key) 35 | .toTypedPipe[String]('key) 36 | .map{ key => (key, true) } 37 | 38 | val missingKeys : TypedPipe[String] = allKeys 39 | .groupBy( identity ) 40 | .leftJoin(existingKeys.groupBy(_._1)) 41 | .toTypedPipe 42 | .collect { case (key, (_, None)) => key } 43 | 44 | missingKeys 45 | .write(TypedTsv[String](output)) 46 | 47 | } 48 | 49 | object MissingColumnDumpJob { 50 | 51 | // eg, "wbgrp-journal-extract-0-qa",7 "mtrcs-zk1.us.archive.org:2181" 52 | def getHBaseColSource(hbaseTable: String, zookeeperHosts: String, col: String) : HBaseSource = { 53 | HBaseBuilder.build( 54 | hbaseTable, 55 | zookeeperHosts, 56 | List(col), 57 | SourceMode.SCAN_ALL) 58 | } 59 | 60 | def getHBaseKeySource(hbaseTable: String, zookeeperHosts: String) : HBaseSource = { 61 | HBaseBuilder.build( 62 | hbaseTable, 63 | zookeeperHosts, 64 | List("f:c"), 65 | SourceMode.SCAN_ALL) 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /sql/pdftrio_queries.md: -------------------------------------------------------------------------------- 1 | 2 | ## Counts / Status 3 | 4 | SELECT status_code, COUNT(*) FROM pdftrio GROUP BY status_code; 5 | 6 | # NOTE: I earlier deleted a large fraction of non-200 status codes, so 7 | # these aren't representative 8 | status_code | count 9 | -------------+--------- 10 | -4 | 16 11 | -2 | 26 12 | 200 | 1117501 13 | 400 | 2695 14 | (4 rows) 15 | 16 | 17 | SELECT status, COUNT(*) FROM pdftrio GROUP BY status; 18 | 19 | status | count 20 | ---------------+--------- 21 | error | 2696 22 | error-connect | 26 23 | error-timeout | 16 24 | success | 1118252 25 | (4 rows) 26 | 27 | SELECT 28 | COUNT(CASE WHEN ensemble_score IS NOT NULL THEN 1 ELSE NULL END) as ensemble_count, 29 | COUNT(CASE WHEN linear_score IS NOT NULL THEN 1 ELSE NULL END) as linear_count, 30 | COUNT(CASE WHEN bert_score IS NOT NULL THEN 1 ELSE NULL END) as bert_count, 31 | COUNT(CASE WHEN image_score IS NOT NULL THEN 1 ELSE NULL END) as image_count 32 | FROM pdftrio; 33 | 34 | 35 | ensemble_count | linear_count | bert_count | image_count 36 | ----------------+--------------+------------+------------- 37 | 1120100 | 976271 | 66209 | 143829 38 | (1 row) 39 | 40 | ## Histograms 41 | 42 | SELECT width_bucket(ensemble_score * 100, 0.0, 100.0, 19) * 5 as buckets, count(*) FROM pdftrio 43 | WHERE status = 'success' 44 | AND ensemble_score IS NOT NULL 45 | GROUP BY buckets 46 | ORDER BY buckets; 47 | 48 | SELECT width_bucket(bert_score * 100, 0.0, 100.0, 19) * 5 as buckets, count(*) FROM pdftrio 49 | WHERE status = 'success' 50 | AND bert_score IS NOT NULL 51 | GROUP BY buckets 52 | ORDER BY buckets; 53 | 54 | SELECT width_bucket(linear_score * 100, 0.0, 100.0, 19) * 5 as buckets, count(*) FROM pdftrio 55 | WHERE status = 'success' 56 | AND linear_score IS NOT NULL 57 | GROUP BY buckets 58 | ORDER BY buckets; 59 | 60 | SELECT width_bucket(image_score * 100, 0.0, 100.0, 19) * 5 as buckets, count(*) FROM pdftrio 61 | WHERE status = 'success' 
62 | AND image_score IS NOT NULL 63 | GROUP BY buckets 64 | ORDER BY buckets; 65 | 66 | -------------------------------------------------------------------------------- /extra/RUNBOOK.md: -------------------------------------------------------------------------------- 1 | 2 | ## Process Un-GROBID-ed PDFs from Wayback 3 | 4 | Sometimes ingest doesn't pick up everything, or we do some heuristic CDX 5 | import, and we want to run GROBID over all the PDFs that haven't been processed 6 | yet. Only want one CDX line per `sha1hex`. 7 | 8 | A hybrid SQL/UNIX way of generating processing list: 9 | 10 | psql sandcrawler < /fast/sandcrawler/sql/dump_ungrobid_pdf.sql | sort -S 4G | uniq -w 40 | cut -f2 > dump_ungrobid_pdf.2020.01-27.json 11 | 12 | From here, there are two options: enqueue in Kafka and let workers run, or 13 | create job files and run them using local worker and GNU/parallel. 14 | 15 | #### Kafka 16 | 17 | Copy/transfer to a Kafka node; load a sample and then the whole output: 18 | 19 | head -n1000 dump_ungrobid_pdf.2020.01-27.json | kafkacat -P -b localhost -t sandcrawler-prod.ungrobided-pg -p -1 20 | cat dump_ungrobid_pdf.2020.01-27.json | kafkacat -P -b localhost -t sandcrawler-prod.ungrobided-pg -p -1 21 | 22 | #### Local JSON 23 | 24 | Older example; if this fails, need to re-run entire thing: 25 | 26 | cat /srv/sandcrawler/tasks/regrobid_cdx.split_*.json | pv -l | parallel -j40 --linebuffer --round-robin --pipe ./grobid_tool.py --kafka-env prod --kafka-hosts wbgrp-svc350.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --grobid-host http://localhost:8070 -j0 extract-json - 27 | 28 | TODO: is it possible to use job log with millions of `--pipe` inputs? That 29 | would be more efficient in the event of failure. 30 | 31 | ## GROBID over many .zip files 32 | 33 | Want to use GNU/Parallel in a mode that will do retries well: 34 | 35 | fd .zip /srv/sandcrawler/tasks/crossref-pre-1909-scholarly-works/ | \ 36 | sort | \ 37 | parallel -j16 --progress --joblog extract_tasks.log --resume-failed \ 38 | './grobid_tool.py --kafka-mode --kafka-env prod --kafka-hosts wbgrp-svc350.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --grobid-host http://localhost:8070 extract-zipfile {}' 39 | 40 | After starting, check that messages are actually getting pushed to kafka 41 | (producer failures can be silent!). If anything goes wrong, run the exact same 42 | command again. The sort is to ensure jobs are enqueued in the same order again; 43 | could also dump `fd` output to a command file first. 
44 | 45 | -------------------------------------------------------------------------------- /scalding/src/test/scala/sandcrawler/HBaseRowCountTest.scala: -------------------------------------------------------------------------------- 1 | package sandcrawler 2 | 3 | import cascading.tuple.Fields 4 | import cascading.tuple.Tuple 5 | import com.twitter.scalding.JobTest 6 | import com.twitter.scalding.Tsv 7 | import com.twitter.scalding.TupleConversions 8 | import org.apache.hadoop.hbase.io.ImmutableBytesWritable 9 | import org.apache.hadoop.hbase.util.Bytes 10 | import org.junit.runner.RunWith 11 | import org.scalatest.FunSpec 12 | import org.scalatest.junit.JUnitRunner 13 | import org.slf4j.LoggerFactory 14 | import parallelai.spyglass.hbase.HBaseConstants.SourceMode 15 | import parallelai.spyglass.hbase.HBaseSource 16 | import scala._ 17 | 18 | /** 19 | * Example of how to define tests for HBaseSource 20 | */ 21 | @RunWith(classOf[JUnitRunner]) 22 | class HBaseRowCountTest extends FunSpec with TupleConversions { 23 | 24 | val output = "/tmp/testOutput" 25 | val (testTable, testHost) = ("test-table", "dummy-host:2181") 26 | 27 | val log = LoggerFactory.getLogger(this.getClass.getName) 28 | 29 | val sampleData = List( 30 | List("sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q", "a", "b"), 31 | List("sha1:C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU", "a", "b"), 32 | List("sha1:SDKUVHC3YNNEGH5WAG5ZAAXWAEBNX4WT", "a", "b"), 33 | List("sha1:35985C3YNNEGH5WAG5ZAAXWAEBNXJW56", "a", "b"), 34 | List("sha1:885C3YNNEGH5WAG5ZAAXWA8BNXJWT6CZ", "a", "b"), 35 | List("sha1:00904C3YNNEGH5WAG5ZA9XWAEBNXJWT6", "a", "b"), 36 | List("sha1:249C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ", "a", "b"), 37 | List("sha1:095893C3YNNEGH5WAG5ZAAXWAEBNXJWT", "a", "b") 38 | ) 39 | 40 | JobTest("sandcrawler.HBaseRowCountJob") 41 | .arg("test", "") 42 | .arg("app.conf.path", "app.conf") 43 | .arg("output", output) 44 | .arg("hbase-table", testTable) 45 | .arg("zookeeper-hosts", testHost) 46 | .arg("debug", "true") 47 | .source[Tuple](HBaseRowCountJob.getHBaseSource(testTable, testHost), 48 | sampleData.map(l => new Tuple(l.map(s => {new ImmutableBytesWritable(Bytes.toBytes(s))}):_*))) 49 | .sink[Tuple](Tsv(output)) { 50 | outputBuffer => 51 | 52 | it("should return the test data provided.") { 53 | assert(outputBuffer.size === 1) 54 | } 55 | 56 | it("should return the correct count") { 57 | assert(outputBuffer(0).getObject(0) === 8) 58 | } 59 | } 60 | .run 61 | .finish 62 | 63 | } 64 | -------------------------------------------------------------------------------- /scalding/src/main/scala/sandcrawler/DumpUnGrobidedJob.scala: -------------------------------------------------------------------------------- 1 | package sandcrawler 2 | 3 | import java.util.Properties 4 | 5 | import cascading.property.AppProps 6 | import cascading.tuple.Fields 7 | import com.twitter.scalding._ 8 | import com.twitter.scalding.typed.TDsl._ 9 | import parallelai.spyglass.base.JobBase 10 | import parallelai.spyglass.hbase.HBaseConstants.SourceMode 11 | import parallelai.spyglass.hbase.HBasePipeConversions 12 | import parallelai.spyglass.hbase.HBaseSource 13 | 14 | // Filters for HBase rows which have not had GROBID run on them, but do have 15 | // full CDX metadata, and dumps to a TSV for later extraction by the 16 | // "extraction-ungrobided" job. 17 | // 18 | // Does the same horrible join thing that MissingColumnDumpJob does.
19 | class DumpUnGrobidedJob(args: Args) extends JobBase(args) with HBasePipeConversions { 20 | 21 | val output = args("output") 22 | 23 | val allKeys : TypedPipe[(String,String,String,String)] = DumpUnGrobidedJob.getHBaseKeySource( 24 | args("hbase-table"), 25 | args("zookeeper-hosts")) 26 | .read 27 | .fromBytesWritable('key, 'c, 'mime, 'cdx) 28 | .toTypedPipe[(String,String,String,String)]('key, 'c, 'mime, 'cdx) 29 | 30 | val existingKeys : TypedPipe[(String,Boolean)] = DumpUnGrobidedJob.getHBaseColSource( 31 | args("hbase-table"), 32 | args("zookeeper-hosts")) 33 | .read 34 | .fromBytesWritable('key) 35 | .toTypedPipe[String]('key) 36 | .map{ key => (key, true) } 37 | 38 | val missingKeys : TypedPipe[(String,String,String,String)] = allKeys 39 | .groupBy(_._1) 40 | .leftJoin(existingKeys.groupBy(_._1)) 41 | .toTypedPipe 42 | .collect { case (key, ((_, c, mime, cdx), None)) => (key, c, mime, cdx) } 43 | 44 | missingKeys 45 | .write(TypedTsv[(String,String,String,String)](output)) 46 | 47 | } 48 | 49 | object DumpUnGrobidedJob { 50 | 51 | // eg, "wbgrp-journal-extract-0-qa",7 "mtrcs-zk1.us.archive.org:2181" 52 | def getHBaseColSource(hbaseTable: String, zookeeperHosts: String) : HBaseSource = { 53 | HBaseBuilder.build( 54 | hbaseTable, 55 | zookeeperHosts, 56 | List("grobid0:status_code"), 57 | SourceMode.SCAN_ALL) 58 | } 59 | 60 | def getHBaseKeySource(hbaseTable: String, zookeeperHosts: String) : HBaseSource = { 61 | HBaseBuilder.build( 62 | hbaseTable, 63 | zookeeperHosts, 64 | List("f:c", "file:mime", "file:cdx"), 65 | SourceMode.SCAN_ALL) 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala: -------------------------------------------------------------------------------- 1 | 2 | package sandcrawler 3 | 4 | import cascading.flow.FlowDef 5 | import cascading.pipe.Pipe 6 | import cascading.tuple.Fields 7 | import com.twitter.scalding._ 8 | import com.twitter.scalding.typed.TDsl._ 9 | import org.apache.hadoop.hbase.io.ImmutableBytesWritable 10 | import org.apache.hadoop.hbase.util.Bytes 11 | import parallelai.spyglass.base.JobBase 12 | import parallelai.spyglass.hbase.HBaseConstants.SourceMode 13 | import parallelai.spyglass.hbase.HBasePipeConversions 14 | import parallelai.spyglass.hbase.HBaseSource 15 | 16 | class GrobidScorableDumpJob(args: Args) extends JobBase(args) { 17 | 18 | val grobidHbaseRows = Stat("hbase-rows-scanned", "hbase-grobid-dump") 19 | val filteredGrobidRows = Stat("grobid-rows-filtered", "hbase-grobid-dump") 20 | val parsedGrobidRows = Stat("grobid-rows-parsed", "hbase-grobid-dump") 21 | val validGrobidRows = Stat("grobid-rows-valid-slug", "hbase-grobid-dump") 22 | 23 | val pipe = GrobidScorable.getHBaseSource(args("hbase-table"), args("zookeeper-hosts")) 24 | .read 25 | // Can't just "fromBytesWritable" because we have multiple types? 26 | .toTypedPipe[(ImmutableBytesWritable,ImmutableBytesWritable,ImmutableBytesWritable)](new Fields("key", "metadata", "status_code")) 27 | .filter { case (_, metadata, status_code) => 28 | grobidHbaseRows.inc 29 | metadata != null && status_code != null 30 | } 31 | .map { case (key, metadata, status_code) => 32 | (Bytes.toString(key.copyBytes()), Bytes.toString(metadata.copyBytes()), Bytes.toLong(status_code.copyBytes())) 33 | } 34 | // TODO: Should I combine next two stages for efficiency? 
35 | .collect { case (key, json, 200) => 36 | filteredGrobidRows.inc 37 | (key, json) 38 | } 39 | .map { entry : (String, String) => 40 | parsedGrobidRows.inc 41 | GrobidScorable.jsonToMapFeatures(entry._1, entry._2) 42 | } 43 | .filterNot { entry => entry.isEmpty } 44 | .map { entry => { 45 | validGrobidRows.inc 46 | entry.get 47 | }} 48 | .groupBy { case MapFeatures(slug, json) => slug } 49 | .map { tuple => 50 | val (slug : String, features : MapFeatures) = tuple 51 | (slug, ReduceFeatures(features.json)) 52 | } 53 | 54 | pipe 55 | .map { case (slug, features) => 56 | (slug, features.json) 57 | } 58 | .write(TypedTsv[(String, String)](args("output"))) 59 | } 60 | -------------------------------------------------------------------------------- /kafka/debugging_issues.txt: -------------------------------------------------------------------------------- 1 | 2 | ## 2020-11-12 3 | 4 | To reset a consumer group to the offsets from a specific date (or datetime), 5 | use: 6 | 7 | ./kafka-consumer-groups.sh --bootstrap-server localhost:9092 --group persist-grobid-s3 --reset-offsets --all-topics --to-datetime 2020-11-09T00:00:00.000 8 | 9 | Add `--execute` to actually commit the change. 10 | 11 | ## 2018-12-02 12 | 13 | Had been having some troubles with consumer group partition assignments with 14 | the grobid-output group and grobid-hbase-insert consumer group. Tried deleting 15 | and re-creating, which was probably a mistake. Also tried to use kafka-broker 16 | shell scripts to cleanup/debug and didn't work well. 17 | 18 | In the end, after re-building the topic, decided to create a new consumer group 19 | (grobid-hbase-insert2) to get rid of history/crap. Might need to do this again 20 | in the future, oh well. 21 | 22 | A few things learned: 23 | 24 | - whatever pykafka "native python" is producing to consumer group offsets 25 | doesn't work great with kafka-manager or the shell scripts: consumer instance 26 | names don't show. this is an error in shell scripts, and blank/red in 27 | kafka-manager 28 | - restarting kafka-manager takes a while (for it to refresh data?) and it shows 29 | inconsistent stuff during that period, but it does result in cleaned up 30 | consumer group cached metadata (aka, old groups are cleared) 31 | - kafka-manager can't fetch JMX info, either due to lack of config or port 32 | blocking. should try to fix this for metrics etc 33 | - it would be nice to be using recent librdkafka everywhere. pykafka can 34 | optionally use this, and many other tools do automatically. however, this is 35 | a system package, and xenial doesn't have backports (debian stretch does). 36 | the version in bionic looks "good enough", so maybe should try that? 37 | - there has been a minor release of kafka (2.1) since I installed (!) 38 | - the burrow (consumer group monitoring) tool is packaged for some version of 39 | ubuntu 40 | 41 | In general, not feeling great about the current setup. Very frustrating that the 42 | debug/status tools are broken with pykafka native output. Need to at least 43 | document things a lot better. 44 | 45 | Separately, came up with an idea to do batched processing with GROBID: don't 46 | auto-commit, instead consume a batch (10? or until block), process those, then 47 | commit. This being a way to get "the batch size returned".
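A rough sketch of that batched manual-commit idea, assuming the confluent-kafka client (librdkafka bindings) rather than pykafka; the topic/group names and the process_batch() helper here are illustrative, not what the current workers actually use:

    from confluent_kafka import Consumer

    consumer = Consumer({
        "bootstrap.servers": "localhost:9092",
        "group.id": "grobid-hbase-insert2",   # illustrative group name
        "enable.auto.commit": False,          # commit manually, once per batch
        "auto.offset.reset": "earliest",
    })
    consumer.subscribe(["sandcrawler-prod.grobid-output"])  # illustrative topic

    while True:
        # block until up to 10 messages arrive, or the timeout expires
        batch = consumer.consume(num_messages=10, timeout=5.0)
        msgs = [m for m in batch if m.error() is None]
        if not msgs:
            continue
        process_batch(msgs)                   # hypothetical insert/processing step
        consumer.commit(asynchronous=False)   # offsets only advance after the whole batch succeeds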
48 | 49 | -------------------------------------------------------------------------------- /notes/tasks/2020-08-20_file_meta.md: -------------------------------------------------------------------------------- 1 | 2 | Want to update fatcat file entities with "full" file metadata for those which are missing it. 3 | 4 | How many `file_meta` rows *still* don't have metadata? 5 | 6 | SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL; 7 | => 62962 8 | 9 | First generate list of sha1hex from most recent bulk export which are missing 10 | at least some metadata (based on missing sha256): 11 | 12 | zcat file_hashes.tsv.gz | rg '\t\t' | cut -f3 | sort -u -S 4G | pv -l > fatcat_file_partial_sha1hex.tsv 13 | => 18.7M 0:05:46 [53.8k/s] 14 | 15 | Then dump the entire sandcrawler `file_meta` table as TSV, with first column 16 | sha1hex and second column JSON with all the file metadata fields: 17 | 18 | COPY ( 19 | SELECT sha1hex, row_to_json(file_meta) 20 | FROM file_meta 21 | WHERE sha256hex IS NOT NULL 22 | ORDER BY sha1hex ASC 23 | ) 24 | TO '/grande/snapshots/file_meta_dump.tsv' 25 | WITH NULL ''; 26 | 27 | Join/cut: 28 | 29 | export LC_ALL=C 30 | join -t$'\t' fatcat_file_partial_sha1hex.tsv /grande/snapshots/file_meta_dump.tsv | uniq -w 40 | cut -f2 | pv -l > fatcat_file_partial.file_meta.json 31 | => 18.1M 0:03:37 [83.2k/s] 32 | 33 | Check counts: 34 | 35 | cat fatcat_file_partial.file_meta.json | jq .sha1hex -r | sort -u -S 4G | wc -l 36 | => 18135313 37 | 38 | zcat fatcat_file_partial.file_meta.json.gz | jq .mimetype -r | sort -S 4G | uniq -c | sort -nr 39 | 18103860 application/pdf 40 | 29977 application/octet-stream 41 | 876 text/html 42 | 199 application/postscript 43 | 171 application/gzip 44 | 84 text/plain 45 | 48 application/xml 46 | 38 application/vnd.ms-powerpoint 47 | 16 application/msword 48 | 8 application/vnd.openxmlformats-officedocument.wordprocessingml.document 49 | 6 image/jpeg 50 | 4 message/rfc822 51 | 4 application/zip 52 | 4 application/vnd.openxmlformats-officedocument.presentationml.presentation 53 | 3 text/x-tex 54 | 3 application/x-dosexec 55 | 2 application/x-tar 56 | 2 application/vnd.ms-tnef 57 | 1 video/mpeg 58 | 1 image/tiff 59 | 1 image/svg+xml 60 | 1 image/png 61 | 1 image/gif 62 | 1 audio/x-ape 63 | 1 application/vnd.ms-office 64 | 1 application/CDFV2-unknown 65 | 66 | TODO: fatcat importer 67 | -------------------------------------------------------------------------------- /scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala: -------------------------------------------------------------------------------- 1 | package sandcrawler 2 | 3 | import java.io.InputStream 4 | 5 | import scala.io.Source 6 | 7 | import org.scalatest._ 8 | 9 | // scalastyle:off null 10 | class ScorableFeaturesTest extends FlatSpec with Matchers { 11 | "toMapFeatures()" should "work with gnarly inputs" in { 12 | ScorableFeatures.create(title = null).toMapFeatures 13 | ScorableFeatures.create(title = "something", doi = null, sha1 = null, year = 123).toMapFeatures 14 | } 15 | 16 | private def titleToSlug(s : String) : Option[String] = ScorableFeatures.create(title = s).toSlug 17 | 18 | "mapToSlug()" should "extract the parts of titles before a colon" in { 19 | titleToSlug("HELLO:there") shouldBe Some("hellothere") 20 | } 21 | 22 | it should "extract an entire colon-less string" in { 23 | titleToSlug("hello THERE") shouldBe Some("hellothere") 24 | } 25 | 26 | it should "return Scorable.NoSlug if given empty string" in { 27 | titleToSlug("") shouldBe (None) 28 | } 29 | 30 | it 
should "return Scorable.NoSlug if given null" in { 31 | titleToSlug(null) shouldBe (None) 32 | } 33 | 34 | it should "strip punctuation" in { 35 | titleToSlug("HELLO!:the:re") shouldBe Some("hellothere") 36 | titleToSlug("a:b:cdefgh") shouldBe Some("abcdefgh") 37 | titleToSlug( 38 | "If you're happy and you know it, clap your hands!") shouldBe Some("ifyourehappyandyouknowitclapyourhands") 39 | titleToSlug(":;\"\'") shouldBe (None) 40 | } 41 | 42 | it should "filter stub titles" in { 43 | titleToSlug("abstract") shouldBe (None) 44 | titleToSlug("title!") shouldBe (None) 45 | titleToSlug("a real title which is not on denylist") shouldBe Some("arealtitlewhichisnotondenylist") 46 | } 47 | 48 | it should "strip special characters" in { 49 | titleToSlug(":;!',|\"\'`.#?!-@*/\\=+~%$^{}()[]<>-_’·“”‘’“”«»「」¿–±§ʿ") shouldBe (None) 50 | // TODO: titleToSlug("©™₨№…") shouldBe (None) 51 | // TODO: titleToSlug("πµΣσ") shouldBe (None) 52 | } 53 | 54 | it should "remove whitespace" in { 55 | titleToSlug("foo bar : baz ::") shouldBe Some("foobarbaz") 56 | titleToSlug("\na\t:b:cdefghi") shouldBe Some("abcdefghi") 57 | titleToSlug("\n \t \r ") shouldBe (None) 58 | } 59 | 60 | it should "skip very short slugs" in { 61 | titleToSlug("short") shouldBe (None) 62 | titleToSlug("a longer, more in depth title") shouldBe Some("alongermoreindepthtitle") 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /python/tests/test_pdfextract.py: -------------------------------------------------------------------------------- 1 | import struct 2 | 3 | import poppler 4 | import pytest 5 | from test_wayback import cdx_client, wayback_client # noqa:F401 6 | 7 | from sandcrawler import BlackholeSink, CdxLinePusher, PdfExtractBlobWorker, PdfExtractWorker 8 | from sandcrawler.pdfextract import process_pdf 9 | 10 | FAKE_PDF_BYTES = b"%PDF SOME JUNK" + struct.pack("!q", 112853843) 11 | 12 | 13 | def test_process_fake_pdf(): 14 | resp = process_pdf(FAKE_PDF_BYTES) 15 | print(resp) 16 | assert resp.status == "not-pdf" 17 | 18 | with open("tests/files/dummy_zip.zip", "rb") as f: 19 | pdf_bytes = f.read() 20 | resp = process_pdf(pdf_bytes) 21 | assert resp.status == "not-pdf" 22 | 23 | 24 | @pytest.mark.skipif( 25 | poppler.version_string() == "0.71.0", reason="unsupported version of poppler" 26 | ) 27 | def test_process_dummy_pdf(): 28 | with open("tests/files/dummy.pdf", "rb") as f: 29 | pdf_bytes = f.read() 30 | resp = process_pdf(pdf_bytes) 31 | assert resp.status == "success" 32 | assert resp.page0_thumbnail is not None 33 | assert len(resp.text) > 10 34 | assert resp.meta_xml is None 35 | assert resp.file_meta["mimetype"] == "application/pdf" 36 | print(resp.pdf_info) 37 | print(resp.pdf_extra) 38 | assert resp.pdf_info["Author"] == "Evangelos Vlachogiannis" 39 | # 595 x 842 40 | assert resp.pdf_extra["page0_height"] == 842 41 | assert resp.pdf_extra["page0_width"] == 595 42 | assert resp.pdf_extra["page_count"] == 1 43 | 44 | 45 | def test_pdfextract_worker_cdx(wayback_client): # noqa: F811 46 | 47 | sink = BlackholeSink() 48 | worker = PdfExtractWorker(wayback_client, sink=sink, thumbnail_sink=sink) 49 | 50 | with open("tests/files/example.cdx", "r") as cdx_file: 51 | pusher = CdxLinePusher( 52 | worker, 53 | cdx_file, 54 | filter_http_statuses=[200, 226], 55 | filter_mimetypes=["application/pdf"], 56 | ) 57 | pusher_counts = pusher.run() 58 | assert pusher_counts["total"] 59 | assert pusher_counts["pushed"] == 7 60 | assert pusher_counts["pushed"] == worker.counts["total"] 61 | 62 | 
63 | def test_pdfextract_blob_worker(): 64 | 65 | sink = BlackholeSink() 66 | worker = PdfExtractBlobWorker(sink=sink, thumbnail_sink=sink) 67 | 68 | with open("tests/files/dummy.pdf", "rb") as f: 69 | pdf_bytes = f.read() 70 | 71 | worker.process(pdf_bytes) 72 | -------------------------------------------------------------------------------- /python/scripts/covid2ingestrequest.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Transform COVID-19 paper metadata (CNKI and Wanfang scrapes, JSON) into ingest requests. 4 | """ 5 | 6 | import argparse 7 | import json 8 | import sys 9 | 10 | import urlcanon 11 | 12 | 13 | def canon(s): 14 | parsed = urlcanon.parse_url(s) 15 | return str(urlcanon.whatwg(parsed)) 16 | 17 | 18 | def transform_cnki(obj): 19 | 20 | requests = [] 21 | assert obj["cnki_id"] 22 | 23 | requests = [] 24 | requests.append( 25 | { 26 | "base_url": canon(obj["info_url"]), 27 | "ingest_type": "pdf", 28 | "link_source": "cnki_covid19", 29 | "link_source_id": obj["cnki_id"], 30 | "ingest_request_source": "scrape-covid19", 31 | } 32 | ) 33 | if "read_url" in obj: 34 | requests.append( 35 | { 36 | "base_url": canon(obj["read_url"]), 37 | "ingest_type": "pdf", # actually HTML 38 | "link_source": "cnki_covid19", 39 | "link_source_id": obj["cnki_id"], 40 | "ingest_request_source": "scrape-covid19", 41 | } 42 | ) 43 | 44 | return requests 45 | 46 | 47 | def transform_wanfang(obj): 48 | 49 | assert obj["wanfang_id"] 50 | return [ 51 | { 52 | "base_url": canon(obj["url"]), 53 | "ingest_type": "pdf", 54 | "link_source": "wanfang_covid19", 55 | "link_source_id": obj["wanfang_id"], 56 | "ingest_request_source": "scrape-covid19", 57 | } 58 | ] 59 | 60 | 61 | def run(args): 62 | for l in args.json_file: 63 | if not l.strip(): 64 | continue 65 | row = json.loads(l) 66 | 67 | if "wanfang_id" in row: 68 | requests = transform_wanfang(row) or [] 69 | elif "cnki_id" in row: 70 | requests = transform_cnki(row) or [] 71 | else: 72 | continue 73 | for r in requests: 74 | print("{}".format(json.dumps(r, sort_keys=True))) 75 | 76 | 77 | def main(): 78 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 79 | parser.add_argument( 80 | "json_file", help="COVID-19 metadata file to use", type=argparse.FileType("r") 81 | ) 82 | subparsers = parser.add_subparsers() 83 | 84 | args = parser.parse_args() 85 | 86 | run(args) 87 | 88 | 89 | if __name__ == "__main__": 90 | main() 91 | -------------------------------------------------------------------------------- /TODO: -------------------------------------------------------------------------------- 1 | 2 | Note: as of 2022 this file is ancient and needs review 3 | 4 | ## Kafka Pipelines 5 | 6 | - after network split, mass restarting import/harvest stuff seemed to 7 | completely reset consumergroups (!). bunch of LeaderNotFoundError 8 | => change/update consumer group config 9 | => ensure we are recording timestamps to allow timestamp-based resets 10 | - refactor python kafka clients (slack convo with kenji+dvd) 11 | => try librdkafka? 12 | => switch to python-kafka? 13 | - monitoring/alerting of consumergroup offsets 14 | => start with crude python script? 15 | - document: need to restart all consumers after brokers restart 16 | - operate on batches, using threads/async, and reduce worker (process) counts 17 | dramatically 18 | 19 | source of kafka-manager weirdness?
20 | Dec 02 01:05:40 wbgrp-svc263.us.archive.org kafka-manager[7032]: org.apache.kafka.common.protocol.types.SchemaException: Error reading field 'user_data': java.nio.BufferUnderflowException 21 | Dec 02 01:05:40 wbgrp-svc263.us.archive.org kafka-manager[7032]: [error] k.m.a.c.KafkaManagedOffsetCache - Failed to get member metadata from group summary and member summary : grobid-hbase-insert : MemberSummary(pykafka-8128e0be-4952-4e79-8644-a52987421259,pykafka,/207.241.225.228,[B@6c368f37,[B@2b007e01) 22 | 23 | ## Other 24 | 25 | - paper match heuristic: include 10.1007%2F978-3-319-49304-6_18 (URL-escaped slash) 26 | - catch EOFFail fetching from wayback 27 | - "author counts match" in scoring 28 | - refactor "scorable" to "matchable" 29 | - look at refactoring to reduce JSON serializations 30 | - QA tool for matches (PDF + Crossref JSON + landing page?) 31 | => python; talks directly to HBase 32 | - author counts should match (+/- one?) 33 | 34 | match strategies (hbase columns): 35 | - legacy_doi 36 | - url_doi 37 | - grobid_crossref (doi) 38 | - grobid_fatcat (fatcat ID) 39 | 40 | scalding: 41 | - better JSON library 42 | - less verbose sbt test output (set log level to WARN) 43 | - auto-formatting: addSbtPlugin("com.geirsson" % "sbt-scalafmt" % "1.6.0-RC3") 44 | 45 | pig: 46 | - potentially want to *not* de-dupe CDX lines by uniq sha1 in all cases; run 47 | this as a second-stage filter? for example, may want many URL links in fatcat 48 | for a single file (different links, different policies) 49 | - fix pig gitlab-ci tests (JAVA_HOME) 50 | 51 | python: 52 | - include input file name (and chunk? and CDX?) in sentry context 53 | - how to get argument (like --hbase-table) into mrjob.conf, or similar? 54 | -------------------------------------------------------------------------------- /extra/blobs/tasks.md: -------------------------------------------------------------------------------- 1 | 2 | ## Backfill GROBID XML to Blob Store 3 | 4 | Initially ran this when spinning up new seaweedfs server to replace minio. At 5 | this time grobid persist worker was in db-only mode, as minio was too slow to 6 | accept uploads. Rough plan is to: 7 | 8 | 1. run grobid persist worker from Kafka with a new temporary consumer group, 9 | from the start of the GROBID output topic 10 | 2. when it gets to end, stop the *regular* consumer group while this one is 11 | still running. with temporary worker still running, at that point in time 12 | entire topic should be in S3 13 | 3. then reconfigure regular worker to db+s3 mode. halt the temporary worker, 14 | restart the regular one with new config, run it indefinitely 15 | 16 | Consumer group isn't an arg, so just edit `persist_worker.py` and set it to 17 | `persist-grobid-seaweedfs`. Also needed to patch a bit so `--s3-only` mode 18 | didn't try to connect to postgresql. 
19 | 20 | Commands: 21 | 22 | ./sandcrawler_worker.py --kafka-hosts wbgrp-svc350.us.archive.org:9092 --env prod --s3-bucket sandcrawler --s3-url wbgrp-svc169.us.archive.org:8333 persist-grobid --s3-only 23 | => Consuming from kafka topic sandcrawler-prod.grobid-output-pg, group persist-grobid-seaweed 24 | => run briefly, then kill 25 | 26 | On kafka-broker worker: 27 | 28 | ./kafka-consumer-groups.sh --bootstrap-server localhost:9092 --reset-offsets --to-earliest --group persist-grobid-seaweed --topic sandcrawler-prod.grobid-output-pg --dry-run 29 | 30 | Then run 2x instances of worker (same command as above): 31 | 32 | ./sandcrawler_worker.py --kafka-hosts wbgrp-svc350.us.archive.org:9092 --env prod --s3-bucket sandcrawler --s3-url wbgrp-svc169.us.archive.org:8333 persist-grobid --s3-only 33 | 34 | At this point CPU-limited on this worker by the python processes (only 4 cores 35 | on this machine). 36 | 37 | Check in weed shell: 38 | 39 | weed shell 40 | 41 | > > fs.meta.cat buckets/sandcrawler/grobid/00/00/000068a76ab125389506e8834483c6ba4c73338a.tei.xml 42 | [...] 43 | "isGzipped": false 44 | [...] 45 | "mime": "application/xml", 46 | [...] 47 | 48 | An open question is if we should have separate buckets per derive type. Eg, a 49 | GROBID XML bucket separate from thumbnails bucket. Or are prefix directories 50 | enough. Basically this comes down to whether we want things mixed together at 51 | the volume level. I think we should keep separate. 52 | 53 | Need to set the mimetype in the upload for gzip on XML? 54 | --------------------------------------------------------------------------------