├── .gitignore
├── LICENSE.txt
├── README.md
├── batchlite
    ├── pom.xml
    └── src
    │   └── main
    │       ├── java
    │           └── org
    │           │   └── tallison
    │           │       └── batchlite
    │           │           ├── AbstractDirectoryProcessor.java
    │           │           ├── AbstractFileProcessor.java
    │           │           ├── CommandlineFileProcessor.java
    │           │           ├── CommandlineFileToFileProcessor.java
    │           │           ├── CommandlineStdoutToFileProcessor.java
    │           │           ├── ConfigSrc.java
    │           │           ├── FileProcessResult.java
    │           │           ├── FileProcessor.java
    │           │           ├── FileToFileProcessor.java
    │           │           ├── MetadataWriter.java
    │           │           ├── ProcessExecutor.java
    │           │           ├── StreamEater.java
    │           │           ├── example
    │           │               ├── FileCommandExample.java
    │           │               ├── PDFChecker.java
    │           │               └── PDFStdoutChecker.java
    │           │           └── writer
    │           │               ├── CSVMetadataWriter.java
    │           │               ├── JDBCMetadataWriter.java
    │           │               ├── JSONMetadataWriter.java
    │           │               ├── MetadataWriterFactory.java
    │           │               ├── PathResultPair.java
    │           │               └── WriterResult.java
    │       └── resources
    │           └── log4j2.xml
├── commoncrawl-fetcher
    ├── pom.xml
    └── src
    │   ├── README.txt
    │   ├── main
    │       ├── java
    │       │   └── org
    │       │   │   └── tallison
    │       │   │       ├── cc
    │       │   │           ├── CCFileFetcher.java
    │       │   │           ├── CCIndexReaderCounter.java
    │       │   │           ├── CCIndexWGetter.java
    │       │   │           ├── IndexGrep.java
    │       │   │           ├── Refetcher.java
    │       │   │           ├── S3IndexGetter.java
    │       │   │           ├── fetcherlite
    │       │   │           │   ├── CCFileFetcherLiteCLI.java
    │       │   │           │   ├── FetchLiteRecordProcessor.java
    │       │   │           │   ├── FetcherLiteConfig.java
    │       │   │           │   └── FileFromCCWarcFetcher.java
    │       │   │           ├── index
    │       │   │           │   ├── AbstractRecordProcessor.java
    │       │   │           │   ├── CCIndexRecord.java
    │       │   │           │   ├── CCIndexWGetter.java
    │       │   │           │   ├── CompositeRecordFilter.java
    │       │   │           │   ├── IndexFileChecker.java
    │       │   │           │   ├── IndexRecordProcessor.java
    │       │   │           │   ├── LatLongAdder.java
    │       │   │           │   ├── MimeCounter.java
    │       │   │           │   ├── RecordFilter.java
    │       │   │           │   └── db
    │       │   │           │   │   ├── DBIndexer.java
    │       │   │           │   │   └── DBIndexerCLI.java
    │       │   │           └── pipes
    │       │   │           │   └── CCIndexPipesIterator.java
    │       │   │       └── util
    │       │   │           ├── DBUtil.java
    │       │   │           ├── HTTPFetchWrapper.java
    │       │   │           ├── HostUpsert.java
    │       │   │           ├── MapUtil.java
    │       │   │           └── ReloadFetchStatusTable.java
    │       └── resources
    │       │   ├── log4j2.xml
    │       │   ├── selectFetchAndFetchStatus.sql
    │       │   ├── selectFilesToFetchFromCC.sql
    │       │   ├── selectFilesToFetchPerWarcId.sql
    │       │   ├── selectIndexedAndFetchedData.sql
    │       │   ├── selectIndexedData.sql
    │       │   └── selectWarcFileIdsToFetchFromCC.sql
    │   └── test
    │       ├── java
    │           ├── CCIndexRecordTest.java
    │           ├── CompositeRecordFilterTest.java
    │           └── FetcherTest.java
    │       └── resources
    │           ├── examples
    │               ├── mpeg-filters.json
    │               ├── tika-config-fetch-fs.xml
    │               ├── tika-config-index-fs.xml
    │               ├── tika-config-index-s3.xml
    │               └── tika-config-refetch-fs.xml
    │           └── test-documents
    │               ├── mime-filters-av.json
    │               ├── mime-filters.json
    │               ├── pdf-filter-sample.json
    │               ├── pdf-filter.json
    │               ├── status-filter.json
    │               └── status-sample-filter.json
├── ingest-jdbc
    ├── pom.xml
    └── src
    │   └── main
    │       └── java
    │           └── org
    │               └── tallison
    │                   └── ingest
    │                       └── arlington
    │                           └── ArlingtonIngest.java
├── ingest
    ├── pom.xml
    └── src
    │   ├── main
    │       ├── java
    │       │   └── org
    │       │   │   └── tallison
    │       │   │       └── ingest
    │       │   │           ├── CompositeFeatureMapper.java
    │       │   │           ├── FeatureMapper.java
    │       │   │           ├── IngesterCLI.java
    │       │   │           ├── IngesterToCSVCLI.java
    │       │   │           ├── mappers
    │       │   │               ├── ArlingtonMapper.java
    │       │   │               ├── CPUMapper.java
    │       │   │               ├── CaradocMapper.java
    │       │   │               ├── ClamAVMapper.java
    │       │   │               ├── ESUtil.java
    │       │   │               ├── MultiCompareMapper.java
    │       │   │               ├── MutoolMapper.java
    │       │   │               ├── PDFBytesMapper.java
    │       │   │               ├── PDFCheckerMapper.java
    │       │   │               ├── PDFFontsMapper.java
    │       │   │               ├── PDFInfoFeatureMapper.java
    │       │   │               ├── PDFMinerMapper.java
    │       │   │               ├── PDFResurrectMapper.java
    │       │   │               ├── ProfileFeatureMapper.java
    │       │   │               ├── QPDFFeatureMapper.java
    │       │   │               ├── StatusFeatureMapper.java
    │       │   │               ├── TikaFeatureMapper.java
    │       │   │               ├── UniverseMapper.java
    │       │   │               └── XPDFFontsMapper.java
    │       │   │           ├── qpdf
    │       │   │               ├── QPDFJsonExtractor.java
    │       │   │               └── QPDFResults.java
    │       │   │           ├── qpdf10
    │       │   │               └── qpdf
    │       │   │               │   ├── QPDFJsonExtractor.java
    │       │   │               │   └── QPDFResults.java
    │       │   │           └── utils
    │       │   │               ├── CSVsToPostgres.java
    │       │   │               ├── ESToCSV.java
    │       │   │               └── FindMissing.java
    │       └── resources
    │       │   ├── META-INF
    │       │       └── services
    │       │       │   └── org.tallison.ingest.FeatureMapper
    │       │   ├── common-keys.txt
    │       │   ├── important-int-keys.txt
    │       │   ├── log4j.properties
    │       │   ├── observatory-mappings.json
    │       │   ├── selectStar-dev.sql
    │       │   ├── selectStar-lite.sql
    │       │   ├── selectStar-minimal.sql
    │       │   ├── selectStar-sample.sql
    │       │   └── selectStar.sql
    │   └── test
    │       ├── java
    │           └── org
    │           │   └── tallison
    │           │       └── ingest
    │           │           └── mappers
    │           │               ├── ArlingtonMapperTest.java
    │           │               ├── MapperTest.java
    │           │               ├── PDFCheckerMapperTest.java
    │           │               ├── PDFFontsMapperTest.java
    │           │               ├── PDFInfoMapperTest.java
    │           │               ├── QPDF10JsonExtractorTest.java
    │           │               ├── QPDFJsonExtractorTest.java
    │           │               └── XPDFFontsMapperTest.java
    │       └── resources
    │           └── test-documents
    │               ├── GHOSTSCRIPT-687771-0.pdf.json
    │               ├── GHOSTSCRIPT-690371-0.pdf.json
    │               ├── GHOSTSCRIPT-702993-0.pdf.json
    │               ├── arlington
    │                   ├── GHOSTSCRIPT-687499-0.pdf.txt
    │                   ├── GHOSTSCRIPT-687647-0.pdf.txt
    │                   └── GHOSTSCRIPT-688076-1.pdf.txt
    │               ├── pdfchecker
    │                   ├── GHOSTSCRIPT-696838-0.zip-0.pdf.json
    │                   └── fonts-PDFBOX-1002-2.pdf.json
    │               ├── pdffonts
    │                   └── test-basic.txt
    │               ├── qpdfv11
    │                   └── qpdf.json
    │               ├── simple.json
    │               ├── types.json
    │               └── xpdffonts
    │                   └── test-basic.txt
├── pom.xml
├── simple-ingester
    ├── pom.xml
    └── src
    │   └── main
    │       └── java
    │           └── org
    │               └── tallison
    │                   ├── ingester
    │                       └── IngesterCLI.java
    │                   └── tika
    │                       └── parser
    │                           ├── ConcatenatingParser.java
    │                           └── TikaServerClient.java
├── tika-addons
    ├── pom.xml
    ├── tika-eval-multicomparer
    │   ├── pom.xml
    │   └── src
    │   │   └── main
    │   │       └── java
    │   │           └── org
    │   │               └── tallison
    │   │                   └── tika
    │   │                       └── eval
    │   │                           └── multi
    │   │                               ├── ListGenerator.java
    │   │                               ├── MultiCompareWorker.java
    │   │                               └── MultiComparerCLI.java
    ├── tika-pipes-reporter
    │   ├── pom.xml
    │   └── src
    │   │   └── main
    │   │       └── java
    │   │           └── org
    │   │               └── tallison
    │   │                   └── tika
    │   │                       └── pipes
    │   │                           └── TikaPipesReporter.java
    └── tika-server-fuzzer
    │   ├── pom.xml
    │   └── src
    │       └── main
    │           └── java
    │               └── FuzzClient.java
├── tika-containers
    ├── pom.xml
    ├── tika-arlington
    │   ├── Dockerfile
    │   ├── my-tika-config.xml
    │   ├── pom.xml
    │   └── src
    │   │   └── main
    │   │       └── java
    │   │           └── org
    │   │               └── tallison
    │   │                   └── observatory
    │   │                       └── RegexCaptureParser.java
    ├── tika-exiftool
    │   ├── Dockerfile
    │   ├── my-tika-config.xml
    │   └── pom.xml
    ├── tika-pdfchecker
    │   ├── Dockerfile
    │   ├── my-tika-config.xml
    │   ├── pom.xml
    │   ├── src
    │   │   ├── main
    │   │   │   ├── java
    │   │   │   │   └── org
    │   │   │   │   │   └── tallison
    │   │   │   │   │       └── tika
    │   │   │   │   │           └── parsers
    │   │   │   │   │               └── pdfchecker
    │   │   │   │   │                   └── PDFChecker.java
    │   │   │   └── resources
    │   │   │   │   └── META-INF
    │   │   │   │       └── services
    │   │   │   │           └── org.apache.tika.parser.Parser
    │   │   └── test
    │   │   │   ├── java
    │   │   │       └── TikaPDFToTextTest.java
    │   │   │   └── resources
    │   │   │       └── test-documents
    │   │   │           └── testPDF.pdf
    │   └── tika-server-core-2.0.0-SNAPSHOT.jar
    ├── tika-pdfium
    │   └── my-args.gn
    ├── tika-pdfjs-selenium
    │   ├── pom.xml
    │   └── src
    │   │   └── main
    │   │       └── java
    │   │           └── FirefoxSeleniumExample.java
    ├── tika-pdfjs
    │   ├── Dockerfile
    │   ├── js
    │   │   └── my-getinfo.js
    │   ├── my-tika-config.xml
    │   ├── pom.xml
    │   └── src
    │   │   ├── main
    │   │       └── java
    │   │       │   └── org
    │   │       │       └── tallison
    │   │       │           └── observatory
    │   │       │               └── pdfjs
    │   │       │                   └── PDFJSOutputParser.java
    │   │   └── test
    │   │       ├── java
    │   │           └── org
    │   │           │   └── tallison
    │   │           │       └── observatory
    │   │           │           └── pdfjs
    │   │           │               └── PDFJSOutputParserTest.java
    │   │       └── resources
    │   │           └── test-documents
    │   │               ├── test-basic.txt
    │   │               ├── test-xmp.txt
    │   │               └── test-xmp2.txt
    ├── tika-pdfspelunker
    │   ├── Dockerfile
    │   ├── my-tika-config.xml
    │   ├── pom.xml
    │   └── src
    │   │   ├── main
    │   │       ├── java
    │   │       │   └── org
    │   │       │   │   └── tallison
    │   │       │   │       └── tika
    │   │       │   │           ├── parsers
    │   │       │   │               ├── image
    │   │       │   │               │   ├── ICCImageParser.java
    │   │       │   │               │   └── IccMaxParser.java
    │   │       │   │               └── pdf
    │   │       │   │               │   ├── ImageGraphicsEngine.java
    │   │       │   │               │   ├── PDFImageStreamUtil.java
    │   │       │   │               │   ├── PDFSpelunker.java
    │   │       │   │               │   └── ParseState.java
    │   │       │   │           └── spelunker
    │   │       │   │               └── tools
    │   │       │   │                   └── ExtractICCs.java
    │   │       └── resources
    │   │       │   ├── META-INF
    │   │       │       └── services
    │   │       │       │   └── org.apache.tika.parser.Parser
    │   │       │   └── org
    │   │       │       └── apache
    │   │       │           └── tika
    │   │       │               └── mime
    │   │       │                   └── custom-mimetypes.xml
    │   │   └── test
    │   │       ├── java
    │   │           └── org
    │   │           │   └── tallison
    │   │           │       └── tika
    │   │           │           └── parsers
    │   │           │               ├── image
    │   │           │                   └── ICCImageParserTest.java
    │   │           │               └── pdf
    │   │           │                   └── PDFSpelunkerTest.java
    │   │       └── resources
    │   │           ├── config
    │   │               └── my-tika-config.xml
    │   │           └── test-documents
    │   │               ├── baseball.jpg
    │   │               ├── icc-reports
    │   │                   ├── non-compliant1.txt
    │   │                   ├── not-icc1.txt
    │   │                   └── not-icc2.txt
    │   │               └── testPDF.pdf
    ├── tika-pdftotext
    │   ├── Dockerfile
    │   ├── my-tika-config.xml
    │   └── pom.xml
    ├── tika-pipes-pdfinfo
    │   ├── Dockerfile
    │   ├── log4j2.xml
    │   ├── my-tika-config.xml
    │   ├── pipes-log4j2.xml
    │   └── pom.xml
    ├── tika-pipes-siegfried
    │   ├── Dockerfile
    │   ├── log4j2.xml
    │   ├── my-tika-config.xml
    │   ├── pipes-log4j2.xml
    │   └── pom.xml
    └── tika-pypdf2
    │   ├── Dockerfile
    │   ├── my-tika-config.xml
    │   ├── pom.xml
    │   └── scripts
    │       └── PyPDF2Cli.py
├── tool-runners
    ├── arlington
    │   ├── Dockerfile
    │   ├── env.properties
    │   ├── pom.xml
    │   └── src
    │   │   └── main
    │   │       ├── java
    │   │           └── org
    │   │           │   └── tallison
    │   │           │       └── fileutils
    │   │           │           └── arlington
    │   │           │               └── TestGrammarRunner.java
    │   │       └── resources
    │   │           └── log4j.properties
    ├── caradoc
    │   ├── Dockerfile
    │   ├── pom.xml
    │   └── src
    │   │   └── main
    │   │       ├── java
    │   │           └── org
    │   │           │   └── tallison
    │   │           │       └── fileutils
    │   │           │           └── caradoc
    │   │           │               └── Caradoc.java
    │   │       └── resources
    │   │           └── log4j.properties
    ├── clamav
    │   ├── Dockerfile
    │   ├── conf
    │   │   ├── clam.conf
    │   │   └── freshclam.conf
    │   ├── exec.sh
    │   ├── pom.xml
    │   └── src
    │   │   └── main
    │   │       ├── java
    │   │           └── org
    │   │           │   └── tallison
    │   │           │       └── fileutils
    │   │           │           └── clamav
    │   │           │               └── ClamAVRunner.java
    │   │       └── resources
    │   │           └── log4j.properties
    ├── env.properties
    ├── fileprofiler
    │   ├── Dockerfile
    │   ├── README.txt
    │   ├── pom.xml
    │   └── src
    │   │   └── main
    │   │       ├── java
    │   │           └── org
    │   │           │   └── tallison
    │   │           │       └── fileutils
    │   │           │           └── profiler
    │   │           │               └── FileProfiler.java
    │   │       └── resources
    │   │           └── log4j2.xml
    ├── gstotext
    │   ├── Dockerfile
    │   ├── pom.xml
    │   └── src
    │   │   └── main
    │   │       ├── java
    │   │           └── org
    │   │           │   └── tallison
    │   │           │       └── fileutils
    │   │           │           └── gstotext
    │   │           │               └── GhostScriptToTextRunner.java
    │   │       └── resources
    │   │           └── log4j.properties
    ├── itext
    │   ├── README.md
    │   ├── pom.xml
    │   └── src
    │   │   ├── main
    │   │       ├── java
    │   │       │   └── org
    │   │       │   │   └── tallison
    │   │       │   │       └── tika
    │   │       │   │           └── parser
    │   │       │   │               └── itext
    │   │       │   │                   └── ITextParser.java
    │   │       └── resources
    │   │       │   └── META-INF
    │   │       │       └── services
    │   │       │           └── org.apache.tika.parser.Parser
    │   │   └── test
    │   │       ├── java
    │   │           └── org
    │   │           │   └── tallison
    │   │           │       └── tika
    │   │           │           └── parser
    │   │           │               └── itext
    │   │           │                   └── ITextParserTest.java
    │   │       └── resources
    │   │           └── test-documents
    │   │               └── testPDF.pdf
    ├── mutoolclean
    │   ├── Dockerfile
    │   ├── pom.xml
    │   └── src
    │   │   └── main
    │   │       ├── java
    │   │           └── org
    │   │           │   └── tallison
    │   │           │       └── fileutils
    │   │           │           └── mutool
    │   │           │               └── MutoolClean.java
    │   │       └── resources
    │   │           └── log4j.properties
    ├── mutooltext
    │   ├── Dockerfile
    │   ├── pom.xml
    │   └── src
    │   │   └── main
    │   │       ├── java
    │   │           └── org
    │   │           │   └── tallison
    │   │           │       └── fileutils
    │   │           │           └── mutool
    │   │           │               └── MutoolTextRunner.java
    │   │       └── resources
    │   │           └── log4j.properties
    ├── pdfbytes
    │   ├── Dockerfile
    │   ├── pom.xml
    │   └── src
    │   │   ├── main
    │   │       └── java
    │   │       │   └── org
    │   │       │       └── tallison
    │   │       │           └── pdfutils
    │   │       │               ├── PDFByteSniffer.java
    │   │       │               ├── PDFVersionator.java
    │   │       │               └── StreamSearcher.java
    │   │   └── test
    │   │       ├── java
    │   │           └── org
    │   │           │   └── tallison
    │   │           │       └── pdfutils
    │   │           │           └── TestVersionUnpacker.java
    │   │       └── resources
    │   │           └── pdf-puzzle.pdf
    ├── pdfchecker
    │   ├── Dockerfile
    │   ├── pom.xml
    │   └── src
    │   │   └── main
    │   │       ├── java
    │   │           └── org
    │   │           │   └── tallison
    │   │           │       └── fileutils
    │   │           │           └── pdfchecker
    │   │           │               └── PDFCheckerRunner.java
    │   │       └── resources
    │   │           └── log4j.properties
    ├── pdfcpu
    │   ├── Dockerfile
    │   ├── pom.xml
    │   └── src
    │   │   └── main
    │   │       ├── java
    │   │           └── org
    │   │           │   └── tallison
    │   │           │       └── fileutils
    │   │           │           └── pdfcpu
    │   │           │               └── PDFCPURunner.java
    │   │       └── resources
    │   │           └── log4j.properties
    ├── pdffonts
    │   ├── Dockerfile
    │   ├── pom.xml
    │   └── src
    │   │   └── main
    │   │       ├── java
    │   │           └── org
    │   │           │   └── tallison
    │   │           │       └── fileutils
    │   │           │           └── pdffonts
    │   │           │               └── PDFFontsRunner.java
    │   │       └── resources
    │   │           └── log4j.properties
    ├── pdfid
    │   ├── Dockerfile
    │   ├── pom.xml
    │   └── src
    │   │   └── main
    │   │       ├── java
    │   │           └── org
    │   │           │   └── tallison
    │   │           │       └── fileutils
    │   │           │           └── pdfid
    │   │           │               └── PDFIdRunner.java
    │   │       └── resources
    │   │           └── log4j.properties
    ├── pdfimages
    │   ├── Dockerfile
    │   ├── pom.xml
    │   └── src
    │   │   └── main
    │   │       ├── java
    │   │           └── org
    │   │           │   └── tallison
    │   │           │       └── fileutils
    │   │           │           └── pdfimages
    │   │           │               └── PDFImagesRunner.java
    │   │       └── resources
    │   │           └── log4j.properties
    ├── pdfinfo
    │   ├── Dockerfile
    │   ├── pom.xml
    │   └── src
    │   │   └── main
    │   │       ├── java
    │   │           └── org
    │   │           │   └── tallison
    │   │           │       └── fileutils
    │   │           │           └── pdfinfo
    │   │           │               └── PDFInfo.java
    │   │       └── resources
    │   │           └── log4j.properties
    ├── pdfminerdump
    │   ├── Dockerfile
    │   ├── pom.xml
    │   └── src
    │   │   └── main
    │   │       ├── java
    │   │           └── org
    │   │           │   └── tallison
    │   │           │       └── fileutils
    │   │           │           └── pdfminer
    │   │           │               └── PDFMinerDump.java
    │   │       └── resources
    │   │           └── log4j.properties
    ├── pdfminertext
    │   ├── Dockerfile
    │   ├── pom.xml
    │   └── src
    │   │   └── main
    │   │       ├── java
    │   │           └── org
    │   │           │   └── tallison
    │   │           │       └── fileutils
    │   │           │           └── pdfminer
    │   │           │               └── PDFMinerText.java
    │   │       └── resources
    │   │           └── log4j.properties
    ├── pdfresurrect
    │   ├── Dockerfile
    │   ├── env.properties
    │   ├── pom.xml
    │   └── src
    │   │   └── main
    │   │       ├── java
    │   │           └── org
    │   │           │   └── tallison
    │   │           │       └── fileutils
    │   │           │           └── pdfresurrect
    │   │           │               └── PDFResurrect.java
    │   │       └── resources
    │   │           └── log4j.properties
    ├── pdftoppm
    │   ├── Dockerfile
    │   ├── pom.xml
    │   └── src
    │   │   └── main
    │   │       ├── java
    │   │           └── org
    │   │           │   └── tallison
    │   │           │       └── fileutils
    │   │           │           └── pdftoppm
    │   │           │               └── PDFToPPMRunner.java
    │   │       └── resources
    │   │           └── log4j.properties
    ├── pdftops
    │   ├── Dockerfile
    │   ├── pom.xml
    │   └── src
    │   │   └── main
    │   │       ├── java
    │   │           └── org
    │   │           │   └── tallison
    │   │           │       └── fileutils
    │   │           │           └── pdftops
    │   │           │               └── PDFToPSRunner.java
    │   │       └── resources
    │   │           └── log4j.properties
    ├── pdftotext
    │   ├── Dockerfile
    │   ├── pom.xml
    │   └── src
    │   │   └── main
    │   │       ├── java
    │   │           └── org
    │   │           │   └── tallison
    │   │           │       └── fileutils
    │   │           │           └── pdftotext
    │   │           │               └── PDFToTextRunner.java
    │   │       └── resources
    │   │           └── log4j.properties
    ├── polyfile
    │   ├── Dockerfile
    │   ├── pom.xml
    │   └── src
    │   │   └── main
    │   │       ├── java
    │   │           └── org
    │   │           │   └── tallison
    │   │           │       └── fileutils
    │   │           │           └── polyfile
    │   │           │               ├── PolyFile.java
    │   │           │               └── PolyFilePolyglot.java
    │   │       └── resources
    │   │           └── log4j.properties
    ├── pom.xml
    ├── qpdf
    │   ├── Dockerfile
    │   ├── pom.xml
    │   └── src
    │   │   └── main
    │   │       ├── java
    │   │           └── org
    │   │           │   └── tallison
    │   │           │       └── fileutils
    │   │           │           └── qpdf
    │   │           │               └── QPDFToJson.java
    │   │       └── resources
    │   │           └── log4j.properties
    ├── tika-client
    │   ├── pom.xml
    │   └── src
    │   │   └── main
    │   │       ├── java
    │   │           └── org
    │   │           │   └── tallison
    │   │           │       └── tika
    │   │           │           └── client
    │   │           │               ├── TikaClient.java
    │   │           │               └── TikaLoadTester.java
    │   │       └── resources
    │   │           └── log4j2.xml
    ├── tika
    │   ├── pom.xml
    │   └── src
    │   │   └── main
    │   │       ├── java
    │   │           └── org
    │   │           │   └── tallison
    │   │           │       └── fileutils
    │   │           │           └── tika
    │   │           │               └── TikaBatch.java
    │   │       └── resources
    │   │           └── log4j2.xml
    └── xpdffonts
    │   ├── Dockerfile
    │   ├── pom.xml
    │   ├── src
    │       └── main
    │       │   ├── java
    │       │       └── org
    │       │       │   └── tallison
    │       │       │       └── fileutils
    │       │       │           └── pdffonts
    │       │       │               └── PDFFontsRunner.java
    │       │   └── resources
    │       │       └── log4j.properties
    │   ├── tgzs
    │       ├── xpdf-arabic.tar.gz
    │       ├── xpdf-arabic
    │       │   ├── README
    │       │   └── add-to-xpdfrc
    │       ├── xpdf-chinese-simplified.tar.gz
    │       ├── xpdf-chinese-simplified
    │       │   ├── README
    │       │   └── add-to-xpdfrc
    │       ├── xpdf-chinese-traditional.tar.gz
    │       ├── xpdf-chinese-traditional
    │       │   ├── README
    │       │   └── add-to-xpdfrc
    │       ├── xpdf-cyrillic.tar.gz
    │       ├── xpdf-cyrillic
    │       │   ├── README
    │       │   └── add-to-xpdfrc
    │       ├── xpdf-greek.tar.gz
    │       ├── xpdf-greek
    │       │   ├── README
    │       │   └── add-to-xpdfrc
    │       ├── xpdf-hebrew.tar.gz
    │       ├── xpdf-hebrew
    │       │   ├── README
    │       │   └── add-to-xpdfrc
    │       ├── xpdf-japanese.tar.gz
    │       ├── xpdf-japanese
    │       │   ├── README
    │       │   └── add-to-xpdfrc
    │       ├── xpdf-korean.tar.gz
    │       ├── xpdf-korean
    │       │   ├── README
    │       │   └── add-to-xpdfrc
    │       ├── xpdf-latin2.tar.gz
    │       ├── xpdf-latin2
    │       │   ├── README
    │       │   └── add-to-xpdfrc
    │       ├── xpdf-t1fonts.tar.gz
    │       ├── xpdf-t1fonts
    │       │   ├── COPYING
    │       │   ├── README
    │       │   ├── d050000l.pfb
    │       │   └── s050000l.pfb
    │       ├── xpdf-thai.tar.gz
    │       ├── xpdf-thai
    │       │   ├── README
    │       │   └── add-to-xpdfrc
    │       ├── xpdf-turkish.tar.gz
    │       └── xpdf-turkish
    │       │   ├── README
    │       │   └── add-to-xpdfrc
    │   ├── xpdf
    │       ├── arabic
    │       │   └── ISO-8859-6.unicodeMap
    │       ├── chinese-simplified
    │       │   ├── Adobe-GB1.cidToUnicode
    │       │   ├── CMap
    │       │   │   ├── Adobe-GB1-0
    │       │   │   ├── Adobe-GB1-1
    │       │   │   ├── Adobe-GB1-2
    │       │   │   ├── Adobe-GB1-3
    │       │   │   ├── Adobe-GB1-4
    │       │   │   ├── Adobe-GB1-5
    │       │   │   ├── Adobe-GB1-UCS2
    │       │   │   ├── GB-EUC-H
    │       │   │   ├── GB-EUC-V
    │       │   │   ├── GB-H
    │       │   │   ├── GB-V
    │       │   │   ├── GBK-EUC-H
    │       │   │   ├── GBK-EUC-UCS2
    │       │   │   ├── GBK-EUC-V
    │       │   │   ├── GBK2K-H
    │       │   │   ├── GBK2K-V
    │       │   │   ├── GBKp-EUC-H
    │       │   │   ├── GBKp-EUC-V
    │       │   │   ├── GBT-EUC-H
    │       │   │   ├── GBT-EUC-V
    │       │   │   ├── GBT-H
    │       │   │   ├── GBT-V
    │       │   │   ├── GBTpc-EUC-H
    │       │   │   ├── GBTpc-EUC-V
    │       │   │   ├── GBpc-EUC-H
    │       │   │   ├── GBpc-EUC-UCS2
    │       │   │   ├── GBpc-EUC-UCS2C
    │       │   │   ├── GBpc-EUC-V
    │       │   │   ├── LICENSE.md
    │       │   │   ├── UniGB-UCS2-H
    │       │   │   ├── UniGB-UCS2-V
    │       │   │   ├── UniGB-UTF16-H
    │       │   │   ├── UniGB-UTF16-V
    │       │   │   ├── UniGB-UTF32-H
    │       │   │   ├── UniGB-UTF32-V
    │       │   │   ├── UniGB-UTF8-H
    │       │   │   └── UniGB-UTF8-V
    │       │   ├── EUC-CN.unicodeMap
    │       │   ├── GBK.unicodeMap
    │       │   └── ISO-2022-CN.unicodeMap
    │       ├── chinese-traditional
    │       │   ├── Adobe-CNS1.cidToUnicode
    │       │   ├── Big5.unicodeMap
    │       │   ├── Big5ascii.unicodeMap
    │       │   └── CMap
    │       │   │   ├── Adobe-CNS1-0
    │       │   │   ├── Adobe-CNS1-1
    │       │   │   ├── Adobe-CNS1-2
    │       │   │   ├── Adobe-CNS1-3
    │       │   │   ├── Adobe-CNS1-4
    │       │   │   ├── Adobe-CNS1-5
    │       │   │   ├── Adobe-CNS1-6
    │       │   │   ├── Adobe-CNS1-7
    │       │   │   ├── Adobe-CNS1-UCS2
    │       │   │   ├── B5-H
    │       │   │   ├── B5-V
    │       │   │   ├── B5pc-H
    │       │   │   ├── B5pc-UCS2
    │       │   │   ├── B5pc-UCS2C
    │       │   │   ├── B5pc-V
    │       │   │   ├── CNS-EUC-H
    │       │   │   ├── CNS-EUC-V
    │       │   │   ├── CNS1-H
    │       │   │   ├── CNS1-V
    │       │   │   ├── CNS2-H
    │       │   │   ├── CNS2-V
    │       │   │   ├── ETHK-B5-H
    │       │   │   ├── ETHK-B5-V
    │       │   │   ├── ETen-B5-H
    │       │   │   ├── ETen-B5-UCS2
    │       │   │   ├── ETen-B5-V
    │       │   │   ├── ETenms-B5-H
    │       │   │   ├── ETenms-B5-V
    │       │   │   ├── HKdla-B5-H
    │       │   │   ├── HKdla-B5-V
    │       │   │   ├── HKdlb-B5-H
    │       │   │   ├── HKdlb-B5-V
    │       │   │   ├── HKgccs-B5-H
    │       │   │   ├── HKgccs-B5-V
    │       │   │   ├── HKm314-B5-H
    │       │   │   ├── HKm314-B5-V
    │       │   │   ├── HKm471-B5-H
    │       │   │   ├── HKm471-B5-V
    │       │   │   ├── HKscs-B5-H
    │       │   │   ├── HKscs-B5-V
    │       │   │   ├── LICENSE.md
    │       │   │   ├── UniCNS-UCS2-H
    │       │   │   ├── UniCNS-UCS2-V
    │       │   │   ├── UniCNS-UTF16-H
    │       │   │   ├── UniCNS-UTF16-V
    │       │   │   ├── UniCNS-UTF32-H
    │       │   │   ├── UniCNS-UTF32-V
    │       │   │   ├── UniCNS-UTF8-H
    │       │   │   └── UniCNS-UTF8-V
    │       ├── cyrillic
    │       │   ├── Bulgarian.nameToUnicode
    │       │   └── KOI8-R.unicodeMap
    │       ├── greek
    │       │   ├── Greek.nameToUnicode
    │       │   └── ISO-8859-7.unicodeMap
    │       ├── hebrew
    │       │   ├── ISO-8859-8.unicodeMap
    │       │   └── Windows-1255.unicodeMap
    │       ├── japanese
    │       │   ├── Adobe-Japan1.cidToUnicode
    │       │   ├── CMap
    │       │   │   ├── 78-EUC-H
    │       │   │   ├── 78-EUC-V
    │       │   │   ├── 78-H
    │       │   │   ├── 78-RKSJ-H
    │       │   │   ├── 78-RKSJ-V
    │       │   │   ├── 78-V
    │       │   │   ├── 78ms-RKSJ-H
    │       │   │   ├── 78ms-RKSJ-V
    │       │   │   ├── 83pv-RKSJ-H
    │       │   │   ├── 90ms-RKSJ-H
    │       │   │   ├── 90ms-RKSJ-UCS2
    │       │   │   ├── 90ms-RKSJ-V
    │       │   │   ├── 90msp-RKSJ-H
    │       │   │   ├── 90msp-RKSJ-V
    │       │   │   ├── 90pv-RKSJ-H
    │       │   │   ├── 90pv-RKSJ-UCS2
    │       │   │   ├── 90pv-RKSJ-UCS2C
    │       │   │   ├── 90pv-RKSJ-V
    │       │   │   ├── Add-H
    │       │   │   ├── Add-RKSJ-H
    │       │   │   ├── Add-RKSJ-V
    │       │   │   ├── Add-V
    │       │   │   ├── Adobe-Japan1-0
    │       │   │   ├── Adobe-Japan1-1
    │       │   │   ├── Adobe-Japan1-2
    │       │   │   ├── Adobe-Japan1-3
    │       │   │   ├── Adobe-Japan1-4
    │       │   │   ├── Adobe-Japan1-5
    │       │   │   ├── Adobe-Japan1-6
    │       │   │   ├── Adobe-Japan1-7
    │       │   │   ├── Adobe-Japan1-UCS2
    │       │   │   ├── EUC-H
    │       │   │   ├── EUC-V
    │       │   │   ├── Ext-H
    │       │   │   ├── Ext-RKSJ-H
    │       │   │   ├── Ext-RKSJ-V
    │       │   │   ├── Ext-V
    │       │   │   ├── H
    │       │   │   ├── Hankaku
    │       │   │   ├── Hiragana
    │       │   │   ├── Katakana
    │       │   │   ├── LICENSE.md
    │       │   │   ├── NWP-H
    │       │   │   ├── NWP-V
    │       │   │   ├── RKSJ-H
    │       │   │   ├── RKSJ-V
    │       │   │   ├── Roman
    │       │   │   ├── UniJIS-UCS2-H
    │       │   │   ├── UniJIS-UCS2-HW-H
    │       │   │   ├── UniJIS-UCS2-HW-V
    │       │   │   ├── UniJIS-UCS2-V
    │       │   │   ├── UniJIS-UTF16-H
    │       │   │   ├── UniJIS-UTF16-V
    │       │   │   ├── UniJIS-UTF32-H
    │       │   │   ├── UniJIS-UTF32-V
    │       │   │   ├── UniJIS-UTF8-H
    │       │   │   ├── UniJIS-UTF8-V
    │       │   │   ├── UniJIS2004-UTF16-H
    │       │   │   ├── UniJIS2004-UTF16-V
    │       │   │   ├── UniJIS2004-UTF32-H
    │       │   │   ├── UniJIS2004-UTF32-V
    │       │   │   ├── UniJIS2004-UTF8-H
    │       │   │   ├── UniJIS2004-UTF8-V
    │       │   │   ├── UniJISPro-UCS2-HW-V
    │       │   │   ├── UniJISPro-UCS2-V
    │       │   │   ├── UniJISPro-UTF8-V
    │       │   │   ├── UniJISX0213-UTF32-H
    │       │   │   ├── UniJISX0213-UTF32-V
    │       │   │   ├── UniJISX02132004-UTF32-H
    │       │   │   ├── UniJISX02132004-UTF32-V
    │       │   │   ├── V
    │       │   │   └── WP-Symbol
    │       │   ├── EUC-JP.unicodeMap
    │       │   ├── ISO-2022-JP.unicodeMap
    │       │   └── Shift-JIS.unicodeMap
    │       ├── korean
    │       │   ├── Adobe-KR.cidToUnicode
    │       │   ├── Adobe-Korea1.cidToUnicode
    │       │   ├── CMap
    │       │   │   ├── Adobe-KR-0
    │       │   │   ├── Adobe-KR-1
    │       │   │   ├── Adobe-KR-2
    │       │   │   ├── Adobe-KR-3
    │       │   │   ├── Adobe-KR-4
    │       │   │   ├── Adobe-KR-5
    │       │   │   ├── Adobe-KR-6
    │       │   │   ├── Adobe-KR-7
    │       │   │   ├── Adobe-KR-8
    │       │   │   ├── Adobe-KR-9
    │       │   │   ├── Adobe-Korea1-0
    │       │   │   ├── Adobe-Korea1-1
    │       │   │   ├── Adobe-Korea1-2
    │       │   │   ├── Adobe-Korea1-UCS2
    │       │   │   ├── KSC-EUC-H
    │       │   │   ├── KSC-EUC-V
    │       │   │   ├── KSC-H
    │       │   │   ├── KSC-Johab-H
    │       │   │   ├── KSC-Johab-V
    │       │   │   ├── KSC-V
    │       │   │   ├── KSCms-UHC-H
    │       │   │   ├── KSCms-UHC-HW-H
    │       │   │   ├── KSCms-UHC-HW-V
    │       │   │   ├── KSCms-UHC-UCS2
    │       │   │   ├── KSCms-UHC-V
    │       │   │   ├── KSCpc-EUC-H
    │       │   │   ├── KSCpc-EUC-UCS2
    │       │   │   ├── KSCpc-EUC-UCS2C
    │       │   │   ├── KSCpc-EUC-V
    │       │   │   ├── LICENSE.md
    │       │   │   ├── UniAKR-UTF16-H
    │       │   │   ├── UniAKR-UTF32-H
    │       │   │   ├── UniAKR-UTF8-H
    │       │   │   ├── UniKS-UCS2-H
    │       │   │   ├── UniKS-UCS2-V
    │       │   │   ├── UniKS-UTF16-H
    │       │   │   ├── UniKS-UTF16-V
    │       │   │   ├── UniKS-UTF32-H
    │       │   │   ├── UniKS-UTF32-V
    │       │   │   ├── UniKS-UTF8-H
    │       │   │   └── UniKS-UTF8-V
    │       │   └── ISO-2022-KR.unicodeMap
    │       ├── latin2
    │       │   └── Latin2.unicodeMap
    │       ├── thai
    │       │   ├── TIS-620.unicodeMap
    │       │   └── Thai.nameToUnicode
    │       └── turkish
    │       │   └── ISO-8859-9.unicodeMap
    │   └── xpdfrc
└── utils-general
    ├── pom.xml
    └── src
        ├── main
            └── java
            │   └── org
            │       └── tallison
            │           ├── db
            │               ├── CustomCSVToPG.java
            │               ├── ExtractsToDB.java
            │               ├── FetchFilesFromDBPaths.java
            │               └── PGToCSV.java
            │           ├── digest
            │               ├── CSVLineCounter.java
            │               ├── CompareLists.java
            │               ├── DigestChecker.java
            │               ├── FileListNormalizer.java
            │               ├── RemoveExtras.java
            │               ├── S3Compare.java
            │               ├── S3DigestChecker.java
            │               └── S3ListCompare.java
            │           ├── filter
            │               ├── CopyByMime.java
            │               └── CopyFilterDigest.java
            │           └── pdf
            │               └── utils
            │                   └── PDFSplitter.java
        └── test
            └── java
                └── org
                    └── tallison
                        └── pdf
                            └── utils
                                └── TestPDFSplitter.java


/.gitignore:
--------------------------------------------------------------------------------
1 | .idea/
2 | target/
3 | *.iml
4 | /tool-runners/pdfchecker/pdf-checker.tgz
5 | /tool-runners/arlington
6 | /tool-runners/arlington/grammar/
7 | /tika-containers/tika-pdfchecker/pdf-checker.tgz
8 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # File Observatory
 2 | This repo hosts development code used on the backend to support data ingestion into
 3 | an Elasticsearch index for the [SafeDocs File Observatory app](https://github.com/jpl-safedocs).
 4 | 
 5 | This repo contains pre-ALPHA grade code for demonstration purposes only.
 6 | 
 7 | Some capabilities demonstrated within have been integrated into Apache Tika. 
 8 | Some have been spun off into standalone projects, e.g. [commoncrawl-fetcher-lite](https://github.com/tballison/commoncrawl-fetcher-lite).
 9 | 
10 | # Attribution
11 | The commoncrawl-fetcher module includes code that relies on GeoLite2 data created by MaxMind, available from
12 | [https://www.maxmind.com](https://www.maxmind.com).


--------------------------------------------------------------------------------
/batchlite/src/main/java/org/tallison/batchlite/writer/JSONMetadataWriter.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
 3 |  * contributor license agreements.  See the NOTICE file distributed with
 4 |  * this work for additional information regarding copyright ownership.
 5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
 6 |  * (the "License"); you may not use this file except in compliance with
 7 |  * the License.  You may obtain a copy of the License at
 8 |  *
 9 |  *     http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | package org.tallison.batchlite.writer;
18 | 
19 | import com.google.gson.Gson;
20 | import org.tallison.batchlite.MetadataWriter;
21 | 
22 | import java.io.IOException;
23 | import java.nio.charset.StandardCharsets;
24 | import java.nio.file.Files;
25 | import java.nio.file.Path;
26 | 
27 | public class JSONMetadataWriter extends MetadataWriter {
28 | 
29 |     private final static Gson GSON = new Gson();
30 | 
31 |     private final Path metadataRootDir;
32 | 
33 |     public JSONMetadataWriter(String name,
34 |                               Path metadataRootDir, int stdoutLimit, int stderrLimit) {
35 |         super(name, stdoutLimit, stderrLimit);
36 |         this.metadataRootDir = metadataRootDir;
37 |     }
38 | 
39 |     @Override
40 |     protected void write(PathResultPair pair) throws IOException {
41 |         Path target = metadataRootDir.resolve(pair.getRelPath() + ".json");
42 |         if (! Files.isDirectory(target.getParent())) {
43 |             Files.createDirectories(target.getParent());
44 |         }
45 |         Files.write(target, GSON.toJson(pair.getResult()).getBytes(StandardCharsets.UTF_8));
46 |     }
47 | 
48 |     @Override
49 |     public void close() throws IOException {
50 |         //no-op
51 |     }
52 | }
53 | 


--------------------------------------------------------------------------------
/batchlite/src/main/java/org/tallison/batchlite/writer/MetadataWriterFactory.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
 3 |  * contributor license agreements.  See the NOTICE file distributed with
 4 |  * this work for additional information regarding copyright ownership.
 5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
 6 |  * (the "License"); you may not use this file except in compliance with
 7 |  * the License.  You may obtain a copy of the License at
 8 |  *
 9 |  *     http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | package org.tallison.batchlite.writer;
18 | 
19 | import org.tallison.batchlite.MetadataWriter;
20 | 
21 | import java.io.IOException;
22 | import java.nio.file.Paths;
23 | 
24 | public class MetadataWriterFactory {
25 | 
26 |     public static MetadataWriter build(String name, String writerString,
27 |                                        boolean isDelta,
28 |                                        int maxStdout, int maxStderr) throws IOException {
29 |         if (writerString.startsWith("jdbc:")) {
30 |             return new JDBCMetadataWriter(name, writerString, isDelta, maxStdout, maxStderr);
31 |         } else if (writerString.endsWith(".csv") || writerString.endsWith(".tsv")) {
32 |             return new CSVMetadataWriter(name, Paths.get(writerString), maxStdout, maxStderr);
33 |         } else {
34 |             return new JSONMetadataWriter(name, Paths.get(writerString), maxStdout, maxStderr);
35 |         }
36 |     }
37 | }
38 | 


--------------------------------------------------------------------------------
/batchlite/src/main/java/org/tallison/batchlite/writer/PathResultPair.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
 3 |  * contributor license agreements.  See the NOTICE file distributed with
 4 |  * this work for additional information regarding copyright ownership.
 5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
 6 |  * (the "License"); you may not use this file except in compliance with
 7 |  * the License.  You may obtain a copy of the License at
 8 |  *
 9 |  *     http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | package org.tallison.batchlite.writer;
18 | 
19 | import org.tallison.batchlite.FileProcessResult;
20 | 
21 | import java.nio.file.Path;
22 | 
23 | public class PathResultPair {
24 | 
25 | 
26 | 
27 |     private final String relPath;
28 |     private final FileProcessResult result;
29 | 
30 |     public PathResultPair(String relPath, FileProcessResult result) {
31 |         this.relPath = relPath;
32 |         this.result = result;
33 |     }
34 | 
35 |     public String getRelPath() {
36 |         return relPath;
37 |     }
38 | 
39 |     public FileProcessResult getResult() {
40 |         return result;
41 |     }
42 | 
43 |     @Override
44 |     public String toString() {
45 |         return "PathResultPair{" +
46 |                 "relPath='" + relPath + '\'' +
47 |                 ", result=" + result +
48 |                 '}';
49 |     }
50 | }
51 | 


--------------------------------------------------------------------------------
/batchlite/src/main/java/org/tallison/batchlite/writer/WriterResult.java:
--------------------------------------------------------------------------------
 1 | package org.tallison.batchlite.writer;
 2 | 
 3 | public class WriterResult {
 4 | 
 5 |     private final int recordsWritten;
 6 |     public WriterResult(int recordsWritten) {
 7 |         this.recordsWritten = recordsWritten;
 8 |     }
 9 |     public int getRecordsWritten() {
10 |         return recordsWritten;
11 |     }
12 | }
13 | 


--------------------------------------------------------------------------------
/batchlite/src/main/resources/log4j2.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8" standalone="no" ?>
 2 | 
 3 | <!--
 4 |   Licensed to the Apache Software Foundation (ASF) under one
 5 |   or more contributor license agreements.  See the NOTICE file
 6 |   distributed with this work for additional information
 7 |   regarding copyright ownership.  The ASF licenses this file
 8 |   to you under the Apache License, Version 2.0 (the
 9 |   "License"); you may not use this file except in compliance
10 |   with the License.  You may obtain a copy of the License at
11 | 
12 |     http://www.apache.org/licenses/LICENSE-2.0
13 | 
14 |   Unless required by applicable law or agreed to in writing,
15 |   software distributed under the License is distributed on an
16 |   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17 |   KIND, either express or implied.  See the License for the
18 |   specific language governing permissions and limitations
19 |   under the License.
20 | -->
21 | <Configuration status="WARN">
22 |   <Appenders>
23 |     <Console name="Console" target="SYSTEM_ERR">
24 |       <PatternLayout pattern="%-5p [%t] %d{HH:mm:ss,SSS} %c %m%n"/>
25 |     </Console>
26 |   </Appenders>
27 |   <Loggers>
28 |     <Root level="info">
29 |       <AppenderRef ref="Console"/>
30 |     </Root>
31 |   </Loggers>
32 | </Configuration>


--------------------------------------------------------------------------------
/commoncrawl-fetcher/src/README.txt:
--------------------------------------------------------------------------------
 1 | This is a set of utilities for extracting files from Common Crawl.
 2 | 
 3 | The assumption is that you don't have direct access to S3 and you
 4 | need to pull data.
 5 | 
 6 | Step 1:
 7 |   * Download the 300 index .gz files (this is normally ~1 TB)
 8 | 
 9 | Step 2:
10 |   * Read through the .gz files and index into PostgreSQL those files that
11 |     meet certain criteria (for example, only PDFs)
12 | 
13 | Step 3:
14 |   * Based on the records in the database, request the warc file from AWS for
15 |     each file
16 |   * Extract the literal bytes from that file and index some more data from the warc
17 | 
18 | Step 4:
19 |   * For each file that CC identified as truncated, go back to the original URL and try
20 |     to retrieve the file from there.


--------------------------------------------------------------------------------
/commoncrawl-fetcher/src/main/java/org/tallison/cc/CCIndexReaderCounter.java:
--------------------------------------------------------------------------------
 1 | package org.tallison.cc;
 2 | 
 3 | import java.util.concurrent.atomic.AtomicLong;
 4 | 
 5 | public class CCIndexReaderCounter {
 6 |     AtomicLong recordsRead = new AtomicLong(0);
 7 |     AtomicLong filesExtracted = new AtomicLong(0);
 8 |     AtomicLong truncatedWritten = new AtomicLong(0);
 9 | 
10 |     public AtomicLong getRecordsRead() {
11 |         return recordsRead;
12 |     }
13 | 
14 |     public AtomicLong getFilesExtracted() {
15 |         return filesExtracted;
16 |     }
17 | 
18 |     public AtomicLong getTruncatedWritten() {
19 |         return truncatedWritten;
20 |     }
21 | 
22 |     @Override
23 |     public String toString() {
24 |         return "CCIndexReaderCounter{" +
25 |                 "recordsRead=" + recordsRead +
26 |                 ", filesExtracted=" + filesExtracted +
27 |                 ", truncatedWritten=" + truncatedWritten +
28 |                 '}';
29 |     }
30 | }
31 | 


--------------------------------------------------------------------------------
/commoncrawl-fetcher/src/main/java/org/tallison/cc/index/IndexRecordProcessor.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
 3 |  * contributor license agreements.  See the NOTICE file distributed with
 4 |  * this work for additional information regarding copyright ownership.
 5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
 6 |  * (the "License"); you may not use this file except in compliance with
 7 |  * the License.  You may obtain a copy of the License at
 8 |  *
 9 |  *      http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | 
18 | package org.tallison.cc.index;
19 | 
20 | 
21 | import java.io.IOException;
22 | 
/**
 * Contract for a component that is initialised from command-line style
 * arguments, is handed index records one at a time as JSON strings, and is
 * closed when finished.
 */
public interface IndexRecordProcessor {

    /**
     * Configures the processor.
     *
     * @param args implementation-specific arguments
     * @throws Exception if initialisation fails
     */
    void init(String[] args) throws Exception;

    /**
     * Handles a single record.
     *
     * @param json the record as a JSON string
     * @return implementation-defined flag; NOTE(review): presumably signals
     *         whether processing should continue -- confirm against callers
     * @throws IOException if the record cannot be processed
     */
    boolean process(String json) throws IOException;

    /**
     * Releases whatever the processor holds.
     *
     * @throws IOException if closing fails
     */
    void close() throws IOException;
}
31 | 


--------------------------------------------------------------------------------
/commoncrawl-fetcher/src/main/java/org/tallison/cc/index/RecordFilter.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
 3 |  * contributor license agreements.  See the NOTICE file distributed with
 4 |  * this work for additional information regarding copyright ownership.
 5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
 6 |  * (the "License"); you may not use this file except in compliance with
 7 |  * the License.  You may obtain a copy of the License at
 8 |  *
 9 |  *      http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | package org.tallison.cc.index;
18 | 
19 | public interface RecordFilter {
20 | 
21 |     boolean accept(CCIndexRecord record);
22 | }
23 | 


--------------------------------------------------------------------------------
/commoncrawl-fetcher/src/main/java/org/tallison/util/MapUtil.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
 3 |  * contributor license agreements.  See the NOTICE file distributed with
 4 |  * this work for additional information regarding copyright ownership.
 5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
 6 |  * (the "License"); you may not use this file except in compliance with
 7 |  * the License.  You may obtain a copy of the License at
 8 |  *
 9 |  *      http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | package org.tallison.util;
18 | 
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
25 | 
/**
 * Static helpers for working with {@link Map}s.
 */
public class MapUtil {

    /**
     * Returns a new map with the same entries as {@code map}, iterating in
     * descending order of value. Entries with equal values are ordered by
     * ascending key. The input map is not modified.
     *
     * @param map the entries to sort; keys and values must be non-null
     *            because they are compared with {@code compareTo}
     * @param <K> key type, used as the tie-breaker
     * @param <V> value type, used as the primary sort key
     * @return a {@link LinkedHashMap} holding the entries in sorted order
     */
    public static <K extends Comparable<? super K>,
            V extends Comparable<? super V>> Map<K, V> sortByDescendingValue(Map<K, V> map) {
        // The list is only bulk-filled, sorted and iterated, so an ArrayList
        // (presized by the copy constructor) avoids LinkedList's per-node overhead.
        List<Map.Entry<K, V>> entries = new ArrayList<>(map.entrySet());

        Collections.sort(entries, new Comparator<Map.Entry<K, V>>() {
            @Override
            public int compare(Map.Entry<K, V> o1, Map.Entry<K, V> o2) {
                // Arguments reversed on purpose: larger values sort first.
                int byValue = o2.getValue().compareTo(o1.getValue());
                if (byValue != 0) {
                    return byValue;
                }
                return o1.getKey().compareTo(o2.getKey());
            }
        });

        // LinkedHashMap preserves insertion order, i.e. the sorted order.
        Map<K, V> result = new LinkedHashMap<>();
        for (Map.Entry<K, V> entry : entries) {
            result.put(entry.getKey(), entry.getValue());
        }
        return result;
    }
}
51 | 


--------------------------------------------------------------------------------
/commoncrawl-fetcher/src/main/java/org/tallison/util/ReloadFetchStatusTable.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
 3 |  * contributor license agreements.  See the NOTICE file distributed with
 4 |  * this work for additional information regarding copyright ownership.
 5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
 6 |  * (the "License"); you may not use this file except in compliance with
 7 |  * the License.  You may obtain a copy of the License at
 8 |  *
 9 |  *      http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | package org.tallison.util;
18 | 
19 | import java.sql.Connection;
20 | import java.sql.DriverManager;
21 | import java.sql.Statement;
22 | 
23 | import org.tallison.cc.CCFileFetcher;
24 | 
25 | /**
26 |  * For dev use only.  This loads a new status table for when there are changes
27 |  * to CCFileFetcher.STATUS
28 |  */
29 | public class ReloadFetchStatusTable {
30 | 
31 |     public static void main(String[] args) throws Exception {
32 |         Connection connection = DriverManager.getConnection(args[0]);
33 |         try (Statement st = connection.createStatement()) {
34 |             String sql = "drop table if exists cc_fetch_status";
35 |             st.execute(sql);
36 | 
37 |             sql = "create table cc_fetch_status " + "(id integer primary key, status varchar(64));";
38 |             st.execute(sql);
39 | 
40 | 
41 |             for (CCFileFetcher.FETCH_STATUS status : CCFileFetcher.FETCH_STATUS.values()) {
42 | 
43 |                 sql = "insert into cc_fetch_status values (" + status.ordinal() + ",'" +
44 |                         status.name() + "');";
45 |                 st.execute(sql);
46 |             }
47 |         }
48 |     }
49 | }
50 | 


--------------------------------------------------------------------------------
/commoncrawl-fetcher/src/main/resources/log4j2.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8" standalone="no" ?>
 2 | 
 3 | <!--
 4 |   Licensed to the Apache Software Foundation (ASF) under one
 5 |   or more contributor license agreements.  See the NOTICE file
 6 |   distributed with this work for additional information
 7 |   regarding copyright ownership.  The ASF licenses this file
 8 |   to you under the Apache License, Version 2.0 (the
 9 |   "License"); you may not use this file except in compliance
10 |   with the License.  You may obtain a copy of the License at
11 | 
12 |     http://www.apache.org/licenses/LICENSE-2.0
13 | 
14 |   Unless required by applicable law or agreed to in writing,
15 |   software distributed under the License is distributed on an
16 |   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17 |   KIND, either express or implied.  See the License for the
18 |   specific language governing permissions and limitations
19 |   under the License.
20 | -->
21 | <Configuration status="WARN">
22 |   <Appenders>
23 |     <Console name="Console" target="SYSTEM_ERR">
24 |       <PatternLayout pattern="%-5p [%t] %d{HH:mm:ss,SSS} %c %m%n"/>
25 |     </Console>
26 |   </Appenders>
27 |   <Loggers>
28 |     <Root level="info">
29 |       <AppenderRef ref="Console"/>
30 |     </Root>
31 |   </Loggers>
32 | </Configuration>


--------------------------------------------------------------------------------
/commoncrawl-fetcher/src/main/resources/selectFetchAndFetchStatus.sql:
--------------------------------------------------------------------------------
1 | select f.id, f.fetched_digest, f.fetched_length, s.status
2 | from cc_fetch f
3 | join cc_fetch_status s on f.status_id=s.id


--------------------------------------------------------------------------------
/commoncrawl-fetcher/src/main/resources/selectFilesToFetchFromCC.sql:
--------------------------------------------------------------------------------
 1 | --limited query used to pull enough info back to
 2 | --extract the right files from common crawl's warc files
 3 | select u.id,
 4 | digest as cc_index_digest,
 5 | w.name as warc_file_name,
 6 | warc_offset, warc_length
 7 | from cc_urls u
 8 | join cc_warc_file_name w on u.warc_file_name = w.id
 9 | join cc_truncated t on u.truncated = t.id
10 | left join cc_fetch f on f.id = u.id
11 | where f.id is null and u.status = 200 and length(t.name) = 0
12 | order by w.name, warc_offset


--------------------------------------------------------------------------------
/commoncrawl-fetcher/src/main/resources/selectFilesToFetchPerWarcId.sql:
--------------------------------------------------------------------------------
 1 | select u.id,
 2 | digest as cc_index_digest,
 3 | w.name as warc_file_name,
 4 | warc_offset, warc_length,
 5 | t.name as cc_truncated
 6 | from cc_urls u
 7 | join cc_warc_file_name w on u.warc_file_name = w.id
 8 | join cc_truncated t on u.truncated = t.id
 9 | left join cc_fetch f on f.id = u.id
10 | where f.id is null and u.status = 200
11 | order by w.id, warc_offset


--------------------------------------------------------------------------------
/commoncrawl-fetcher/src/main/resources/selectIndexedAndFetchedData.sql:
--------------------------------------------------------------------------------
 1 | --full query of the useful information gathered
 2 | --from the indices
 3 | select u.id, url,
 4 | digest as cc_index_digest,
 5 | f.fetched_digest,
 6 | u.status as http_status,
 7 | m.name as mime,
 8 | dm.name as detected_mime,
 9 | t.name as truncated,
10 | w.name as warc_file_name,
11 | warc_offset, warc_length,
12 | l.name as languages,
13 | f.fetched_length,
14 | s.status as fetched_status
15 | from cc_urls u
16 | join cc_warc_file_name w on u.warc_file_name = w.id
17 | join cc_mimes m on u.mime = m.id
18 | join cc_detected_mimes dm on u.detected_mime=dm.id
19 | join cc_truncated t on u.truncated = t.id
20 | join cc_languages l on u.languages = l.id
21 | left join cc_fetch f on f.id=u.id
22 | left join cc_fetch_status s on f.status_id=s.id
23 | where u.status = 200 and length(t.name) = 0
24 | order by w.name, warc_offset


--------------------------------------------------------------------------------
/commoncrawl-fetcher/src/main/resources/selectIndexedData.sql:
--------------------------------------------------------------------------------
 1 | --full query of the useful information gathered
 2 | --from the indices
 3 | select u.id, url,
 4 | digest as cc_index_digest,
 5 | status as http_status,
 6 | m.name as mime,
 7 | dm.name as detected_mime,
 8 | t.name as truncated,
 9 | w.name as warc_file_name,
10 | warc_offset, warc_length,
11 | l.name as languages
12 | from cc_urls u
13 | join cc_warc_file_name w on u.warc_file_name = w.id
14 | join cc_mimes m on u.mime = m.id
15 | join cc_detected_mimes dm on u.detected_mime=dm.id
16 | join cc_truncated t on u.truncated = t.id
17 | join cc_languages l on u.languages = l.id
18 | where status = 200 and length(t.name) = 0
19 | order by w.name, warc_offset


--------------------------------------------------------------------------------
/commoncrawl-fetcher/src/main/resources/selectWarcFileIdsToFetchFromCC.sql:
--------------------------------------------------------------------------------
1 | select w.id
2 | from cc_warc_file_name w
3 | order by w.id


--------------------------------------------------------------------------------
/commoncrawl-fetcher/src/test/resources/examples/mpeg-filters.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "status": [200,300,400],
 3 |   "exact" : {
 4 |     "detected_mimes": [
 5 |       "video/mp4",
 6 |       "video/quicktime"
 7 |     ],
 8 |     "case_sensitive" : false
 9 |   }
10 | }


--------------------------------------------------------------------------------
/commoncrawl-fetcher/src/test/resources/examples/tika-config-fetch-fs.xml:
--------------------------------------------------------------------------------
 1 | <properties>
 2 |   <pipesIterator class="org.apache.tika.pipes.pipesiterator.jdbc.JDBCPipesIterator">
 3 |     <params>
 4 |       <idColumn>id</idColumn>
 5 |       <fetchKeyColumn>warc_file_name</fetchKeyColumn>
 6 |       <fetchKeyRangeStartColumn>warc_offset</fetchKeyRangeStartColumn>
 7 |       <fetchKeyRangeEndColumn>warc_end_offset</fetchKeyRangeEndColumn>
 8 |       <fetcherName>hf</fetcherName>
 9 |       <emitterName>fse</emitterName>
10 |       <connection>jdbc:sqlite:/Users/allison/Desktop/demo-backup.db</connection>
11 |       <!--limited query used to pull enough info back to
12 |       extract the right files from common crawl's warc files -->
13 |       <!-- we include the -1 to calculate the length, but the
14 |           warc reader appears to do the right thing at the moment if
15 |           the -1 is not included -->
16 |       <select>
17 |         select u.id as id,
18 |         digest as cc_index_digest,
19 |         'https://data.commoncrawl.org/'||w.name as warc_file_name,
20 |         warc_offset,
21 |         warc_offset + warc_length - 1 as warc_end_offset
22 |         from cc_urls u
23 |         join cc_warc_file_name w on u.warc_file_name = w.id
24 |         join cc_truncated t on u.truncated = t.id
25 |         left join cc_fetch f on f.id = u.id
26 |         where f.id is null and u.status = 200 and length(t.name) = 0
27 |  --       where f.id is null and u.status = 200 and t.name = 'length'
28 |         order by w.name, warc_offset
29 | --        limit 100
30 |       </select>
31 |     </params>
32 |   </pipesIterator>
33 |   <fetchers>
34 |     <fetcher class="org.apache.tika.pipes.fetcher.http.HttpFetcher">
35 |       <params>
36 |         <name>hf</name>
37 |       </params>
38 |     </fetcher>
39 |   </fetchers>
40 |   <emitters>
41 |     <emitter class="org.apache.tika.pipes.emitter.fs.FileSystemEmitter">
42 |       <params>
43 |         <name>fse</name>
44 |         <basePath>/Users/allison/data/cc/docs</basePath>
45 |         <onExists>skip</onExists>
46 |       </params>
47 |     </emitter>
48 |   </emitters>
49 | </properties>


--------------------------------------------------------------------------------
/commoncrawl-fetcher/src/test/resources/examples/tika-config-index-fs.xml:
--------------------------------------------------------------------------------
 1 | <properties>
 2 |   <pipesIterator class="org.apache.tika.pipes.pipesiterator.fs.FileSystemPipesIterator">
 3 |     <params>
 4 |       <fetcherName>fs1</fetcherName>
 5 |       <basePath>/Users/allison/data/cc/CC-MAIN-2022-27</basePath>
 6 |     </params>
 7 |   </pipesIterator>
 8 |   <fetchers>
 9 |     <fetcher class="org.apache.tika.pipes.fetcher.fs.FileSystemFetcher">
10 |       <params>
11 |         <name>fs1</name>
12 |         <basePath>/Users/allison/data/cc/CC-MAIN-2022-27</basePath>
13 |       </params>
14 |     </fetcher>
15 |   </fetchers>
16 | </properties>


--------------------------------------------------------------------------------
/commoncrawl-fetcher/src/test/resources/examples/tika-config-index-s3.xml:
--------------------------------------------------------------------------------
 1 | <properties>
 2 |   <pipesIterator class="org.tallison.cc.pipes.CCIndexPipesIterator">
 3 |     <params>
 4 |       <fetcherName>fs1</fetcherName>
 5 |       <indexPathsUrls>
 6 |         <url>https://data.commoncrawl.org/crawl-data/CC-MAIN-2022-27/cc-index.paths.gz</url>
 7 |         <url>https://data.commoncrawl.org/crawl-data/CC-MAIN-2022-21/cc-index.paths.gz</url>
 8 |         <url>https://data.commoncrawl.org/crawl-data/CC-MAIN-2022-05/cc-index.paths.gz</url>
 9 |         <url>https://data.commoncrawl.org/crawl-data/CC-MAIN-2021-49/cc-index.paths.gz</url>
10 |         <url>https://data.commoncrawl.org/crawl-data/CC-MAIN-2021-43/cc-index.paths.gz</url>
11 |         <url>https://data.commoncrawl.org/crawl-data/CC-MAIN-2021-39/cc-index.paths.gz</url>
12 |         <url>https://data.commoncrawl.org/crawl-data/CC-MAIN-2021-31/cc-index.paths.gz</url>
13 |         <url>https://data.commoncrawl.org/crawl-data/CC-MAIN-2021-25/cc-index.paths.gz</url>
14 |         <url>https://data.commoncrawl.org/crawl-data/CC-MAIN-2021-21/cc-index.paths.gz</url>
15 |         <url>https://data.commoncrawl.org/crawl-data/CC-MAIN-2021-17/cc-index.paths.gz</url>
16 |         <url>https://data.commoncrawl.org/crawl-data/CC-MAIN-2021-10/cc-index.paths.gz</url>
17 |         <url>https://data.commoncrawl.org/crawl-data/CC-MAIN-2021-04/cc-index.paths.gz</url>
18 |       </indexPathsUrls>
19 |     </params>
20 |   </pipesIterator>
21 |   <fetchers>
22 |     <fetcher class="org.apache.tika.pipes.fetcher.s3.S3Fetcher">
23 |       <params>
24 |         <name>fs1</name>
25 |         <bucket>commoncrawl</bucket>
26 |         <credentialsProvider>profile</credentialsProvider>
27 |         <profile>saml-pub</profile>
28 |         <region>us-east-1</region>
29 |       </params>
30 |     </fetcher>
31 |   </fetchers>
32 | </properties>


--------------------------------------------------------------------------------
/commoncrawl-fetcher/src/test/resources/examples/tika-config-refetch-fs.xml:
--------------------------------------------------------------------------------
 1 | <properties>
 2 |   <fetchers>
 3 |     <fetcher class="org.apache.tika.pipes.fetcher.http.HttpFetcher">
 4 |       <params>
 5 |         <name>hf</name>
 6 |         <maxRedirects>10</maxRedirects>
 7 |         <!-- 10gb roughly :D -->
 8 |         <maxSpoolSize>10000000000</maxSpoolSize>
 9 |         <overallTimeout>300000</overallTimeout>
10 |       </params>
11 |     </fetcher>
12 |   </fetchers>
13 |   <emitters>
14 |     <emitter class="org.apache.tika.pipes.emitter.fs.FileSystemEmitter">
15 |       <params>
16 |         <name>fse</name>
17 |         <basePath>/Users/allison/data/cc/docs/CC-MAIN-2022-27</basePath>
18 |         <onExists>skip</onExists>
19 |       </params>
20 |     </emitter>
21 |   </emitters>
22 | </properties>


--------------------------------------------------------------------------------
/commoncrawl-fetcher/src/test/resources/test-documents/mime-filters-av.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "status": [200,300,400],
 3 |   "regex" : {
 4 |     "mimes": [
 5 |       "\\Aaudio",
 6 |       "\\Avideo"
 7 |     ],
 8 |     "detected_mimes": [
 9 |       "\\Aaudio",
10 |       "\\Avideo"
11 |     ]
12 |   }
13 | }


--------------------------------------------------------------------------------
/commoncrawl-fetcher/src/test/resources/test-documents/mime-filters.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "status": [200,300,400],
 3 |   "exact" : {
 4 |     "mimes": [
 5 |       "application/pdf"
 6 |     ],
 7 |     "detected_mimes": [
 8 |       "application/pdf"
 9 |     ],
10 |     "case_sensitive" : false
11 |   },
12 |   "regex" : {
13 |     "mimes": [
14 |       "(?i)pdf\\Z"
15 |     ],
16 |     "detected_mimes": [
17 |       "(?i)pdf\\Z"
18 |     ]
19 |   }
20 | }


--------------------------------------------------------------------------------
/commoncrawl-fetcher/src/test/resources/test-documents/pdf-filter-sample.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "exact" : {
 3 |     "mimes": [
 4 |       {"pattern": "application/pdf", "probability": 0.1}
 5 |     ],
 6 |     "detected_mimes": [
 7 |       {"pattern": "application/pdf", "probability": 0.1}
 8 |     ],
 9 |     "case_sensitive" : false
10 |   },
11 |   "status": 200
12 | }


--------------------------------------------------------------------------------
/commoncrawl-fetcher/src/test/resources/test-documents/pdf-filter.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "exact" : {
 3 |     "mimes": [
 4 |       "application/pdf"
 5 |     ],
 6 |     "detected_mimes": [
 7 |       "application/pdf"
 8 |     ],
 9 |     "case_sensitive" : false
10 |   },
11 |   "defaultInclude": false
12 | }


--------------------------------------------------------------------------------
/commoncrawl-fetcher/src/test/resources/test-documents/status-filter.json:
--------------------------------------------------------------------------------
1 | {
2 |   "status": 200
3 | }


--------------------------------------------------------------------------------
/commoncrawl-fetcher/src/test/resources/test-documents/status-sample-filter.json:
--------------------------------------------------------------------------------
1 | {
2 |   "status": 200,
3 |   "probability": 0.001
4 | }


--------------------------------------------------------------------------------
/ingest/src/main/java/org/tallison/ingest/CompositeFeatureMapper.java:
--------------------------------------------------------------------------------
 1 | package org.tallison.ingest;
 2 | 
 3 | import org.apache.tika.config.ServiceLoader;
 4 | import org.apache.tika.pipes.fetcher.Fetcher;
 5 | import org.tallison.quaerite.core.StoredDocument;
 6 | 
 7 | import java.nio.file.Path;
 8 | import java.sql.ResultSet;
 9 | import java.sql.SQLException;
10 | import java.util.List;
11 | import java.util.Map;
12 | 
13 | public class CompositeFeatureMapper implements FeatureMapper {
14 |     private static final ServiceLoader DEFAULT_LOADER =
15 |             new ServiceLoader(FeatureMapper.class.getClassLoader());
16 | 
17 |     List<FeatureMapper> mappers;
18 | 
19 |     public CompositeFeatureMapper() {
20 |         this(DEFAULT_LOADER.loadServiceProviders(FeatureMapper.class));
21 |     }
22 | 
23 |     public CompositeFeatureMapper(List<FeatureMapper> mappers) {
24 |         this.mappers = mappers;
25 |     }
26 | 
27 |     @Override
28 |     public void addFeatures(Map<String, String> row, Fetcher fetcher,
29 |                             StoredDocument storedDocument) throws SQLException {
30 |         for (FeatureMapper mapper : mappers) {
31 |             mapper.addFeatures(row, fetcher, storedDocument);
32 |         }
33 |     }
34 | }
35 | 


--------------------------------------------------------------------------------
/ingest/src/main/java/org/tallison/ingest/FeatureMapper.java:
--------------------------------------------------------------------------------
 1 | package org.tallison.ingest;
 2 | 
 3 | import org.apache.tika.pipes.fetcher.Fetcher;
 4 | import org.tallison.quaerite.core.StoredDocument;
 5 | 
 6 | import java.nio.file.Path;
 7 | import java.sql.ResultSet;
 8 | import java.sql.SQLException;
 9 | import java.util.Map;
10 | 
11 | public interface FeatureMapper {
12 | 
13 |     public static final String REL_PATH_KEY = "relpath";
14 |     public static final String ID_KEY = "id";
15 |     void addFeatures(Map<String, String> row, Fetcher fetcher, StoredDocument storedDocument) throws SQLException;
16 | }
17 | 


--------------------------------------------------------------------------------
/ingest/src/main/java/org/tallison/ingest/mappers/CPUMapper.java:
--------------------------------------------------------------------------------
 1 | package org.tallison.ingest.mappers;
 2 | 
 3 | import org.apache.tika.pipes.fetcher.Fetcher;
 4 | import org.tallison.ingest.FeatureMapper;
 5 | import org.tallison.quaerite.core.StoredDocument;
 6 | 
 7 | import java.nio.file.Path;
 8 | import java.sql.ResultSet;
 9 | import java.sql.SQLException;
10 | import java.util.Map;
11 | 
12 | /**
13 |  *
14 |  */
15 | public class CPUMapper implements FeatureMapper {
16 | 
17 |     @Override
18 |     public void addFeatures(Map<String, String> row, Fetcher fetcher,
19 |                             StoredDocument storedDocument) throws SQLException {
20 |         String val = row.get("cpu_warn");
21 |         storedDocument.addNonBlankField("cpu_warn", val);
22 | 
23 |     }
24 | }
25 | 


--------------------------------------------------------------------------------
/ingest/src/main/java/org/tallison/ingest/mappers/CaradocMapper.java:
--------------------------------------------------------------------------------
 1 | package org.tallison.ingest.mappers;
 2 | 
 3 | import org.apache.tika.pipes.fetcher.Fetcher;
 4 | import org.tallison.ingest.FeatureMapper;
 5 | import org.tallison.quaerite.core.StoredDocument;
 6 | 
 7 | import java.nio.file.Path;
 8 | import java.sql.ResultSet;
 9 | import java.sql.SQLException;
10 | import java.util.Map;
11 | 
12 | public class CaradocMapper implements FeatureMapper {
13 | 
14 |     @Override
15 |     public void addFeatures(Map<String, String> row, Fetcher fetcher,
16 |                             StoredDocument storedDocument) throws SQLException {
17 |         String val = row.get("cd");
18 |         storedDocument.addNonBlankField("cd", val);
19 |         val = row.get("cd_warn");
20 |         storedDocument.addNonBlankField("cd_warn", val);
21 |     }
22 | }
23 | 


--------------------------------------------------------------------------------
/ingest/src/main/java/org/tallison/ingest/mappers/ClamAVMapper.java:
--------------------------------------------------------------------------------
 1 | package org.tallison.ingest.mappers;
 2 | 
 3 | 
 4 | import org.apache.tika.pipes.fetcher.Fetcher;
 5 | import org.tallison.ingest.FeatureMapper;
 6 | import org.tallison.quaerite.core.StoredDocument;
 7 | 
 8 | import java.nio.file.Path;
 9 | import java.sql.ResultSet;
10 | import java.sql.SQLException;
11 | import java.util.Map;
12 | 
13 | public class ClamAVMapper implements FeatureMapper {
14 | 
15 |     @Override
16 |     public void addFeatures(Map<String, String> row, Fetcher fetcher,
17 |                             StoredDocument storedDocument) throws SQLException {
18 |         String val = row.get("clamav");
19 |         storedDocument.addNonBlankField("clamav", val);
20 |     }
21 | }
22 | 


--------------------------------------------------------------------------------
/ingest/src/main/java/org/tallison/ingest/mappers/ESUtil.java:
--------------------------------------------------------------------------------
 1 | package org.tallison.ingest.mappers;
 2 | 
 3 | public class ESUtil {
 4 |     public static String stripIllegalUnicode(String s) {
 5 |         if (s == null) {
 6 |             return "";
 7 |         }
 8 |         return s.replaceAll("\u0000", "u0000")
 9 |                 .replaceAll("\u001f", "u001f")
10 |                 .replaceAll("\u001e", "u001e")
11 |                 ;
12 |     }
13 | }
14 | 


--------------------------------------------------------------------------------
/ingest/src/main/java/org/tallison/ingest/mappers/MutoolMapper.java:
--------------------------------------------------------------------------------
 1 | package org.tallison.ingest.mappers;
 2 | 
 3 | import org.apache.tika.pipes.fetcher.Fetcher;
 4 | import org.tallison.ingest.FeatureMapper;
 5 | import org.tallison.quaerite.core.StoredDocument;
 6 | 
 7 | import java.nio.file.Path;
 8 | import java.sql.ResultSet;
 9 | import java.sql.SQLException;
10 | import java.util.Map;
11 | 
12 | /**
13 |  * this should cover both mutool clean -s and mutool text
14 |  * we aren't currently indexing text as extrated by mutool text
15 |  */
16 | public class MutoolMapper implements FeatureMapper {
17 | 
18 |     @Override
19 |     public void addFeatures(Map<String, String> row, Fetcher fetcher, StoredDocument storedDocument) throws SQLException {
20 |         String val = row.get("mc_warn");
21 |         storedDocument.addNonBlankField("mc_warn", val);
22 |         val = row.get("mt_warn");
23 |         storedDocument.addNonBlankField("mt_warn", val);
24 | 
25 |     }
26 | }
27 | 


--------------------------------------------------------------------------------
/ingest/src/main/java/org/tallison/ingest/mappers/PDFMinerMapper.java:
--------------------------------------------------------------------------------
 1 | package org.tallison.ingest.mappers;
 2 | 
 3 | import org.apache.tika.pipes.fetcher.Fetcher;
 4 | import org.tallison.ingest.FeatureMapper;
 5 | import org.tallison.quaerite.core.StoredDocument;
 6 | 
 7 | import java.nio.file.Path;
 8 | import java.sql.ResultSet;
 9 | import java.sql.SQLException;
10 | import java.util.Map;
11 | 
12 | /**
13 |  * this should cover both pdfminer dump and pdfminer text
14 |  * we aren't currently indexing anything but the warning msgs
15 |  */
16 | public class PDFMinerMapper implements FeatureMapper {
17 | 
18 |     @Override
19 |     public void addFeatures(Map<String, String> row, Fetcher fetcher, StoredDocument storedDocument) throws SQLException {
20 |         String val = row.get("pmd_warn");
21 |         storedDocument.addNonBlankField("pmd_warn", val);
22 |         val = row.get("pmt_warn");
23 |         storedDocument.addNonBlankField("pmt_warn", val);
24 | 
25 |     }
26 | }
27 | 


--------------------------------------------------------------------------------
/ingest/src/main/java/org/tallison/ingest/mappers/PDFResurrectMapper.java:
--------------------------------------------------------------------------------
 1 | package org.tallison.ingest.mappers;
 2 | 
 3 | import java.sql.SQLException;
 4 | import java.util.Map;
 5 | import java.util.regex.Matcher;
 6 | import java.util.regex.Pattern;
 7 | 
 8 | import org.apache.tika.pipes.fetcher.Fetcher;
 9 | import org.tallison.ingest.FeatureMapper;
10 | import org.tallison.quaerite.core.StoredDocument;
11 | 
12 | public class PDFResurrectMapper  implements FeatureMapper {
13 |     @Override
14 |     public void addFeatures(Map<String, String> row, Fetcher fetcher, StoredDocument storedDocument)
15 |             throws SQLException {
16 |         String stdout = row.get("pr");
17 |         if (stdout == null) {
18 |             return;
19 |         }
20 |         Matcher m = Pattern.compile(": (\\d+)").matcher(stdout);
21 |         if (m.find()) {
22 |             storedDocument.addNonBlankField("pr_updates", m.group(1));
23 |         }
24 |     }
25 | }
26 | 


--------------------------------------------------------------------------------
/ingest/src/main/java/org/tallison/ingest/mappers/ProfileFeatureMapper.java:
--------------------------------------------------------------------------------
 1 | package org.tallison.ingest.mappers;
 2 | 
 3 | import org.apache.tika.pipes.fetcher.Fetcher;
 4 | import org.tallison.ingest.FeatureMapper;
 5 | import org.tallison.quaerite.core.StoredDocument;
 6 | 
 7 | import java.nio.file.Path;
 8 | import java.sql.ResultSet;
 9 | import java.sql.SQLException;
10 | import java.util.ArrayList;
11 | import java.util.Collections;
12 | import java.util.HashMap;
13 | import java.util.List;
14 | import java.util.Map;
15 | import java.util.regex.Matcher;
16 | import java.util.regex.Pattern;
17 | 
18 | import static org.tallison.ingest.mappers.QPDFFeatureMapper.joinWith;
19 | 
20 | public class ProfileFeatureMapper implements FeatureMapper {
21 |     @Override
22 |     public void addFeatures(Map<String, String> row, Fetcher fetcher, StoredDocument storedDocument) throws SQLException {
23 | 
24 |         storedDocument.addNonBlankField("fname", row.get("fname"));
25 |         storedDocument.addNonBlankField("original_fname", row.get("fname"));
26 |         storedDocument.addNonBlankField("shasum_256", row.get("shasum_256"));
27 |         storedDocument.addNonBlankField("size", row.get("size"));
28 |         storedDocument.addNonBlankField("collection", row.get("collection"));
29 |         //these are all commoncrawl/web crawl specific... factor into another mapper?
30 |         storedDocument.addNonBlankField("host_location", row.get("host_location"));
31 |         storedDocument.addNonBlankField("country", row.get("country"));
32 |         storedDocument.addNonBlankField("tld", row.get("tld"));
33 |         storedDocument.addNonBlankField("detected_mime", row.get("detected_mime"));
34 |         storedDocument.addNonBlankField("url", row.get("url"));
35 |     }
36 | }
37 | 


--------------------------------------------------------------------------------
/ingest/src/main/java/org/tallison/ingest/mappers/UniverseMapper.java:
--------------------------------------------------------------------------------
 1 | package org.tallison.ingest.mappers;
 2 | 
 3 | import java.sql.SQLException;
 4 | import java.util.Map;
 5 | 
 6 | import org.tallison.ingest.FeatureMapper;
 7 | import org.tallison.quaerite.core.StoredDocument;
 8 | 
 9 | import org.apache.tika.pipes.fetcher.Fetcher;
10 | 
11 | public class UniverseMapper implements FeatureMapper {
12 |     @Override
13 |     public void addFeatures(Map<String, String> row, Fetcher fetcher, StoredDocument storedDocument)
14 |             throws SQLException {
15 |         storedDocument.addNonBlankField("universe", row.get("universe"));
16 |         storedDocument.addNonBlankField("universe_validity",
17 |                 row.get("universe_validity"));
18 |     }
19 | }
20 | 


--------------------------------------------------------------------------------
/ingest/src/main/java/org/tallison/ingest/qpdf/QPDFResults.java:
--------------------------------------------------------------------------------
 1 | package org.tallison.ingest.qpdf;
 2 | 
 3 | import java.util.HashSet;
 4 | import java.util.Set;
 5 | 
 6 | public class QPDFResults {
 7 | 
 8 |     public Set<String> keys = new HashSet<>();
 9 |     public Set<String> parentAndKeys = new HashSet<>();
10 |     public Set<String> typeKeys = new HashSet<>();
11 |     public Set<String> keyValues = new HashSet<>();
12 |     public Set<String> filters = new HashSet<>();
13 |     public int maxFilterCount = 0;
14 | 
15 |     @Override
16 |     public String toString() {
17 |         return "QPDFResults{" + "keys=" + keys + ", parentAndKeys=" + parentAndKeys +
18 |                 ", typeKeys=" + typeKeys + ", keyValues=" + keyValues + ", filters=" + filters +
19 |                 ", maxFilterCount=" + maxFilterCount + '}';
20 |     }
21 | }
22 | 


--------------------------------------------------------------------------------
/ingest/src/main/java/org/tallison/ingest/qpdf10/qpdf/QPDFResults.java:
--------------------------------------------------------------------------------
 1 | package org.tallison.ingest.qpdf10.qpdf;
 2 | 
 3 | import java.util.HashSet;
 4 | import java.util.Set;
 5 | 
 6 | public class QPDFResults {
 7 | 
 8 |     public Set<String> keys = new HashSet<>();
 9 |     public Set<String> parentAndKeys = new HashSet<>();
10 |     public Set<String> typeKeys = new HashSet<>();
11 |     public Set<String> keyValues = new HashSet<>();
12 |     public Set<String> filters = new HashSet<>();
13 |     public int maxFilterCount = 0;
14 | 
15 |     @Override
16 |     public String toString() {
17 |         return "QPDFResults{" + "keys=" + keys + ", parentAndKeys=" + parentAndKeys +
18 |                 ", typeKeys=" + typeKeys + ", keyValues=" + keyValues + ", filters=" + filters +
19 |                 ", maxFilterCount=" + maxFilterCount + '}';
20 |     }
21 | }
22 | 


--------------------------------------------------------------------------------
/ingest/src/main/resources/META-INF/services/org.tallison.ingest.FeatureMapper:
--------------------------------------------------------------------------------
 1 | #org.tallison.ingest.mappers.ArlingtonMapper
 2 | #org.tallison.ingest.mappers.CaradocMapper
 3 | #org.tallison.ingest.mappers.ClamAVMapper
 4 | #org.tallison.ingest.mappers.CPUMapper
 5 | #org.tallison.ingest.mappers.MutoolMapper
 6 | #org.tallison.ingest.mappers.PDFBytesMapper
 7 | #org.tallison.ingest.mappers.PDFCheckerMapper
 8 | org.tallison.ingest.mappers.PDFInfoFeatureMapper
 9 | #org.tallison.ingest.mappers.PDFMinerMapper
10 | org.tallison.ingest.mappers.ProfileFeatureMapper
11 | org.tallison.ingest.mappers.QPDFFeatureMapper
12 | org.tallison.ingest.mappers.StatusFeatureMapper
13 | #org.tallison.ingest.mappers.TikaFeatureMapper
14 | #org.tallison.ingest.mappers.MultiCompareMapper
15 | #org.tallison.ingest.mappers.PDFResurrectMapper
16 | #org.tallison.ingest.mappers.PDFFontsMapper
17 | #org.tallison.ingest.mappers.XPDFFontsMapper
18 | #org.tallison.ingest.mappers.UniverseMapper


--------------------------------------------------------------------------------
/ingest/src/main/resources/important-int-keys.txt:
--------------------------------------------------------------------------------
 1 | /BitsPerComponent
 2 | /BitsPerCoordinate
 3 | /BitsPerSample
 4 | /ca
 5 | /CA
 6 | /Colors
 7 | /ColorTransform
 8 | /Count
 9 | /Descent
10 | /EarlyChange
11 | /F
12 | /Ff
13 | /FL
14 | /FontWeight
15 | /FormType
16 | /FunctionType
17 | /Gamma
18 | /HalftoneType
19 | /I
20 | /LC
21 | /Length
22 | /LJ
23 | /LW
24 | /M
25 | /ML
26 | /N
27 | /O
28 | /OPM
29 | /Order
30 | /P
31 | /PaintType
32 | /PatternType
33 | /Penalty
34 | /Position
35 | /Predictor
36 | /Q
37 | /R
38 | /Rotate
39 | /RT
40 | /S
41 | /ShadingType
42 | /SM
43 | /SMaskInData
44 | /St
45 | /TilingType
46 | /TP
47 | /UserUnit
48 | /V
49 | /Version
50 | /VerticesPerRow
51 | /Volume
52 | /W
53 | /WMode


--------------------------------------------------------------------------------
/ingest/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Root logger option
2 | log4j.rootLogger=INFO, stdout
3 | 
4 | # Direct log messages to stdout
5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
6 | log4j.appender.stdout.Target=System.out
7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %t %-5p %c{1}:%L - %m%n
9 | 


--------------------------------------------------------------------------------
/ingest/src/main/resources/selectStar-minimal.sql:
--------------------------------------------------------------------------------
 1 | select u.id,
 2 | u.url as url,
 3 | 's3://safedocs-cc-202109/'||p.path as fname,
 4 | p.path as relpath,
 5 | fetched_digest as shasum_256,
 6 | 'CC-MAIN-2021-31' as collection,
 7 | fetched_length as size,
 8 | case
 9 | 	when latitude is null
10 | 	then ''
11 | 	else latitude||','||longitude
12 | end as host_location,
13 | h.tld, h.country,
14 | pinfo.stderr pinfo_stderr,
15 | pinfo.stdout pinfo_stdout,
16 | pinfo.exit_value pinfo_exit,
17 | case
18 |     when pinfo.stderr like 'Command Line Error: Incorrect password%' then 'encrypted'
19 |     when pinfo.path is null then 'missing'
20 | 	when pinfo.timeout=true then 'timeout'
21 | 	when pinfo.exit_value <> 0 then 'crash'
22 | 	when length(pinfo.stderr) > 5 then 'warn'
23 | 	else 'success'
24 | end as pinfo_status,
25 | q.stderr q_stderr,
26 | q.exit_value q_exit,
27 | case
28 |     when q.path is null then 'missing'
29 | 	when q.timeout=true then 'timeout'
30 | 	when q.exit_value <> 0 then 'crash'
31 | 	when length(q.stderr) > 5 then 'warn'
32 | 	else 'success'
33 | end as q_status
34 | from profiles p
35 | join cc_fetch f on p.path = f.path
36 | join cc_fetch_status s on f.status_id=s.id
37 | join cc_urls u on f.id=u.id
38 | join cc_hosts h on u.host=h.id
39 | join pdfinfo pinfo on pinfo.path=p.path
40 | join qpdf q on q.path = p.path
41 | order by u.id


--------------------------------------------------------------------------------
/ingest/src/main/resources/selectStar-sample.sql:
--------------------------------------------------------------------------------
 1 | select u.id as id,
 2 | 'CC-MAIN-2021-31-sample' as collection,
 3 | case
 4 |     when m.name is null or length(m.name) = 0
 5 |         then 'UNKNOWN'
 6 |     else m.name
 7 | end as detected_mime,
 8 | case
 9 |    when latitude is not null
10 | 	then latitude||','||longitude
11 | 	else ''
12 | end as host_location,
13 | h.tld,
14 | case
15 | 	when h.country is not null
16 | 	then h.country
17 | 	else 'UNKNOWN'
18 | end as country
19 | from sample.cc_urls u
20 | join sample.cc_hosts h on u.host=h.id
21 | join sample.cc_detected_mimes m on u.detected_mime=m.id
22 | 


--------------------------------------------------------------------------------
/ingest/src/test/java/org/tallison/ingest/mappers/ArlingtonMapperTest.java:
--------------------------------------------------------------------------------
 1 | package org.tallison.ingest.mappers;
 2 | 
 3 | import org.junit.Test;
 4 | import org.tallison.quaerite.core.StoredDocument;
 5 | 
 6 | import java.util.List;
 7 | 
 8 | import static org.junit.Assert.assertEquals;
 9 | import static org.junit.Assert.assertTrue;
10 | 
11 | public class ArlingtonMapperTest extends MapperTest {
12 | 
13 |     @Test
14 |     public void testBasic() throws Exception {
15 |         ArlingtonMapper mapper = new ArlingtonMapper();
16 |         StoredDocument sd = new StoredDocument("");
17 |         mapper._processFile(getPath("arlington/GHOSTSCRIPT-687647-0.pdf.txt"), sd);
18 |         assertEquals("Can't select any link", sd.getFields().get("a_warn"));
19 |     }
20 | 
21 |     @Test
22 |     public void testFailedToOpen() throws Exception {
23 |         ArlingtonMapper mapper = new ArlingtonMapper();
24 |         StoredDocument sd = new StoredDocument("");
25 |         mapper._processFile(getPath("arlington/GHOSTSCRIPT-688076-1.pdf.txt"), sd);
26 |         assertEquals("fail", sd.getFields().get("a_status"));
27 |     }
28 | 
29 |     @Test
30 |     public void testDiffContexts() throws Exception {
31 |         //GHOSTSCRIPT-687499-0.pdf.txt
32 |         ArlingtonMapper mapper = new ArlingtonMapper();
33 |         StoredDocument sd = new StoredDocument("");
34 |         mapper._processFile(getPath("arlington/GHOSTSCRIPT-687499-0.pdf.txt"), sd);
35 |         boolean success = false;
36 |         for (String s : (List<String>)sd.getFields().get("a_warn")) {
37 |             if (s.equals("object validated in two different contexts")) {
38 |                 success = true;
39 |             }
40 |         }
41 |         assertTrue(success);
42 |     }
43 | }
44 | 


--------------------------------------------------------------------------------
/ingest/src/test/java/org/tallison/ingest/mappers/MapperTest.java:
--------------------------------------------------------------------------------
 1 | package org.tallison.ingest.mappers;
 2 | 
 3 | import java.io.IOException;
 4 | import java.io.InputStream;
 5 | import java.net.URISyntaxException;
 6 | 
 7 | import java.nio.file.Files;
 8 | import java.nio.file.Paths;
 9 | import java.time.Instant;
10 | import java.time.LocalDateTime;
11 | import java.time.ZoneId;
12 | import java.time.format.DateTimeFormatter;
13 | import java.util.Locale;
14 | 
15 | import org.junit.Test;
16 | 
17 | public class MapperTest {
18 | 
19 |     InputStream getPath(String relPath) throws IOException {
20 |         try {
21 |             String path = "/test-documents/"+relPath;
22 |             return Files.newInputStream(Paths.get(this.getClass().getResource(path).toURI()));
23 |         } catch (URISyntaxException e) {
24 |             throw new IOException(e);
25 |         }
26 |     }
27 | 
28 |     @Test
29 |     public void testDateParsing() throws Exception {
30 |         String v = "Mon Apr  1 22:12:30 2013 UTC";
31 |         v = v.replaceAll("\\s+", " ").trim();
32 |         Instant instant = LocalDateTime.parse(v,
33 |                         DateTimeFormatter.ofPattern( "EEE MMM d HH:mm:ss yyyy z",
34 |                                 Locale.US )
35 |                 )
36 |                 .atZone(ZoneId.of("UTC")).toInstant();
37 |         System.out.println(instant);
38 | 
39 |     }
40 | }
41 | 


--------------------------------------------------------------------------------
/ingest/src/test/java/org/tallison/ingest/mappers/PDFCheckerMapperTest.java:
--------------------------------------------------------------------------------
 1 | package org.tallison.ingest.mappers;
 2 | 
 3 | import org.apache.tika.io.TikaInputStream;
 4 | import org.junit.Test;
 5 | import org.tallison.ingest.mappers.PDFCheckerMapper;
 6 | import org.tallison.quaerite.core.StoredDocument;
 7 | 
 8 | import java.io.InputStream;
 9 | import java.nio.file.Path;
10 | import java.nio.file.Paths;
11 | 
12 | import static org.junit.Assert.assertTrue;
13 | 
14 | public class PDFCheckerMapperTest {
15 | 
16 |     @Test
17 |     public void testBasic() throws Exception {
18 |         PDFCheckerMapper mapper = new PDFCheckerMapper();
19 |         Path p = Paths.get(
20 |                 PDFCheckerMapperTest.class.getResource(
21 |                         "/test-documents/pdfchecker/GHOSTSCRIPT-696838-0.zip-0.pdf.json").toURI());
22 |         StoredDocument sd = new StoredDocument("id");
23 |         try (InputStream is = TikaInputStream.get(p)) {
24 |             mapper.processJson(is, sd);
25 |         }
26 |         String summaryInfo = sd.getFields().get("pc_summary_info").toString();
27 |         assertTrue(summaryInfo.contains("can-be-optimized"));
28 |         assertTrue(summaryInfo.contains("born-digital"));
29 |     }
30 | 
31 |     @Test
32 |     public void testFonts() throws Exception {
33 |         PDFCheckerMapper mapper = new PDFCheckerMapper();
34 |         Path p = Paths.get(
35 |                 PDFCheckerMapperTest.class.getResource(
36 |                         "/test-documents/pdfchecker/fonts-PDFBOX-1002-2.pdf.json").toURI());
37 |         StoredDocument sd = new StoredDocument("id");
38 |         try (InputStream is = TikaInputStream.get(p)) {
39 |             mapper.processJson(is, sd);
40 |         }
41 |         String summaryInfo = sd.getFields().get("pc_summary_info").toString();
42 |         assertTrue(summaryInfo.contains("can-be-optimized"));
43 |         assertTrue(summaryInfo.contains("born-digital"));
44 |     }
45 | 
46 | }
47 | 


--------------------------------------------------------------------------------
/ingest/src/test/java/org/tallison/ingest/mappers/PDFFontsMapperTest.java:
--------------------------------------------------------------------------------
 1 | package org.tallison.ingest.mappers;
 2 | 
 3 | 
 4 | import java.nio.charset.StandardCharsets;
 5 | import java.util.HashMap;
 6 | import java.util.Map;
 7 | 
 8 | import org.apache.commons.io.IOUtils;
 9 | import org.junit.Test;
10 | import org.tallison.quaerite.core.StoredDocument;
11 | 
12 | public class PDFFontsMapperTest extends MapperTest {
13 | 
14 |     @Test
15 |     public void testBasic() throws Exception {
16 |         String stdout = IOUtils.toString(
17 |                 getPath("pdffonts/test-basic.txt"), StandardCharsets.UTF_8);
18 | 
19 |         PDFFontsMapper mapper = new PDFFontsMapper();
20 |         StoredDocument sd = new StoredDocument("id");
21 |         Map<String, String> row = new HashMap<>();
22 |         row.put("pdffonts_stdout", stdout);
23 |         mapper.addFeatures(row, null, sd);
24 |         System.out.println(sd);
25 |     }
26 | }
27 | 


--------------------------------------------------------------------------------
/ingest/src/test/java/org/tallison/ingest/mappers/QPDFJsonExtractorTest.java:
--------------------------------------------------------------------------------
 1 | package org.tallison.ingest.mappers;
 2 | 
 3 | import static org.junit.Assert.assertTrue;
 4 | 
 5 | import java.io.IOException;
 6 | import java.io.Reader;
 7 | import java.net.URISyntaxException;
 8 | import java.nio.charset.StandardCharsets;
 9 | import java.nio.file.Files;
10 | import java.nio.file.Path;
11 | import java.nio.file.Paths;
12 | 
13 | import org.junit.Test;
14 | import org.tallison.ingest.qpdf.QPDFJsonExtractor;
15 | import org.tallison.ingest.qpdf.QPDFResults;
16 | 
17 | //these are tests for qpdf 11.x json v2
18 | public class QPDFJsonExtractorTest {
19 | 
20 |     @Test
21 |     public void testBasic() throws Exception {
22 |         try (Reader reader = getReader("/qpdfv11/qpdf.json")) {
23 |             QPDFJsonExtractor ex = new QPDFJsonExtractor();
24 |             QPDFResults results = ex.extract("id", reader);
25 |             System.out.println(results);
26 |             assertTrue(results.keyValues.contains("/Creator->Microsoft® Office Word 2007"));
27 |             assertTrue(results.keyValues.contains(("/CreationDate->DATE")));
28 |         }
29 |     }
30 | 
31 |     private Reader getReader(String file) throws IOException {
32 |         return Files.newBufferedReader(getPath(file), StandardCharsets.UTF_8);
33 |     }
34 | 
35 |     private Path getPath(String file) throws IOException {
36 |         try {
37 |             return Paths.get(this.getClass().getResource("/test-documents/"+file).toURI());
38 |         } catch (URISyntaxException e) {
39 |             throw new IOException(e);
40 |         }
41 |     }
42 | }
43 | 


--------------------------------------------------------------------------------
/ingest/src/test/java/org/tallison/ingest/mappers/XPDFFontsMapperTest.java:
--------------------------------------------------------------------------------
 1 | package org.tallison.ingest.mappers;
 2 | 
 3 | 
 4 | import java.nio.charset.StandardCharsets;
 5 | import java.util.HashMap;
 6 | import java.util.Map;
 7 | 
 8 | import org.apache.commons.io.IOUtils;
 9 | import org.junit.Test;
10 | import org.tallison.quaerite.core.StoredDocument;
11 | 
12 | public class XPDFFontsMapperTest extends MapperTest {
13 | 
14 |     @Test
15 |     public void testBasic() throws Exception {
16 |         String stdout = IOUtils.toString(
17 |                 getPath("xpdffonts/test-basic.txt"), StandardCharsets.UTF_8);
18 | 
19 |         XPDFFontsMapper mapper = new XPDFFontsMapper();
20 |         StoredDocument sd = new StoredDocument("id");
21 |         Map<String, String> row = new HashMap<>();
22 |         row.put("xpdffonts_stdout", stdout);
23 |         mapper.addFeatures(row, null, sd);
24 |         System.out.println(sd);
25 |     }
26 | }
27 | 


--------------------------------------------------------------------------------
/ingest/src/test/resources/test-documents/arlington/GHOSTSCRIPT-687647-0.pdf.txt:
--------------------------------------------------------------------------------
 1 | BEGIN - TestGrammar v0.4 built Dec 17 2020 22:39:13 - "/input/bugtrackers/GHOSTSCRIPT/GHOSTSCRIPT-687647-0.pdf" - PDFix v6.1.0
 2 | Trailer
 3 |   Trailer->Root
 4 |   Trailer->Info
 5 |     Trailer->Root->Pages
 6 |     Trailer->Root->Outlines
 7 |       Trailer->Root->Pages->Kids
 8 | Error: Can't select any link from [fn:SinceVersion(1.0,PageTreeNode),fn:SinceVersion(1.0,PageObject)] to validate provided object: [0] for object 4
 9 | END
10 | 


--------------------------------------------------------------------------------
/ingest/src/test/resources/test-documents/arlington/GHOSTSCRIPT-688076-1.pdf.txt:
--------------------------------------------------------------------------------
1 | BEGIN - TestGrammar v0.4 built Dec 17 2020 22:39:13 - "/input/bugtrackers/GHOSTSCRIPT/GHOSTSCRIPT-688076-1.pdf" - PDFix v6.1.0
2 | Error: Failed to open: "/input/bugtrackers/GHOSTSCRIPT/GHOSTSCRIPT-688076-1.pdf" - PDFix GetError(): Failed to open document.
3 | END
4 | 


--------------------------------------------------------------------------------
/simple-ingester/pom.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
 3 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
 4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
 5 |   <parent>
 6 |     <artifactId>file-observatory</artifactId>
 7 |     <groupId>org.tallison</groupId>
 8 |     <version>1.0.0-SNAPSHOT</version>
 9 |   </parent>
10 |   <modelVersion>4.0.0</modelVersion>
11 | 
12 |   <artifactId>simple-ingester</artifactId>
13 | 
14 |   <properties>
15 |     <maven.compiler.source>11</maven.compiler.source>
16 |     <maven.compiler.target>11</maven.compiler.target>
17 |   </properties>
18 | 
19 |   <dependencies>
20 |     <dependency>
21 |       <groupId>org.apache.tika</groupId>
22 |       <artifactId>tika-core</artifactId>
23 |     </dependency>
24 |     <dependency>
25 |       <groupId>org.apache.tika</groupId>
26 |       <artifactId>tika-serialization</artifactId>
27 |       <version>${tika.version}</version>
28 |     </dependency>
29 |     <dependency>
30 |       <groupId>org.apache.httpcomponents</groupId>
31 |       <artifactId>httpclient</artifactId>
32 |       <version>4.5.13</version>
33 |     </dependency>
34 |   </dependencies>
35 | 
36 | </project>


--------------------------------------------------------------------------------
/simple-ingester/src/main/java/org/tallison/ingester/IngesterCLI.java:
--------------------------------------------------------------------------------
1 | package org.tallison.ingester;
2 | 
/**
 * Command-line entry point for the simple-ingester module.
 * Currently a placeholder: {@link #main(String[])} has an empty body.
 */
public class IngesterCLI {

    /**
     * Does nothing yet.
     *
     * @param args command-line arguments; currently ignored
     */
    public static void main(String[] args) {

    }
}
9 | 


--------------------------------------------------------------------------------
/simple-ingester/src/main/java/org/tallison/tika/parser/ConcatenatingParser.java:
--------------------------------------------------------------------------------
 1 | package org.tallison.tika.parser;
 2 | 
 3 | import java.util.ArrayList;
 4 | import java.util.List;
 5 | 
 6 | import org.apache.tika.metadata.Metadata;
 7 | import org.apache.tika.pipes.FetchEmitTuple;
 8 | 
 9 | public class ConcatenatingParser {
10 | 
11 |     private List<TikaServerClient> parsers = new ArrayList<>();
12 | 
13 |     public List<Metadata> parse(FetchEmitTuple tuple) {
14 |         List<Metadata> results = new ArrayList<>();
15 | 
16 |         return results;
17 |     }
18 | }
19 | 


--------------------------------------------------------------------------------
/tika-addons/pom.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
 3 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
 4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
 5 |   <parent>
 6 |     <artifactId>file-observatory</artifactId>
 7 |     <groupId>org.tallison</groupId>
 8 |     <version>1.0.0-SNAPSHOT</version>
 9 |   </parent>
10 |   <modelVersion>4.0.0</modelVersion>
11 | 
12 |   <artifactId>tika-addons</artifactId>
13 |   <packaging>pom</packaging>
14 |   <modules>
15 |     <module>tika-pipes-reporter</module>
16 |     <module>tika-eval-multicomparer</module>
17 |     <module>tika-server-fuzzer</module>
18 |   </modules>
19 | 
20 |   <properties>
21 |     <maven.compiler.source>11</maven.compiler.source>
22 |     <maven.compiler.target>11</maven.compiler.target>
23 |   </properties>
24 | 
25 | </project>


--------------------------------------------------------------------------------
/tika-addons/tika-eval-multicomparer/src/main/java/org/tallison/tika/eval/multi/ListGenerator.java:
--------------------------------------------------------------------------------
 1 | package org.tallison.tika.eval.multi;
 2 | 
 3 | import java.io.File;
 4 | import java.util.HashSet;
 5 | import java.util.Set;
 6 | 
 7 | public class ListGenerator {
 8 | 
 9 |     public static void main(String[] args) throws Exception {
10 |         Set<String> seen = new HashSet<>();
11 |         File tools = new File(".../data/extracts");
12 |         for (File tool : tools.listFiles()) {
13 |             for (File c : tool.listFiles()) {
14 |                 for (File e : c.listFiles()) {
15 |                     String n = e.getName().replaceAll(".json", "").replaceAll(".txt", "");
16 |                     if (! n.startsWith("._")) {
17 |                         seen.add(n);
18 |                     }
19 |                 }
20 |             }
21 |         }
22 |         for (String n : seen) {
23 |             System.out.println(n);
24 |         }
25 |     }
26 | }
27 | 


--------------------------------------------------------------------------------
/tika-addons/tika-pipes-reporter/pom.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
 3 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
 4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
 5 |   <parent>
 6 |     <artifactId>tika-addons</artifactId>
 7 |     <groupId>org.tallison</groupId>
 8 |     <version>1.0.0-SNAPSHOT</version>
 9 |   </parent>
10 |   <modelVersion>4.0.0</modelVersion>
11 | 
12 |   <artifactId>tika-pipes-reporter</artifactId>
13 | 
14 |   <properties>
15 |     <maven.compiler.source>11</maven.compiler.source>
16 |     <maven.compiler.target>11</maven.compiler.target>
17 |   </properties>
18 | 
19 |   <dependencies>
20 |     <dependency>
21 |       <groupId>org.postgresql</groupId>
22 |       <artifactId>postgresql</artifactId>
23 |     </dependency>
24 |     <dependency>
25 |       <groupId>org.apache.tika</groupId>
26 |       <artifactId>tika-core</artifactId>
27 |       <scope>provided</scope>
28 |     </dependency>
29 |   </dependencies>
30 | 
  <!-- Builds a shaded jar for this module during the package phase. -->
  <build>
    <plugins>
      <plugin>
        <artifactId>maven-shade-plugin</artifactId>
        <version>${maven.shade.version}</version>
        <executions>
          <execution>
            <phase>package</phase>
            <goals>
              <goal>shade</goal>
            </goals>
            <configuration>
              <!-- Do not write a dependency-reduced-pom.xml next to the pom. -->
              <createDependencyReducedPom>
                false
              </createDependencyReducedPom>
              <!-- NOTE(review): this filter matches every artifact but declares no
                   includes or excludes; confirm whether exclusions were intended. -->
              <filters>
                <filter>
                  <artifact>*:*</artifact>
                </filter>
              </filters>
              <!-- No resource transformers are configured. -->
              <transformers>
              </transformers>
            </configuration>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>
59 | 
60 | </project>


--------------------------------------------------------------------------------
/tika-addons/tika-server-fuzzer/pom.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
 3 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
 4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
 5 |   <parent>
 6 |     <artifactId>tika-addons</artifactId>
 7 |     <groupId>org.tallison</groupId>
 8 |     <version>1.0.0-SNAPSHOT</version>
 9 |   </parent>
10 |   <modelVersion>4.0.0</modelVersion>
11 | 
12 |   <artifactId>tika-server-fuzzer</artifactId>
13 | 
14 |   <properties>
15 |     <maven.compiler.source>14</maven.compiler.source>
16 |     <maven.compiler.target>14</maven.compiler.target>
17 |   </properties>
18 | 
19 |   <dependencies>
20 |     <dependency>
21 |       <groupId>org.apache.tika</groupId>
22 |       <artifactId>tika-core</artifactId>
23 |       <version>${tika.version}</version>
24 |     </dependency>
25 |     <dependency>
26 |       <groupId>org.apache.tika</groupId>
27 |       <artifactId>tika-fuzzing</artifactId>
28 |       <version>${tika.version}</version>
29 |     </dependency>
30 |     <dependency>
31 |       <groupId>org.apache.tika</groupId>
32 |       <artifactId>tika-serialization</artifactId>
33 |       <version>${tika.version}</version>
34 |     </dependency>
35 |     <dependency>
36 |       <groupId>org.apache.cxf</groupId>
37 |       <artifactId>cxf-rt-rs-client</artifactId>
38 |       <version>${cxf.version}</version>
39 |     </dependency>
40 |   </dependencies>
41 | 
42 | </project>


--------------------------------------------------------------------------------
/tika-containers/pom.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
 3 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
 4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
 5 |     <parent>
 6 |         <artifactId>file-observatory</artifactId>
 7 |         <groupId>org.tallison</groupId>
 8 |         <version>1.0.0-SNAPSHOT</version>
 9 |     </parent>
10 |     <modelVersion>4.0.0</modelVersion>
11 | 
12 |     <artifactId>tika-containers</artifactId>
13 |     <packaging>pom</packaging>
14 |     <modules>
15 |         <module>tika-pdftotext</module>
16 |         <module>tika-pdfchecker</module>
17 |         <module>tika-pdfspelunker</module>
18 |         <module>tika-pdfjs</module>
19 |       <module>tika-arlington</module>
20 |         <module>tika-pipes-pdfinfo</module>
21 |       <module>tika-pipes-siegfried</module>
22 |     </modules>
23 | 
24 | 
25 | </project>


--------------------------------------------------------------------------------
/tika-containers/tika-arlington/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM amd64/openjdk:11.0.8-slim-buster as GRAMMAR_CHECKER_BUILDER
 2 | 
 3 | RUN apt-get update && apt-get install curl g++-8 gcc-8 cmake git -y
 4 | 
 5 | RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-8 800 --slave /usr/bin/g++ g++ /usr/bin/g++-8
 6 | 
 7 | RUN git clone  https://github.com/pdf-association/arlington-pdf-model /arlington-pdf-model && \
 8 |     cd /arlington-pdf-model && git checkout fab5b58
 9 | 
10 | RUN cd /arlington-pdf-model/TestGrammar && \
11 |     cmake -B cmake-linux/debug -DPDFSDK_PDFIUM=ON -DCMAKE_BUILD_TYPE=Debug . && \
12 |     cmake --build cmake-linux/debug --config Debug
13 | 
14 | RUN mkdir /tika-bin && cd /tika-bin && \
15 |     curl https://repo1.maven.org/maven2/org/apache/tika/tika-server-core/2.4.1/tika-server-core-2.4.1.jar --output tika-server-core.jar
16 | 
17 | 
18 | FROM amd64/openjdk:11.0.8-slim-buster
19 | 
20 | COPY --from=GRAMMAR_CHECKER_BUILDER /arlington-pdf-model/TestGrammar/bin/linux /arlington-pdf-model/bin
21 | 
22 | COPY --from=GRAMMAR_CHECKER_BUILDER /arlington-pdf-model/tsv/latest /arlington-pdf-model/tsv/latest
23 | 
24 | RUN mkdir /tika-bin
25 | COPY --from=GRAMMAR_CHECKER_BUILDER /tika-bin/tika-server-core.jar /tika-bin/tika-server-core.jar
26 | COPY my-tika-config.xml /tika-bin/my-tika-config.xml
27 | 
28 | #once we upgrade to > tika 2.4.1, we can get rid of this custom regex parser
29 | COPY target/tika-arlington-1.0.0-SNAPSHOT.jar /tika-bin/tika-arlington.jar
30 | 
31 | ENTRYPOINT ["java","-cp","/tika-bin/*", "org.apache.tika.server.core.TikaServerCli", "-h", "0.0.0.0", "-c", "/tika-bin/my-tika-config.xml"]
32 | 


--------------------------------------------------------------------------------
/tika-containers/tika-arlington/pom.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <project xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
 3 |          xmlns="http://maven.apache.org/POM/4.0.0"
 4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
 5 |   <parent>
 6 |     <artifactId>tika-containers</artifactId>
 7 |     <groupId>org.tallison</groupId>
 8 |     <version>1.0.0-SNAPSHOT</version>
 9 |   </parent>
10 |   <modelVersion>4.0.0</modelVersion>
11 | 
12 |   <artifactId>tika-arlington</artifactId>
13 | 
14 |   <dependencies>
15 |     <dependency>
16 |       <groupId>org.apache.tika</groupId>
17 |       <artifactId>tika-core</artifactId>
18 |       <scope>provided</scope>
19 |     </dependency>
20 |   </dependencies>
  <!-- Builds a shaded jar for this module during the package phase. -->
  <build>
    <plugins>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-shade-plugin</artifactId>
        <version>${maven.shade.version}</version>
        <executions>
          <execution>
            <phase>package</phase>
            <goals>
              <goal>shade</goal>
            </goals>
            <configuration>
              <!-- Do not write a dependency-reduced-pom.xml next to the pom. -->
              <createDependencyReducedPom>
                false
              </createDependencyReducedPom>
              <!-- NOTE(review): this filter matches every artifact but declares no
                   includes or excludes; confirm whether exclusions were intended. -->
              <filters>
                <filter>
                  <artifact>*:*</artifact>
                </filter>
              </filters>
              <transformers>
                <!-- Writes Main-Class into the jar manifest.
                     NOTE(review): the class name reads like a parser rather than a CLI
                     entry point; confirm it declares a main method or drop the entry. -->
                <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                  <mainClass>org.tallison.observatory.RegexCaptureParser</mainClass>
                </transformer>
              </transformers>
            </configuration>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>
53 | </project>


--------------------------------------------------------------------------------
/tika-containers/tika-exiftool/Dockerfile:
--------------------------------------------------------------------------------
 1 | #slight modification from: https://github.com/Miljar/exiftool-docker/blob/master/Dockerfile
 2 | FROM amd64/openjdk:11.0.8-slim-buster
 3 | ENV EXIFTOOL_VERSION=12.38
 4 | ENV TIKA_VERSION=2.2.1
 5 | 
 6 | RUN apk add --no-cache perl make
 7 | RUN cd /tmp \
 8 | 	&& wget http://www.sno.phy.queensu.ca/~phil/exiftool/Image-ExifTool-${EXIFTOOL_VERSION}.tar.gz \
 9 | 	&& tar -zxvf Image-ExifTool-${EXIFTOOL_VERSION}.tar.gz \
10 | 	&& cd Image-ExifTool-${EXIFTOOL_VERSION} \
11 | 	&& perl Makefile.PL \
12 | 	&& make test \
13 | 	&& make install \
14 | 	&& cd .. \
15 | 	&& rm -rf Image-ExifTool-${EXIFTOOL_VERSION}
16 | 
17 | RUN mkdir /tika-bin \
18 |     && cd /tika-bin \
19 |     && wget https://repo1.maven.org/maven2/org/apache/tika/tika-server-core/${TIKA_VERSION}/tika-server-core-{$TIKA_VERSION}.jar
20 | 
21 | COPY my-tika-config.xml /tika-bin/my-tika-config.xml
22 | ENTRYPOINT ["java","-cp","/tika-bin/*", "org.apache.tika.server.core.TikaServerCli", "-h", "0.0.0.0", "-c", "/tika-bin/my-tika-config.xml"]
23 | 
24 | #e.g.
25 | #docker run -d -p 9998:9998


--------------------------------------------------------------------------------
/tika-containers/tika-pdfchecker/Dockerfile:
--------------------------------------------------------------------------------
#slight modification from:
#https://stackoverflow.com/questions/61272431/installing-poppler-utils-of-version-0-82-in-docker
#See also: https://github.com/quarkness/docker-poppler/blob/master/Dockerfile
#NOTE(review): the links above and the stage alias below refer to poppler, but this
#image packages pdf-checker; no later stage in this file uses the alias. Confirm
#whether they are leftovers from another Dockerfile.
FROM amd64/openjdk:11.0.8-slim-buster as POPPLER_BUILDER
RUN mkdir /pdfchecker-bin

#pdf-checker.tgz must be present in the build context; it is unpacked in place
#and the archive itself stays in the image.
COPY pdf-checker.tgz /pdfchecker-bin/pdf-checker.tgz
RUN cd /pdfchecker-bin && tar -xzvf pdf-checker.tgz

#requires the module jar to have been built into target/ before "docker build"
RUN mkdir /tika-bin
COPY target/tika-pdfchecker-1.0.0-SNAPSHOT.jar /tika-bin/tika-pdfchecker-1.0.0-SNAPSHOT.jar

#find a more elegant way of grabbing this after we release it
COPY tika-server-core-2.0.0-SNAPSHOT.jar /tika-bin/tika-server-core-2.0.0-SNAPSHOT.jar
COPY my-tika-config.xml /tika-bin/my-tika-config.xml
#every jar in /tika-bin is on the classpath; the server listens on all interfaces
ENTRYPOINT ["java","-cp","/tika-bin/*", "org.apache.tika.server.core.TikaServerCli", "-h", "0.0.0.0", "-c", "/tika-bin/my-tika-config.xml"]

#e.g. (append the name of the built image)
#docker run -d -p 9998:9998


--------------------------------------------------------------------------------
/tika-containers/tika-pdfchecker/my-tika-config.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <!--
 3 |   Licensed to the Apache Software Foundation (ASF) under one or more
 4 |   contributor license agreements.  See the NOTICE file distributed with
 5 |   this work for additional information regarding copyright ownership.
 6 |   The ASF licenses this file to You under the Apache License, Version 2.0
 7 |   (the "License"); you may not use this file except in compliance with
 8 |   the License.  You may obtain a copy of the License at
 9 | 
10 |   http://www.apache.org/licenses/LICENSE-2.0
11 | 
12 |   Unless required by applicable law or agreed to in writing, software
13 |   distributed under the License is distributed on an "AS IS" BASIS,
14 |   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |   See the License for the specific language governing permissions and
16 |   limitations under the License.
17 | -->
<!-- Tika configuration for the pdfchecker container: one parser plus tika-server settings. -->
<properties>
    <parsers>
        <!-- NOTE(review): the services file in this module
             (META-INF/services/org.apache.tika.parser.Parser) registers
             org.tallison.tika.parsers.pdfchecker.PDFChecker, which differs from the
             class named here. Confirm which fully qualified name is correct. -->
        <parser class="org.apache.tika.parser.pdfchecker.PDFChecker">
            <params>
                <!-- 120000 ms = 2 minutes -->
                <param name="timeoutMillis" type="long">120000</param>
            </params>
        </parser>
    </parsers>
    <server>
        <!-- same port as END_POINT in this module's TikaPDFToTextTest -->
        <port>9998</port>
        <!-- 180000 ms = 3 minutes, i.e. longer than the parser timeoutMillis above -->
        <taskTimeoutMillis>180000</taskTimeoutMillis>
        <enableUnsecureFeatures>false</enableUnsecureFeatures>
        <maxFiles>10000000</maxFiles>
        <forkedJVMArgs>
            <arg>-Xmx2g</arg>
        </forkedJVMArgs>
        <!-- only the rmeta endpoint is listed -->
        <endpoints>
            <endpoint>rmeta</endpoint>
        </endpoints>
    </server>
</properties>
39 | 


--------------------------------------------------------------------------------
/tika-containers/tika-pdfchecker/src/main/resources/META-INF/services/org.apache.tika.parser.Parser:
--------------------------------------------------------------------------------
 1 | #  Licensed to the Apache Software Foundation (ASF) under one or more
 2 | #  contributor license agreements.  See the NOTICE file distributed with
 3 | #  this work for additional information regarding copyright ownership.
 4 | #  The ASF licenses this file to You under the Apache License, Version 2.0
 5 | #  (the "License"); you may not use this file except in compliance with
 6 | #  the License.  You may obtain a copy of the License at
 7 | #
 8 | #       http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | #  Unless required by applicable law or agreed to in writing, software
11 | #  distributed under the License is distributed on an "AS IS" BASIS,
12 | #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | #  See the License for the specific language governing permissions and
14 | #  limitations under the License.
15 | org.tallison.tika.parsers.pdfchecker.PDFChecker


--------------------------------------------------------------------------------
/tika-containers/tika-pdfchecker/src/test/java/TikaPDFToTextTest.java:
--------------------------------------------------------------------------------
 1 | import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
 2 | import org.apache.cxf.jaxrs.client.WebClient;
 3 | import org.apache.tika.TikaTest;
 4 | import org.apache.tika.metadata.Metadata;
 5 | import org.apache.tika.metadata.serialization.JsonMetadataList;
 6 | import org.junit.Ignore;
 7 | import org.junit.Test;
 8 | 
 9 | import javax.ws.rs.core.Response;
10 | import java.io.InputStream;
11 | import java.io.InputStreamReader;
12 | import java.io.Reader;
13 | import java.util.List;
14 | 
15 | import static java.nio.charset.StandardCharsets.UTF_8;
16 | import static org.junit.Assert.assertEquals;
17 | 
18 | public class TikaPDFToTextTest extends TikaTest {
19 |     private static String END_POINT = "http://localhost:9998";
20 |     private static final String META_PATH = "/rmeta";
21 | 
22 |     @Test
23 |     @Ignore("once container is running")
24 |     public void testBasic() throws Exception {
25 |         Response response = WebClient
26 |                 .create(END_POINT + META_PATH)
27 |                 .accept("application/json")
28 |                 .acceptEncoding("gzip")
29 |                 .put(ClassLoader.getSystemResourceAsStream("test-documents/testPDF.pdf"));
30 | 
31 |         Reader reader = null;
32 |         String encoding = response.getHeaderString("content-encoding");
33 |         if ("gzip".equals(encoding)) {
34 |             reader = new InputStreamReader(new GzipCompressorInputStream((InputStream) response.getEntity()), UTF_8);
35 |         } else {
36 |             reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
37 |         }
38 |         List<Metadata> metadataList = JsonMetadataList.fromJson(reader);
39 |         assertEquals(1, metadataList.size());
40 |         assertEquals("born-digital", metadataList.get(0).get("pc_summary_info"));
41 |     }
42 | }
43 | 


--------------------------------------------------------------------------------
/tika-containers/tika-pdfchecker/src/test/resources/test-documents/testPDF.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tballison/file-observatory/5677da227b1a70b1220ae05005b6064210858504/tika-containers/tika-pdfchecker/src/test/resources/test-documents/testPDF.pdf


--------------------------------------------------------------------------------
/tika-containers/tika-pdfchecker/tika-server-core-2.0.0-SNAPSHOT.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tballison/file-observatory/5677da227b1a70b1220ae05005b6064210858504/tika-containers/tika-pdfchecker/tika-server-core-2.0.0-SNAPSHOT.jar


--------------------------------------------------------------------------------
/tika-containers/tika-pdfium/my-args.gn:
--------------------------------------------------------------------------------
 1 | # Set build arguments here. See `gn help buildargs`.
 2 | 
 3 | # need this to build pdfium_test
 4 | pdf_is_standalone = true
 5 | 
 6 | #other options are commented out below
 7 | #use_goma = true  # Googlers only. Make sure goma is installed and running first.
 8 | #is_debug = true  # Enable debugging features.
 9 | 
10 | # Set true to enable experimental Skia backend.
11 | #pdf_use_skia = false
12 | # Set true to enable experimental Skia backend (paths only).
13 | #pdf_use_skia_paths = false
14 | 
15 | #pdf_enable_xfa = true  # Set false to remove XFA support (implies JS support).
16 | #pdf_enable_v8 = true  # Set false to remove Javascript support.
17 | #is_component_build = false # Disable component build (Though it should work)


--------------------------------------------------------------------------------
/tika-containers/tika-pdfjs-selenium/pom.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
 3 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
 4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
 5 |   <parent>
 6 |     <artifactId>tika-containers</artifactId>
 7 |     <groupId>org.tallison</groupId>
 8 |     <version>1.0.0-SNAPSHOT</version>
 9 |   </parent>
10 |   <modelVersion>4.0.0</modelVersion>
11 | 
12 |   <artifactId>tika-pdfjs-selenium</artifactId>
13 | 
14 |   <properties>
15 |     <maven.compiler.source>11</maven.compiler.source>
16 |     <maven.compiler.target>11</maven.compiler.target>
17 |     <selenium.version>3.141.59</selenium.version>
18 |   </properties>
19 | 
20 |   <!-- resources
21 |   https://medium.com/dropout-analytics/selenium-and-geckodriver-on-mac-b411dbfe61bc
22 |   -->
23 |   <dependencies>
24 |     <dependency>
25 |       <groupId>org.seleniumhq.selenium</groupId>
26 |       <artifactId>selenium-api</artifactId>
27 |       <version>${selenium.version}</version>
28 |     </dependency>
29 |     <dependency>
30 |       <groupId>org.seleniumhq.selenium</groupId>
31 |       <artifactId>selenium-remote-driver</artifactId>
32 |       <version>${selenium.version}</version>
33 |     </dependency>
34 |     <dependency>
35 |       <groupId>org.seleniumhq.selenium</groupId>
36 |       <artifactId>selenium-server</artifactId>
37 |       <version>${selenium.version}</version>
38 |     </dependency>
39 |   </dependencies>
40 | 
41 | </project>


--------------------------------------------------------------------------------
/tika-containers/tika-pdfjs-selenium/src/main/java/FirefoxSeleniumExample.java:
--------------------------------------------------------------------------------
 1 | import org.openqa.selenium.firefox.FirefoxBinary;
 2 | import org.openqa.selenium.firefox.FirefoxDriver;
 3 | import org.openqa.selenium.firefox.FirefoxOptions;
 4 | 
 5 | /**
 6 |  * Minimal example that starts a headless Firefox through Selenium's FirefoxDriver.
 7 |  *
 8 |  * <p>The geckodriver location is hard-coded to a developer-machine path; adjust the
 9 |  * {@code webdriver.gecko.driver} value before running this elsewhere.
10 |  */
11 | public class FirefoxSeleniumExample {
12 |     public static void main(String[] args) {
13 |         FirefoxBinary firefoxBinary = new FirefoxBinary();
14 |         firefoxBinary.addCommandLineOptions("--headless");
15 |         System.setProperty("webdriver.gecko.driver", "/Users/allison/tools/firefox/geckodriver");
16 |         FirefoxOptions firefoxOptions = new FirefoxOptions();
17 |         firefoxOptions.setBinary(firefoxBinary);
18 |         FirefoxDriver driver = new FirefoxDriver(firefoxOptions);
19 |         try {
20 |             // drive the browser here
21 |         } finally {
22 |             // always end the session so the browser is not left open after main returns
23 |             driver.quit();
24 |         }
25 |     }
26 | }
27 | 


--------------------------------------------------------------------------------
/tika-containers/tika-pdfjs/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM node:16.13.0
 2 | 
 3 | #make sure you have enough memory to build this --memory=900
 4 | RUN npm install -g gulp-cli
 5 | 
 6 | #Option A: grab and build a specific release
 7 | #RUN apt-get update && apt-get -y install wget openjdk-11-jre
 8 | #RUN mkdir /builddir && cd /builddir && \
 9 | #   wget https://github.com/mozilla/pdf.js/archive/refs/tags/v2.11.338.tar.gz && \
10 | #   tar -xzvf v2.11.338.tar.gz && mv pdf.js-2.11.338 pdf.js && \
11 | #   cd pdf.js && npm install && gulp dist-install && \
12 | #   rm /builddir/v2.11.338.tar.gz
13 | 
14 | #Option B: build from main
15 | RUN apt-get update && apt-get -y install git openjdk-11-jre
16 | RUN mkdir /builddir && cd /builddir && \
17 |     git clone https://github.com/mozilla/pdf.js && cd pdf.js && \
18 |     npm install && gulp dist-install
19 | 
20 | COPY js/my-getinfo.js /builddir/pdf.js/examples/node/my-getinfo.js
21 | 
22 | # TODO: figure out a two-stage build and what we can jettison for a smaller container
23 | 
24 | RUN mkdir /tika-bin/
25 | COPY target/tika-pdfjs-1.0.0-SNAPSHOT.jar /tika-bin/
26 | #find a more elegant way of grabbing this after we release it
27 | COPY tika-server-standard-2.1.1-SNAPSHOT.jar /tika-bin/
28 | COPY my-tika-config.xml /tika-bin/my-tika-config.xml
29 | ENTRYPOINT ["java","-cp","/tika-bin/*", "org.apache.tika.server.core.TikaServerCli", "-h", "0.0.0.0", "-c", "/tika-bin/my-tika-config.xml"]
30 | 


--------------------------------------------------------------------------------
/tika-containers/tika-pdfjs/src/test/resources/test-documents/test-basic.txt:
--------------------------------------------------------------------------------
 1 | # Document Loaded
 2 | Number of Pages: 4
 3 | 
 4 | # Metadata Is Loaded
 5 | ## Info
 6 | {
 7 |   "PDFFormatVersion": "1.5",
 8 |   "Language": "en-US",
 9 |   "EncryptFilterName": null,
10 |   "IsLinearized": false,
11 |   "IsAcroFormPresent": false,
12 |   "IsXFAPresent": false,
13 |   "IsCollectionPresent": false,
14 |   "IsSignaturesPresent": false,
15 |   "Producer": "Microsoft® Word 2016",
16 |   "Creator": "Microsoft® Word 2016",
17 |   "CreationDate": "D:20210421211209+00'00'",
18 |   "ModDate": "D:20210421211209+00'00'"
19 | }
20 | 
21 | # Page 1
22 | Size: 612x792
23 | 
24 | Warning: TT: undefined function: 32
25 | Warning: fetchStandardFontData: failed to fetch file "FoxitSans.pfb" with "UnknownErrorException: The standard font "baseUrl" parameter must be specified, ensure that the "standardFontDataUrl" API parameter is provided.".
26 | Warning: fetchStandardFontData: failed to fetch file "FoxitSerif.pfb" with "UnknownErrorException: The standard font "baseUrl" parameter must be specified, ensure that the "standardFontDataUrl" API parameter is provided.".
27 | ## Text Content
28 | here is some page 1 content
29 | 
30 | # Page 2
31 | Size: 612x792
32 | 
33 | ## Text Content
34 | some page 2 content
35 | 
36 | # Page 3
37 | Size: 612x792
38 | 
39 | ## Text Content
40 | Some page 3 content
41 | 
42 | # Page 4
43 | Size: 612x792
44 | 
45 | Warning: fetchStandardFontData: failed to fetch file "FoxitSerifBold.pfb" with "UnknownErrorException: The standard font "baseUrl" parameter must be specified, ensure that the "standardFontDataUrl" API parameter is provided.".
46 | ## Text Content
47 | Some more text
48 | 
49 | # End of Document
50 | 


--------------------------------------------------------------------------------
/tika-containers/tika-pdfjs/src/test/resources/test-documents/test-xmp.txt:
--------------------------------------------------------------------------------
 1 | # Random Key: 765668851
 2 | # Document Loaded key=765668851
 3 | # Number of Pages: 2 key=765668851
 4 | 
 5 | #  Metadata Is Loaded key=765668851
 6 | ## Info key=765668851
 7 | {
 8 |   "PDFFormatVersion": "1.6",
 9 |   "Language": null,
10 |   "EncryptFilterName": null,
11 |   "IsLinearized": true,
12 |   "IsAcroFormPresent": false,
13 |   "IsXFAPresent": false,
14 |   "IsCollectionPresent": false,
15 |   "IsSignaturesPresent": false,
16 |   "CreationDate": "D:20210402144320-04'00'",
17 |   "Creator": "PScript5.dll Version 5.2.2",
18 |   "ModDate": "D:20210402154701-04'00'",
19 |   "Producer": "Acrobat Distiller 20.0 (Windows)",
20 |   "Title": "18-956 Google LLC v. Oracle America, Inc. (04/05/2021)"
21 | }
22 | 
23 | ## Metadata key=765668851
24 | {
25 |   "xmp:modifydate": "2021-04-02T15:47:01-04:00",
26 |   "xmp:createdate": "2021-04-02T14:43:20-04:00",
27 |   "xmp:metadatadate": "2021-04-02T15:47:01-04:00",
28 |   "xmp:creatortool": "PScript5.dll Version 5.2.2",
29 |   "dc:format": "application/pdf",
30 |   "dc:creator": [],
31 |   "dc:title": "18-956 Google LLC v. Oracle America, Inc. (04/05/2021)",
32 |   "xmpmm:documentid": "uuid:1cd7d060-dd8f-463c-bfa8-18072b031ff2",
33 |   "xmpmm:instanceid": "uuid:327587b5-f503-4f7a-b4b2-444c4ead47ad",
34 |   "pdf:producer": "Acrobat Distiller 20.0 (Windows)"
35 | }
36 | 
37 | # Page 1 key=765668851
38 | # Size: 612x792 key=765668851
39 | 
40 | Info: TT: CALL empty stack (or invalid entry).
41 | Info: TT: CALL empty stack (or invalid entry).
42 | Info: TT: CALL empty stack (or invalid entry).
43 | Info: TT: CALL empty stack (or invalid entry).
44 | Warning: fetchStandardFontData: failed to fetch file "FoxitSerifBold.pfb" with "UnknownErrorException: The standard font "baseUrl" parameter must be specified, ensure that the "standardFontDataUrl" API parameter is provided.".
45 | Info: page=1 - getTextContent: time=141ms
46 | ## Text Content key=765668851
47 | page 1 content
48 | 
49 | # Page 2 key=765668851
50 | # Size: 612x792 key=765668851
51 | 
52 | Info: page=2 - getTextContent: time=33ms
53 | ## Text Content key=765668851
54 | page 2 content
55 | 
56 | # End of Document key=765668851


--------------------------------------------------------------------------------
/tika-containers/tika-pdfjs/src/test/resources/test-documents/test-xmp2.txt:
--------------------------------------------------------------------------------
 1 | # Random Key: 367480315
 2 | # Document Loaded key=367480315
 3 | # Number of Pages: 1 key=367480315
 4 | 
 5 | # Metadata Is Loaded key=367480315
 6 | 
 7 | ## Info key=367480315
 8 | {
 9 |    "PDFFormatVersion": "1.5",
10 |    "IsLinearized": false,
11 |    "IsAcroFormPresent": true,
12 |    "IsXFAPresent": false,
13 |    "Trapped": {
14 |        "name": "False"
15 |    },
16 |    "Custom": {
17 |         "PTEX.Fullbanner": "This is pdfTeX, Version 3.14159265-2.6-1.40.18 (TeX Live 2017/Debian) kpathsea version 6.2.3"
18 |    }
19 | }
20 | 
21 | # Page 1 key=367480315
22 | # Size: 595.276x841.89 key=367480315
23 | Info: page=1 - getTextContent: time=40ms
24 | ## Text Content key=367480315
25 | Name Copy Reset
26 | # End of Document key=367480315
27 | 


--------------------------------------------------------------------------------
/tika-containers/tika-pdfspelunker/src/main/resources/META-INF/services/org.apache.tika.parser.Parser:
--------------------------------------------------------------------------------
 1 | #  Licensed to the Apache Software Foundation (ASF) under one or more
 2 | #  contributor license agreements.  See the NOTICE file distributed with
 3 | #  this work for additional information regarding copyright ownership.
 4 | #  The ASF licenses this file to You under the Apache License, Version 2.0
 5 | #  (the "License"); you may not use this file except in compliance with
 6 | #  the License.  You may obtain a copy of the License at
 7 | #
 8 | #       http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | #  Unless required by applicable law or agreed to in writing, software
11 | #  distributed under the License is distributed on an "AS IS" BASIS,
12 | #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | #  See the License for the specific language governing permissions and
14 | #  limitations under the License.
15 | org.tallison.tika.parsers.pdf.PDFSpelunker


--------------------------------------------------------------------------------
/tika-containers/tika-pdfspelunker/src/main/resources/org/apache/tika/mime/custom-mimetypes.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <mime-info>
 3 |   <mime-type type="application/x-pdf-internal-fontfile3">
 4 |     <magic priority="50">
 5 |       <match value="0x010004" type="string" offset="0"/>
 6 |     </magic>
 7 |   </mime-type>
 8 |   <mime-type type="application/x-font-ttf-unk">
 9 |     <magic priority="50">
10 |       <!-- true\u0000 -->
11 |       <match value="0x7472756500" type="string" offset="0"/>
12 |     </magic>
13 |   </mime-type>
14 |   <mime-type type="application/x-font-postscript-type1">
15 |     <magic priority="50">
16 |       <match value="%!FontType1" type="string" offset="0"/>
17 |     </magic>
18 |   </mime-type>
19 |   <mime-type type="application/x-xmp-packet">
20 |     <!-- xmp packet -->
21 |     <magic priority="50">
22 |       <match value="&lt;?xpacket begin" type="string" offset="0:4"/>
23 |     </magic>
24 |   </mime-type>
25 | 
26 | </mime-info>


--------------------------------------------------------------------------------
/tika-containers/tika-pdfspelunker/src/test/java/org/tallison/tika/parsers/image/ICCImageParserTest.java:
--------------------------------------------------------------------------------
 1 | package org.tallison.tika.parsers.image;
 2 | 
 3 | import java.io.InputStream;
 4 | 
 5 | import org.junit.Test;
 6 | 
 7 | import org.apache.tika.TikaTest;
 8 | import org.apache.tika.config.TikaConfig;
 9 | import org.apache.tika.parser.AutoDetectParser;
10 | import org.apache.tika.parser.Parser;
11 | 
12 | /**
13 |  * Smoke test: parses baseball.jpg with an AutoDetectParser built from
14 |  * /config/my-tika-config.xml and passes the recursive metadata to debug().
15 |  */
16 | public class ICCImageParserTest extends TikaTest {
17 | 
18 |     @Test
19 |     public void testBasic() throws Exception {
20 |         try (InputStream configStream =
21 |                      this.getClass().getResourceAsStream("/config/my-tika-config.xml")) {
22 |             TikaConfig tikaConfig = new TikaConfig(configStream);
23 |             Parser parser = new AutoDetectParser(tikaConfig);
24 |             debug(getRecursiveMetadata("baseball.jpg", parser));
25 |         }
26 |     }
27 | }
28 | 


--------------------------------------------------------------------------------
/tika-containers/tika-pdfspelunker/src/test/resources/test-documents/baseball.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tballison/file-observatory/5677da227b1a70b1220ae05005b6064210858504/tika-containers/tika-pdfspelunker/src/test/resources/test-documents/baseball.jpg


--------------------------------------------------------------------------------
/tika-containers/tika-pdfspelunker/src/test/resources/test-documents/icc-reports/non-compliant1.txt:
--------------------------------------------------------------------------------
 1 | Profile:            'data/iccs/7c/68/7c68fd34c873bf7db8faa3a1133d176a7c92a88f8a05d482d406857ee212ce98'
 2 | Profile ID:         e798cc1d9f659a6155ac35ad9ac383bb
 3 | Size:               1829077(0x1be8d5) bytes
 4 | 
 5 | Header
 6 | ------
 7 | Attributes:         Reflective | Glossy
 8 | Cmm:                Heidelberg
 9 | Creation Date:      2/28/2007  08:00:00
10 | Creator:            'HDM ' = 48444D20
11 | Data Color Space:   CmykData
12 | Flags               EmbeddedProfileFalse | UseAnywhere
13 | PCS Color Space:    LabData
14 | Platform:           Unknown
15 | Rendering Intent:   Relative Colorimetric
16 | Profile Class:      OutputClass
17 | Profile SubClass:   Not Defined
18 | Version:            2.40
19 | Illuminant:         X=0.9642, Y=1.0000, Z=0.8249
20 | Spectral PCS:       NoSpectralData
21 | Spectral PCS Range: Not Defined
22 | BiSpectral Range:   Not Defined
23 | MCS Color Space:    Not Defined
24 | 
25 | Profile Tags
26 | ------------
27 |                          Tag    ID      Offset	    Size	     Pad
28 |                         ----  ------    ------	    ----	     ---
29 |                 copyrightTag  'cprt'       288	     103	       1
30 |           mediaWhitePointTag  'wtpt'       392	      20	       0
31 |                     AToB0Tag  'A2B0'       412	  396852	       0
32 |                     BToA0Tag  'B2A0'    397264	  291132	       0
33 |                     gamutTag  'gamt'    688396	   33840	       0
34 |                     AToB1Tag  'A2B1'    722236	  396852	       0
35 |                     BToA1Tag  'B2A1'   1119088	  291132	       0
36 |                     AToB2Tag  'A2B2'       412	  396852	       0
37 |                     BToA2Tag  'B2A2'   1410220	  291132	       0
38 |                   grayTRCTag  'kTRC'   1701352	     524	       0
39 |    Unknown 'hd10' = 68643130  'hd10'   1701876	     364	       0
40 |        profileDescriptionTag  'desc'   1702240	     152	       0
41 |                charTargetTag  'targ'   1702392	  126685	       0
42 | 
43 | 
44 | Validation Report
45 | -----------------
46 | Profile violates ICC specification
47 | 
48 | Warning! - OutputClassTag exclusion test failed.
49 | Warning! - Unknown 'hd10' = 68643130: - Unknown Tag.
50 | NonCompliant! - File size is not a multiple of 4 bytes (last tag needs padding?).


--------------------------------------------------------------------------------
/tika-containers/tika-pdfspelunker/src/test/resources/test-documents/icc-reports/not-icc1.txt:
--------------------------------------------------------------------------------
1 | Unable to parse 'data/blah.tgz' as ICC profile!
2 | 
3 | Validation Report
4 | -----------------
5 | Profile has Critical Error(s) that violate ICC specification.
6 | 
7 | Error! -  - Unable to read profile!**
8 | 	Profile has invalid structure!


--------------------------------------------------------------------------------
/tika-containers/tika-pdfspelunker/src/test/resources/test-documents/icc-reports/not-icc2.txt:
--------------------------------------------------------------------------------
 1 | Unable to parse 'data/iccs/86/20/862090af4442059ff416679acb001ae23acc18852f2dc430d0845c061b937e9c' as ICC profile!
 2 | 
 3 | Validation Report
 4 | -----------------
 5 | Profile has Critical Error(s) that violate ICC specification.
 6 | 
 7 | NonCompliant! - Bad Header File Size
 8 | Error! -  - AToB0Tag - Tag has invalid structure!
 9 | Error! -  - AToB1Tag - Tag has invalid structure!
10 | Error! -  - AToB2Tag - Tag has invalid structure!
11 | Error! -  - BToA0Tag - Tag has invalid structure!
12 | Error! -  - BToA1Tag - Tag has invalid structure!
13 | Error! -  - BToA2Tag - Tag has invalid structure!
14 | Error! -  - gamutTag - Tag has invalid structure!
15 | Error! -  - Unknown 'AS00' = 41533030 - Tag has invalid structure!


--------------------------------------------------------------------------------
/tika-containers/tika-pdfspelunker/src/test/resources/test-documents/testPDF.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tballison/file-observatory/5677da227b1a70b1220ae05005b6064210858504/tika-containers/tika-pdfspelunker/src/test/resources/test-documents/testPDF.pdf


--------------------------------------------------------------------------------
/tika-containers/tika-pdftotext/Dockerfile:
--------------------------------------------------------------------------------
 1 | #slight modification from:
 2 | #https://stackoverflow.com/questions/61272431/installing-poppler-utils-of-version-0-82-in-docker
 3 | #See also: https://github.com/quarkness/docker-poppler/blob/master/Dockerfile
 4 | FROM amd64/openjdk:11.0.8-slim-buster as POPPLER_BUILDER
 5 | #poppler/data pairs
 6 | #21.02.0/0.4.10
 7 | #20.09.0/0.4.9
 8 | #0.86.1/0.4.9
 9 | 
10 | RUN apt-get update && apt-get install bash wget build-essential cmake libfreetype6-dev pkg-config libfontconfig-dev libjpeg-dev libopenjp2-7-dev -y
11 | RUN wget https://poppler.freedesktop.org/poppler-data-0.4.11.tar.gz \
12 |     && tar -xf poppler-data-0.4.11.tar.gz \
13 |     && cd poppler-data-0.4.11 \
14 |     && make install \
15 |     && cd .. \
16 |     && wget https://poppler.freedesktop.org/poppler-21.11.0.tar.xz \
17 |     && tar -xf poppler-21.11.0.tar.xz \
18 |     && cd poppler-21.11.0 \
19 |     && mkdir build \
20 |     && cd build \
21 |     && cmake -DENABLE_BOOST=OFF .. \
22 |     && make \
23 |     && make install \
24 |     && ldconfig
25 | #CMD tail -f /dev/null
26 | 
27 | FROM amd64/openjdk:11.0.8-slim-buster
28 | COPY --from=POPPLER_BUILDER /usr/lib /usr/lib
29 | COPY --from=POPPLER_BUILDER /usr/local /usr/local
30 | 
31 | RUN apt-get update && apt-get install bash ca-certificates \
32 |                        libjpeg62-turbo libcairo2 libxml2 \
33 |                        fontconfig liblcms2-2 \
34 |                        libtiff5 -y
35 |                         # &&\
36 |                        #libopenjpeg5
37 |                        #libstdc++6 && \
38 |     #addgroup -S appgroup && \
39 |     #adduser -S appuser -G appgroup -h /work && \
40 |     #echo "/usr/local/lib64:/lib:/usr/local/lib:/usr/lib:/usr/lib/x86_64-linux-gnu:/lib64:/lib/x86_64-linux-gnu" > /etc/ld-musl-x86_64.path
41 | 
42 | RUN mkdir /tika-bin
43 | 
44 | #find a more elegant way of grabbing this after we release it
45 | COPY tika-server-standard-2.1.1-SNAPSHOT.jar /tika-bin/tika-server-standard-2.1.1-SNAPSHOT.jar
46 | COPY my-tika-config.xml /tika-bin/my-tika-config.xml
47 | ENTRYPOINT ["java","-cp","/tika-bin/*", "org.apache.tika.server.core.TikaServerCli", "-h", "0.0.0.0", "-c", "/tika-bin/my-tika-config.xml"]
48 | 
49 | #e.g.
50 | #docker run -d -p 9998:9998


--------------------------------------------------------------------------------
/tika-containers/tika-pipes-pdfinfo/Dockerfile:
--------------------------------------------------------------------------------
 1 | #slight modification from:
 2 | #https://stackoverflow.com/questions/61272431/installing-poppler-utils-of-version-0-82-in-docker
 3 | #See also: https://github.com/quarkness/docker-poppler/blob/master/Dockerfile
 4 | FROM debian:bullseye-20230227-slim as POPPLER_BUILDER
 5 | ## can't build w 22.01.0 with the current dependencies...need to figure out how to
 6 | # migrate to 22.x
 7 | ENV POPPLER_VERSION=23.03.0
 8 | ENV POPPLER_DATA_VERSION=0.4.12
 9 | 
10 | RUN apt-get update && apt-get install locales bash wget build-essential cmake libfreetype6-dev pkg-config  \
11 |     libfontconfig-dev libjpeg-dev libopenjp2-7-dev  \
12 |     #apt-transport-https and gnupg are for temurin; en_US.UTF-8 is generated because LANG/LC_ALL below name it
13 |     apt-transport-https gnupg -y && sed -i 's/^# *\(en_US.UTF-8 UTF-8\)/\1/' /etc/locale.gen && locale-gen
14 | RUN wget https://poppler.freedesktop.org/poppler-data-${POPPLER_DATA_VERSION}.tar.gz \
15 |     && tar -xf poppler-data-${POPPLER_DATA_VERSION}.tar.gz \
16 |     && cd poppler-data-${POPPLER_DATA_VERSION} \
17 |     && make install \
18 |     && cd .. \
19 |     && wget https://poppler.freedesktop.org/poppler-${POPPLER_VERSION}.tar.xz \
20 |     && tar -xf poppler-${POPPLER_VERSION}.tar.xz \
21 |     && cd poppler-${POPPLER_VERSION} \
22 |     && mkdir build \
23 |     && cd build  \
24 |     && cmake -DENABLE_BOOST=OFF ..\
25 |     && make \
26 |     && make install \
27 |     && ldconfig
28 | 
29 | RUN wget -O - https://packages.adoptium.net/artifactory/api/gpg/key/public | apt-key add - \
30 |     && echo "deb https://packages.adoptium.net/artifactory/deb $(awk -F= '/^VERSION_CODENAME/{print$2}' /etc/os-release) main" | tee /etc/apt/sources.list.d/adoptium.list \
31 |     && apt-get update && apt-get install temurin-11-jre -y
32 | 
33 | RUN mkdir /tika-bin
34 | COPY target/tika-pipes-pdfinfo-1.0.0-SNAPSHOT.jar /tika-bin
35 | COPY log4j2.xml /tika-bin
36 | COPY pipes-log4j2.xml /tika-bin
37 | 
38 | 
39 | ENV LANG en_US.UTF-8
40 | ENV LANGUAGE en_US:en
41 | ENV LC_ALL en_US.UTF-8
42 | 
43 | ENTRYPOINT ["java","-Dlog4j.configurationFile=/tika-bin/log4j2.xml", "-jar","/tika-bin/tika-pipes-pdfinfo-1.0.0-SNAPSHOT.jar"]
44 | #need to specify tika-config.xml on commandline, e.g.:
45 | #docker run -v /Users/blah/Desktop:/data -v /Users/blah/Desktop/config:/tika-config -p 2345:2345
46 | #--name tika-pipes-container tika-pipes-pdfinfo /tika-config/my-tika-config.xml
47 | 
48 | #WORKDIR /work
49 | 
50 | 


--------------------------------------------------------------------------------
/tika-containers/tika-pipes-pdfinfo/log4j2.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8" standalone="no" ?>
 2 | 
 3 | <!--
 4 |   Licensed to the Apache Software Foundation (ASF) under one
 5 |   or more contributor license agreements.  See the NOTICE file
 6 |   distributed with this work for additional information
 7 |   regarding copyright ownership.  The ASF licenses this file
 8 |   to you under the Apache License, Version 2.0 (the
 9 |   "License"); you may not use this file except in compliance
10 |   with the License.  You may obtain a copy of the License at
11 | 
12 |     http://www.apache.org/licenses/LICENSE-2.0
13 | 
14 |   Unless required by applicable law or agreed to in writing,
15 |   software distributed under the License is distributed on an
16 |   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17 |   KIND, either express or implied.  See the License for the
18 |   specific language governing permissions and limitations
19 |   under the License.
20 | -->
21 | <Configuration status="info">
22 |   <Appenders>
23 |     <Console name="Console" target="SYSTEM_ERR">
24 |       <PatternLayout pattern="%-5p [%t] %d{HH:mm:ss,SSS} %c %m%n"/>
25 |     </Console>
26 |   </Appenders>
27 |   <Loggers>
28 |     <Root level="info">
29 |       <AppenderRef ref="Console"/>
30 |     </Root>
31 |   </Loggers>
32 | </Configuration>


--------------------------------------------------------------------------------
/tika-containers/tika-pipes-pdfinfo/pipes-log4j2.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8" standalone="no" ?>
 2 | 
 3 | <!--
 4 |   Licensed to the Apache Software Foundation (ASF) under one
 5 |   or more contributor license agreements.  See the NOTICE file
 6 |   distributed with this work for additional information
 7 |   regarding copyright ownership.  The ASF licenses this file
 8 |   to you under the Apache License, Version 2.0 (the
 9 |   "License"); you may not use this file except in compliance
10 |   with the License.  You may obtain a copy of the License at
11 | 
12 |     http://www.apache.org/licenses/LICENSE-2.0
13 | 
14 |   Unless required by applicable law or agreed to in writing,
15 |   software distributed under the License is distributed on an
16 |   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17 |   KIND, either express or implied.  See the License for the
18 |   specific language governing permissions and limitations
19 |   under the License.
20 | -->
21 | <Configuration status="info">
22 |   <Appenders>
23 |     <File name="file" fileName="/data/logs/${sys:pipesClientId}-pipes.log">
24 |       <PatternLayout pattern="%-5p [%t] %d{HH:mm:ss,SSS} %c %m%n"/>
25 |     </File>
26 |     <Console name="Console" target="SYSTEM_ERR">
27 |       <PatternLayout pattern="%-5p [%t] %d{HH:mm:ss,SSS} %c %m%n"/>
28 |     </Console>
29 |   </Appenders>
30 |   <Loggers>
31 |     <Root level="info">
32 |       <AppenderRef ref="file"/>
33 |     </Root>
34 |   </Loggers>
35 | </Configuration>


--------------------------------------------------------------------------------
/tika-containers/tika-pipes-siegfried/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM golang:1.20.2-bullseye
 2 | 
 3 | 
 4 | RUN apt-get update && apt-get install file locales \
 5 |     #apt-transport-https and gnupg are for temurin; locales supplies en_US.UTF-8, which LANG/LC_ALL below name
 6 |     apt-transport-https gnupg -y && sed -i 's/^# *\(en_US.UTF-8 UTF-8\)/\1/' /etc/locale.gen && locale-gen
 7 | RUN go install github.com/richardlehane/siegfried/cmd/sf@latest && sf -update
 8 | 
 9 | RUN wget -O - https://packages.adoptium.net/artifactory/api/gpg/key/public | apt-key add - \
10 |     && echo "deb https://packages.adoptium.net/artifactory/deb $(awk -F= '/^VERSION_CODENAME/{print$2}' /etc/os-release) main" | tee /etc/apt/sources.list.d/adoptium.list \
11 |     && apt-get update && apt-get install temurin-11-jre -y
12 | 
13 | RUN mkdir /tika-bin
14 | COPY target/tika-pipes-siegfried-1.0.0-SNAPSHOT.jar /tika-bin
15 | COPY log4j2.xml /tika-bin
16 | COPY pipes-log4j2.xml /tika-bin
17 | 
18 | 
19 | ENV LANG en_US.UTF-8
20 | ENV LANGUAGE en_US:en
21 | ENV LC_ALL en_US.UTF-8
22 | 
23 | ENTRYPOINT ["java","-Dlog4j.configurationFile=/tika-bin/log4j2.xml", "-jar","/tika-bin/tika-pipes-siegfried-1.0.0-SNAPSHOT.jar"]
24 | #need to specify tika-config.xml on commandline, e.g.:
25 | #docker run -v /Users/blah/Desktop:/data -v /Users/blah/Desktop/config:/tika-config -p 2345:2345
26 | #--name tika-pipes-container tika-pipes-siegfried /tika-config/my-tika-config.xml
27 | 
28 | #WORKDIR /work
29 | 
30 | 


--------------------------------------------------------------------------------
/tika-containers/tika-pipes-siegfried/log4j2.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8" standalone="no" ?>
 2 | 
 3 | <!--
 4 |   Licensed to the Apache Software Foundation (ASF) under one
 5 |   or more contributor license agreements.  See the NOTICE file
 6 |   distributed with this work for additional information
 7 |   regarding copyright ownership.  The ASF licenses this file
 8 |   to you under the Apache License, Version 2.0 (the
 9 |   "License"); you may not use this file except in compliance
10 |   with the License.  You may obtain a copy of the License at
11 | 
12 |     http://www.apache.org/licenses/LICENSE-2.0
13 | 
14 |   Unless required by applicable law or agreed to in writing,
15 |   software distributed under the License is distributed on an
16 |   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17 |   KIND, either express or implied.  See the License for the
18 |   specific language governing permissions and limitations
19 |   under the License.
20 | -->
21 | <Configuration status="info">
22 |   <Appenders>
23 |     <Console name="Console" target="SYSTEM_ERR">
24 |       <PatternLayout pattern="%-5p [%t] %d{HH:mm:ss,SSS} %c %m%n"/>
25 |     </Console>
26 |   </Appenders>
27 |   <Loggers>
28 |     <Root level="info">
29 |       <AppenderRef ref="Console"/>
30 |     </Root>
31 |   </Loggers>
32 | </Configuration>


--------------------------------------------------------------------------------
/tika-containers/tika-pipes-siegfried/pipes-log4j2.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8" standalone="no" ?>
 2 | 
 3 | <!--
 4 |   Licensed to the Apache Software Foundation (ASF) under one
 5 |   or more contributor license agreements.  See the NOTICE file
 6 |   distributed with this work for additional information
 7 |   regarding copyright ownership.  The ASF licenses this file
 8 |   to you under the Apache License, Version 2.0 (the
 9 |   "License"); you may not use this file except in compliance
10 |   with the License.  You may obtain a copy of the License at
11 | 
12 |     http://www.apache.org/licenses/LICENSE-2.0
13 | 
14 |   Unless required by applicable law or agreed to in writing,
15 |   software distributed under the License is distributed on an
16 |   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17 |   KIND, either express or implied.  See the License for the
18 |   specific language governing permissions and limitations
19 |   under the License.
20 | -->
21 | <Configuration status="info">
22 |   <Appenders>
23 |     <File name="file" fileName="/data/logs/${sys:pipesClientId}-pipes.log">
24 |       <PatternLayout pattern="%-5p [%t] %d{HH:mm:ss,SSS} %c %m%n"/>
25 |     </File>
26 |     <Console name="Console" target="SYSTEM_ERR">
27 |       <PatternLayout pattern="%-5p [%t] %d{HH:mm:ss,SSS} %c %m%n"/>
28 |     </Console>
29 |   </Appenders>
30 |   <Loggers>
31 |     <Root level="info">
32 |       <AppenderRef ref="file"/>
33 |     </Root>
34 |   </Loggers>
35 | </Configuration>


--------------------------------------------------------------------------------
/tika-containers/tika-pypdf2/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM python:3.10.4-slim-buster
 2 | 
 3 | RUN apt-get update && \
 4 | apt-get install -y --no-install-recommends \
 5 |         openjdk-11-jre
 6 | 
 7 | #TODO
 8 | RUN python -m pip install --upgrade pip && pip install pypdf2==2.1.0
 9 | 
10 | RUN mkdir /pypdf2cli
11 | COPY scripts/PyPDF2Cli.py /pypdf2cli
12 | RUN chmod a+x /pypdf2cli/PyPDF2Cli.py
13 | 
14 | RUN mkdir /tika-bin
15 | 
16 | #find a more elegant way of grabbing this after we release it
17 | COPY tika-server-standard-2.1.1-SNAPSHOT.jar /tika-bin/tika-server-standard-2.1.1-SNAPSHOT.jar
18 | COPY my-tika-config.xml /tika-bin/my-tika-config.xml
19 | ENTRYPOINT ["java","-cp","/tika-bin/*", "org.apache.tika.server.core.TikaServerCli", "-h", "0.0.0.0", "-c", "/tika-bin/my-tika-config.xml"]
20 | 


--------------------------------------------------------------------------------
/tika-containers/tika-pypdf2/pom.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
 3 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
 4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
 5 |   <parent>
 6 |     <artifactId>file-observatory</artifactId>
 7 |     <groupId>org.tallison</groupId>
 8 |     <version>1.0.0-SNAPSHOT</version>
 9 |   </parent>
10 |   <modelVersion>4.0.0</modelVersion>
11 | 
12 |   <artifactId>tika-pypdf2</artifactId>
13 | 
14 |   <properties>
15 |     <maven.compiler.source>11</maven.compiler.source>
16 |     <maven.compiler.target>11</maven.compiler.target>
17 |   </properties>
18 | 
19 | </project>


--------------------------------------------------------------------------------
/tika-containers/tika-pypdf2/scripts/PyPDF2Cli.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | 
 3 | from PyPDF2 import PdfReader
 4 | 
 5 | reader = PdfReader(sys.argv[1])
 6 | 
 7 | # reading all the pages content one by one
 8 | with open(sys.argv[2], "w", encoding="utf-8") as output:
 9 |     for page in reader.pages:
10 |         output.write(page.extract_text())
11 |         output.write("\n")
12 | 


--------------------------------------------------------------------------------
/tool-runners/arlington/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM amd64/openjdk:11.0.8-slim-buster as GRAMMAR_CHECKER_BUILDER
 2 | 
 3 | RUN apt-get update && apt-get install g++-8 gcc-8 cmake git -y
 4 | 
 5 | RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-8 800 --slave /usr/bin/g++ g++ /usr/bin/g++-8
 6 | 
 7 | RUN git clone  https://github.com/pdf-association/arlington-pdf-model /arlington-pdf-model && \
 8 |     cd /arlington-pdf-model && git checkout 908a7be
 9 | 
10 | RUN cd /arlington-pdf-model/TestGrammar && \
11 |     cmake -B cmake-linux/debug -DPDFSDK_PDFIUM=ON -DCMAKE_BUILD_TYPE=Debug . && \
12 |     cmake --build cmake-linux/debug --config Debug
13 | 
14 | 
15 | FROM amd64/openjdk:11.0.8-slim-buster
16 | 
17 | COPY --from=GRAMMAR_CHECKER_BUILDER /arlington-pdf-model/TestGrammar/bin/linux /arlington-pdf-model/bin
18 | 
19 | COPY --from=GRAMMAR_CHECKER_BUILDER /arlington-pdf-model/tsv/latest /arlington-pdf-model/tsv/latest
20 | 
21 | COPY target/arlington-1.0.0-SNAPSHOT.jar /arlington-1.0.0-SNAPSHOT.jar
22 | 
23 | 
24 | ENTRYPOINT ["java","-jar","/arlington-1.0.0-SNAPSHOT.jar"]
25 | #WORKDIR /work
26 | # for debugging
27 | # docker run -it --entrypoint /bin/bash --name a2 -v /Users/.../Desktop/tool-runner-work:/data 806db3cdfa81
28 | 
29 | 


--------------------------------------------------------------------------------
/tool-runners/arlington/env.properties:
--------------------------------------------------------------------------------
1 | TIKA_CONFIG=/config/file-obs-tika.xml
2 | #if on windows or mac, use host.docker.internal instead of localhost
3 | #make sure to include the table name after the final :
4 | METADATA_WRITER_STRING=jdbc:postgresql://host.docker.internal:5432/exploratory?user=qwertyuiop&password=password1234
5 | NUM_THREADS=20
6 | IS_DELTA=true


--------------------------------------------------------------------------------
/tool-runners/arlington/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Root logger option
2 | log4j.rootLogger=info, stdout
3 | 
4 | # Direct log messages to stdout
5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
6 | log4j.appender.stdout.Target=System.out
7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
9 | 


--------------------------------------------------------------------------------
/tool-runners/caradoc/Dockerfile:
--------------------------------------------------------------------------------
 1 | #slight modification from:
 2 | FROM debian:stretch as CARADOC_BUILDER
 3 | RUN apt-get update &&\
 4 |     apt-get install -y\
 5 |         ocaml\
 6 |         opam\
 7 |         zlib1g-dev\
 8 |         libgmp-dev\
 9 |         pkg-config\
10 |         m4\
11 |         zlib1g-dev\
12 |         ocaml-findlib\
13 |         libcryptokit-ocaml-dev\
14 |         libounit-ocaml-dev\
15 |         libcurses-ocaml-dev\
16 |         menhir &&\
17 |     git clone --depth=1 --single-branch https://github.com/caradoc-org/caradoc.git
18 | WORKDIR /caradoc
19 | RUN make
20 | 
21 | 
22 | FROM amd64/openjdk:11.0.8-slim-buster
23 | COPY --from=CARADOC_BUILDER /caradoc/_build/src/main.native /usr/local/bin/caradoc
24 | # Install dependencies for caradoc binary
25 | RUN apt-get update &&\
26 |     apt-get install -y\
27 |         libtinfo5\
28 |         libncursesw5
29 | 
30 | 
31 | COPY target/caradoc-1.0.0-SNAPSHOT.jar /caradoc-1.0.0-SNAPSHOT.jar
32 | ENTRYPOINT ["java","-jar","/caradoc-1.0.0-SNAPSHOT.jar"]
33 | #e.g.
34 | #debug: docker run -it --entrypoint /bin/bash mutooltotext-container
35 | # docker build -t mutool-clean-image .
36 | # docker run -i -t --name mutool-clean-container -v ~/data/input:/input:ro -v ~/data/output:/output mutool-clean-image /opt/java/openjdk/bin/java -jar /mutoolclean-1.0.0-SNAPSHOT.jar /input /output/table.csv 10


--------------------------------------------------------------------------------
/tool-runners/caradoc/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Root logger option
2 | log4j.rootLogger=INFO, stdout
3 | 
4 | # Direct log messages to stdout
5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
6 | log4j.appender.stdout.Target=System.out
7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
9 | 


--------------------------------------------------------------------------------
/tool-runners/clamav/Dockerfile:
--------------------------------------------------------------------------------
 1 | #slight modification from:
 2 | #https://github.com/mko-x/docker-clamav/blob/master/alpine/main/Dockerfile
 3 | FROM alpine:3.12
 4 | LABEL maintainer="Markus Kosmal <code@m-ko.de>"
 5 | 
 6 | RUN apk add --no-cache openjdk11 bash clamav clamav-daemon rsyslog wget clamav-libunrar
 7 | 
 8 | COPY conf /etc/clamav
 9 | 
10 | RUN mkdir /var/run/clamav && \
11 |     chown clamav:clamav /var/run/clamav && \
12 |     chmod 750 /var/run/clamav
13 |     #&& \
14 |     #chown -R clamav:clamav bootstrap.sh check.sh /etc/clamav && \
15 |     #chmod u+x bootstrap.sh check.sh
16 | 
17 | RUN /usr/bin/freshclam
18 | #EXPOSE 3310/tcp
19 | 
20 | COPY target/clamav-1.0.0-SNAPSHOT.jar /clamav-1.0.0-SNAPSHOT.jar
21 | COPY exec.sh /exec.sh
22 | RUN ["chmod", "+x", "/exec.sh"]
23 | CMD ["/exec.sh"]
24 | 


--------------------------------------------------------------------------------
/tool-runners/clamav/conf/clam.conf:
--------------------------------------------------------------------------------
 1 | ###############
 2 | # General
 3 | ###############
 4 | 
 5 | DatabaseDirectory /var/lib/clamav
 6 | TemporaryDirectory /tmp
 7 | LogTime yes
 8 | PidFile /run/clamav/clamd.pid
 9 | LocalSocket /run/clamav/clamd.sock
10 | TCPSocket 3310
11 | Foreground no
12 | 
13 | ###############
14 | # Results
15 | ###############
16 | 
17 | DetectPUA yes
18 | ExcludePUA NetTool
19 | ExcludePUA PWTool
20 | AlgorithmicDetection yes
21 | Bytecode yes
22 | 
23 | ###############
24 | # Scan
25 | ###############
26 | 
27 | ScanPE yes
28 | DisableCertCheck yes
29 | ScanELF yes
30 | AlertBrokenExecutables yes
31 | ScanOLE2 yes
32 | ScanPDF yes
33 | ScanSWF yes
34 | ScanMail yes
35 | PhishingSignatures yes
36 | PhishingScanURLs yes
37 | ScanHTML yes
38 | ScanArchive yes
39 | 
40 | ###############
 41 | # Limits
42 | ###############
43 | 
44 | MaxScanSize 300M
45 | MaxFileSize 100M
46 | MaxRecursion 30
47 | MaxFiles 50000
48 | MaxEmbeddedPE 40M
49 | MaxHTMLNormalize 40M
50 | MaxHTMLNoTags 2M
51 | MaxScriptNormalize 5M
52 | MaxZipTypeRcg 1M
53 | MaxPartitions 128
54 | MaxIconsPE 200
55 | PCREMatchLimit 10000
56 | PCRERecMatchLimit 10000


--------------------------------------------------------------------------------
/tool-runners/clamav/conf/freshclam.conf:
--------------------------------------------------------------------------------
 1 | ###############
 2 | # General
 3 | ###############
 4 | 
 5 | DatabaseDirectory /var/lib/clamav
 6 | LogSyslog yes
 7 | LogTime yes
 8 | PidFile /run/clamav/freshclam.pid
 9 | 
10 | ###############
11 | # Updates
12 | ###############
13 | 
14 | DatabaseMirror database.clamav.net
15 | ScriptedUpdates yes
16 | NotifyClamd /etc/clamav/clamd.conf
17 | SafeBrowsing yes
18 | Bytecode yes


--------------------------------------------------------------------------------
/tool-runners/clamav/exec.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # copied from: https://github.com/mko-x/docker-clamav/blob/master/alpine/main/bootstrap.sh
 3 | #
 4 | # Container start-up script: optionally swaps in caller-supplied freshclam and
 5 | # clamd configuration files, starts clamd, then runs the clamav runner jar.
 6 | #
 7 | # Optional environment variables:
 8 | #   FRESHCLAM_CONF_FILE  replacement for /etc/clamav/freshclam.conf
 9 | #   CLAMD_CONF_FILE      replacement for /etc/clamav/clam.conf
10 | set -e
11 | 
12 | FRESHCLAM_CONF="/etc/clamav/freshclam.conf"
13 | # clamd is launched with this exact file below, so an override must be written
14 | # here; a copy placed at clamd.conf would never be read by the daemon.
15 | CLAMD_CONF="/etc/clamav/clam.conf"
16 | 
17 | if [[ -n "${FRESHCLAM_CONF_FILE}" ]]; then
18 |     echo "[bootstrap] FRESHCLAM_CONF_FILE set, copy to /etc/clamav/freshclam.conf"
19 |     # keep a backup, but do not abort under `set -e` when there is nothing to back up
20 |     if [[ -f "${FRESHCLAM_CONF}" ]]; then
21 |         mv "${FRESHCLAM_CONF}" "${FRESHCLAM_CONF}.bak"
22 |     fi
23 |     cp -f "${FRESHCLAM_CONF_FILE}" "${FRESHCLAM_CONF}"
24 | fi
25 | 
26 | if [[ -n "${CLAMD_CONF_FILE}" ]]; then
27 |     echo "[bootstrap] CLAMD_CONF_FILE set, copy to /etc/clamav/clam.conf"
28 |     if [[ -f "${CLAMD_CONF}" ]]; then
29 |         mv "${CLAMD_CONF}" "${CLAMD_CONF}.bak"
30 |     fi
31 |     cp -f "${CLAMD_CONF_FILE}" "${CLAMD_CONF}"
32 | fi
33 | 
34 | MAIN_FILE="/var/lib/clamav/main.cvd"
35 | 
36 | #if [ ! -f ${MAIN_FILE} ]; then
37 | #    echo "[bootstrap] Initial clam DB download."
38 | #    /usr/bin/freshclam
39 | #fi
40 | 
41 | #echo "[bootstrap] Schedule freshclam DB updater."
42 | #/usr/bin/freshclam -d -c 6
43 | 
44 | echo "[bootstrap] Run clamav daemon..."
45 | /usr/sbin/clamd -c "${CLAMD_CONF}"
46 | echo "[bootstrap] process the files!"
47 | java -jar /clamav-1.0.0-SNAPSHOT.jar


--------------------------------------------------------------------------------
/tool-runners/clamav/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Root logger option
2 | log4j.rootLogger=INFO, stdout
3 | 
4 | # Direct log messages to stdout
5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
6 | log4j.appender.stdout.Target=System.out
7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
9 | 


--------------------------------------------------------------------------------
/tool-runners/env.properties:
--------------------------------------------------------------------------------
1 | TIKA_CONFIG=/config/file-obs-tika.xml
2 | #if on windows or mac, use host.docker.internal instead of localhost
3 | #make sure to include the table name after the final :
4 | METADATA_WRITER_STRING=jdbc:postgresql://host.docker.internal:5432/exploratory?user=qwertyuiop&password=password1234
5 | NUM_THREADS=20
6 | IS_DELTA=true


--------------------------------------------------------------------------------
/tool-runners/fileprofiler/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM amd64/openjdk:11.0.8-slim-buster
2 | 
3 | COPY target/fileprofiler-1.0.0-SNAPSHOT.jar /fileprofiler-1.0.0-SNAPSHOT.jar
4 | 
5 | ENTRYPOINT ["java","-jar","/fileprofiler-1.0.0-SNAPSHOT.jar"]
6 | 


--------------------------------------------------------------------------------
/tool-runners/fileprofiler/README.txt:
--------------------------------------------------------------------------------
1 | Load basic provenance information -- file size, shasum, collection


--------------------------------------------------------------------------------
/tool-runners/fileprofiler/src/main/resources/log4j2.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8" standalone="no" ?>
 2 | 
 3 | <!--
 4 |   Licensed to the Apache Software Foundation (ASF) under one
 5 |   or more contributor license agreements.  See the NOTICE file
 6 |   distributed with this work for additional information
 7 |   regarding copyright ownership.  The ASF licenses this file
 8 |   to you under the Apache License, Version 2.0 (the
 9 |   "License"); you may not use this file except in compliance
10 |   with the License.  You may obtain a copy of the License at
11 | 
12 |     http://www.apache.org/licenses/LICENSE-2.0
13 | 
14 |   Unless required by applicable law or agreed to in writing,
15 |   software distributed under the License is distributed on an
16 |   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17 |   KIND, either express or implied.  See the License for the
18 |   specific language governing permissions and limitations
19 |   under the License.
20 | -->
21 | <Configuration status="WARN">
22 |   <Appenders>
23 |     <Console name="Console" target="SYSTEM_ERR">
24 |       <PatternLayout pattern="%-5p [%t] %d{HH:mm:ss,SSS} %c %m%n"/>
25 |     </Console>
26 |   </Appenders>
27 |   <Loggers>
28 |     <Root level="info">
29 |       <AppenderRef ref="Console"/>
30 |     </Root>
31 |   </Loggers>
32 | </Configuration>


--------------------------------------------------------------------------------
/tool-runners/gstotext/Dockerfile:
--------------------------------------------------------------------------------
 1 | 
 2 | FROM amd64/openjdk:11.0.8-slim-buster
 3 | RUN apt-get update && apt-get install wget -y
 4 |                         # &&\
 5 |                        #libopenjpeg5
 6 |                        #libstdc++6 && \
 7 |     #addgroup -S appgroup && \
 8 |     #adduser -S appuser -G appgroup -h /work && \
 9 |     #echo "/usr/local/lib64:/lib:/usr/local/lib:/usr/lib:/usr/lib/x86_64-linux-gnu:/lib64:/lib/x86_64-linux-gnu" > /etc/ld-musl-x86_64.path
10 | RUN wget https://github.com/ArtifexSoftware/ghostpdl-downloads/releases/download/gs9550/ghostscript-9.55.0-linux-x86_64.tgz \
11 |   && tar -xf ghostscript-9.55.0-linux-x86_64.tgz
12 | 
13 | COPY target/gstotext-1.0.0-SNAPSHOT.jar /gstotext-1.0.0-SNAPSHOT.jar
14 | 
15 | ENTRYPOINT ["java","-jar","/gstotext-1.0.0-SNAPSHOT.jar"]
16 | #WORKDIR /work
17 | 
18 | 


--------------------------------------------------------------------------------
/tool-runners/gstotext/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Root logger option
2 | log4j.rootLogger=INFO, stdout
3 | 
4 | # Direct log messages to stdout
5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
6 | log4j.appender.stdout.Target=System.out
7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
9 | 


--------------------------------------------------------------------------------
/tool-runners/itext/README.md:
--------------------------------------------------------------------------------
1 | This wrapper of iText's parser requires a commercial license key.
2 | 
3 | This code was not written nor used with the AGPL license.
4 | 
5 | Many thanks to iText for granting a custom evaluation license for this project.


--------------------------------------------------------------------------------
/tool-runners/itext/src/main/resources/META-INF/services/org.apache.tika.parser.Parser:
--------------------------------------------------------------------------------
 1 | #  Licensed to the Apache Software Foundation (ASF) under one or more
 2 | #  contributor license agreements.  See the NOTICE file distributed with
 3 | #  this work for additional information regarding copyright ownership.
 4 | #  The ASF licenses this file to You under the Apache License, Version 2.0
 5 | #  (the "License"); you may not use this file except in compliance with
 6 | #  the License.  You may obtain a copy of the License at
 7 | #
 8 | #       http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | #  Unless required by applicable law or agreed to in writing, software
11 | #  distributed under the License is distributed on an "AS IS" BASIS,
12 | #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | #  See the License for the specific language governing permissions and
14 | #  limitations under the License.
15 | org.tallison.tika.parser.itext.ITextParser


--------------------------------------------------------------------------------
/tool-runners/itext/src/test/resources/test-documents/testPDF.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tballison/file-observatory/5677da227b1a70b1220ae05005b6064210858504/tool-runners/itext/src/test/resources/test-documents/testPDF.pdf


--------------------------------------------------------------------------------
/tool-runners/mutoolclean/Dockerfile:
--------------------------------------------------------------------------------
 1 | #slight modification from:
 2 | #https://github.com/jay-eff/mutool/blob/master/Dockerfile
 3 | FROM alpine:3 as MUTOOL_BUILDER
 4 | MAINTAINER Jens Fischer
 5 | 
 6 | # install necessary packages and compile MuPDF, clean up afterwards
 7 | # include bash for debugging the build only
 8 | 
 9 | #get tags from here: http://git.ghostscript.com/?p=mupdf.git;a=summary
10 | #versions 1.19.0 1.18.0 1.17.0, 1.16.1, 1.16.0, 1.15.0, 1.14.0, 1.13.0, 1.12.0, 1.11.1
11 | ENV MUTOOL_VERSION 1.19.0
12 | RUN apk add --no-cache \
13 |         git \
14 |         make \
15 |         pkgconfig \
16 |         build-base \
17 |         bash \
18 | 	&& git clone -b ${MUTOOL_VERSION} https://github.com/ArtifexSoftware/mupdf \
19 |         && cd mupdf \
20 |         && git submodule update --init \
21 |         && make HAVE_X11=no HAVE_GLUT=no prefix=/usr/local install \
22 |         && cd / \
23 |         && rm -r mupdf \
24 |         && apk del \
25 |         git \
26 |         make \
27 |         pkgconfig \
28 |         build-base
29 | 
30 | FROM adoptopenjdk/openjdk11:alpine-slim
31 | COPY --from=MUTOOL_BUILDER /usr/local/bin /usr/local/bin
32 | COPY --from=MUTOOL_BUILDER /lib /lib
33 | 
34 | COPY target/mutoolclean-1.0.0-SNAPSHOT.jar /mutoolclean-1.0.0-SNAPSHOT.jar
35 | ENTRYPOINT ["java","-jar","/mutoolclean-1.0.0-SNAPSHOT.jar"]
36 | 
37 | #e.g.
38 | #debug: docker run -it --entrypoint /bin/bash mutooltotext-container
39 | # docker build -t mutool-clean-image .
40 | # docker run -i -t --name mutool-clean-container -v ~/data/input:/input:ro -v ~/data/output:/output mutool-clean-image /opt/java/openjdk/bin/java -jar /mutoolclean-1.0.0-SNAPSHOT.jar /input /output/table.csv 10


--------------------------------------------------------------------------------
/tool-runners/mutoolclean/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Root logger option
2 | log4j.rootLogger=INFO, stdout
3 | 
4 | # Direct log messages to stdout
5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
6 | log4j.appender.stdout.Target=System.out
7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
9 | 


--------------------------------------------------------------------------------
/tool-runners/mutooltext/Dockerfile:
--------------------------------------------------------------------------------
 1 | #slight modification from:
 2 | #https://github.com/jay-eff/mutool/blob/master/Dockerfile
 3 | FROM alpine:3 as MUPDF_BUILDER
 4 | MAINTAINER Jens Fischer
 5 | 
 6 | # install necessary packages and compile MuPDF, clean up afterwards
 7 | # include bash for debugging the build only
 8 | 
 9 | #get tags from here: http://git.ghostscript.com/?p=mupdf.git;a=summary
10 | #versions 1.19.0 1.18.0 1.17.0, 1.16.1, 1.16.0, 1.15.0, 1.14.0, 1.13.0, 1.12.0, 1.11.1
11 | ENV MUTOOL_VERSION 1.19.0
12 | RUN apk add --no-cache \
13 |         git \
14 |         make \
15 |         pkgconfig \
16 |         build-base \
17 |         bash \
18 | 	&& git clone -b ${MUTOOL_VERSION} https://github.com/ArtifexSoftware/mupdf \
19 |         && cd mupdf \
20 |         && git submodule update --init \
21 |         && make HAVE_X11=no HAVE_GLUT=no prefix=/usr/local install \
22 |         && cd / \
23 |         && rm -r mupdf \
24 |         && apk del \
25 |         git \
26 |         make \
27 |         pkgconfig \
28 |         build-base
29 | 
30 | FROM adoptopenjdk/openjdk11:alpine-slim
31 | COPY --from=MUPDF_BUILDER /usr/local/bin /usr/local/bin
32 | COPY --from=MUPDF_BUILDER /lib /lib
33 | 
34 | COPY target/mutooltext-1.0.0-SNAPSHOT.jar /mutooltext-1.0.0-SNAPSHOT.jar
35 | ENTRYPOINT ["java","-jar","/mutooltext-1.0.0-SNAPSHOT.jar"]
36 | #RUN apk update && apk add bash
37 | # e.g.
38 | # docker build -t mutool-text-image .
39 | # docker run -i -t --name mutool-text-container -v ~/data/input:/input:ro -v ~/data/output:/output mutool-text-image /opt/java/openjdk/bin/java -jar /mutooltext-1.0.0-SNAPSHOT.jar /input /output/txt /output/table.csv 10
40 | 


--------------------------------------------------------------------------------
/tool-runners/mutooltext/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Root logger option
2 | log4j.rootLogger=INFO, stdout
3 | 
4 | # Direct log messages to stdout
5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
6 | log4j.appender.stdout.Target=System.out
7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
9 | 


--------------------------------------------------------------------------------
/tool-runners/pdfbytes/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM amd64/openjdk:11.0.8-slim-buster
2 | 
3 | COPY target/pdfbytes-1.0.0-SNAPSHOT.jar /pdfbytes-1.0.0-SNAPSHOT.jar
4 | 
5 | ENTRYPOINT ["java","-jar","/pdfbytes-1.0.0-SNAPSHOT.jar"]
6 | 


--------------------------------------------------------------------------------
/tool-runners/pdfbytes/src/test/java/org/tallison/pdfutils/TestVersionUnpacker.java:
--------------------------------------------------------------------------------
 1 | package org.tallison.pdfutils;
 2 | 
 3 | 
 4 | import org.apache.tika.io.TikaInputStream;
 5 | import org.junit.Test;
 6 | 
 7 | import java.io.ByteArrayInputStream;
 8 | import java.io.InputStream;
 9 | import java.nio.charset.StandardCharsets;
10 | import java.nio.file.Path;
11 | import java.nio.file.Paths;
12 | import java.util.ArrayList;
13 | import java.util.List;
14 | 
15 | /**
16 |  * Smoke tests for the PDF byte utilities. Both tests print their results to
17 |  * stdout for manual inspection; neither makes assertions.
18 |  */
19 | public class TestVersionUnpacker {
20 | 
21 |     /**
22 |      * Prints whatever {@code PDFByteSniffer.getJson} reports for the bundled
23 |      * {@code /pdf-puzzle.pdf} test resource.
24 |      */
25 |     @Test
26 |     public void testVersions() throws Exception {
27 |         Path puzzlePdf = Paths.get(
28 |                 TestVersionUnpacker.class.getResource("/pdf-puzzle.pdf").toURI());
29 |         System.out.println(PDFByteSniffer.getJson(puzzlePdf));
30 |     }
31 | 
32 |     /**
33 |      * Searches one stream twice for {@code %%EOF}; the input starts with a
34 |      * partial match ({@code %%%EO}) followed by two complete occurrences.
35 |      * Each search result is printed.
36 |      */
37 |     @Test
38 |     public void testBackTracking() throws Exception {
39 |         byte[] needle = "%%EOF".getBytes(StandardCharsets.UTF_8);
40 |         byte[] haystack = "%%%EO%%EOF%%EOF".getBytes(StandardCharsets.UTF_8);
41 |         InputStream in = new ByteArrayInputStream(haystack);
42 |         StreamSearcher searcher = new StreamSearcher(needle);
43 |         // two consecutive searches over the same, partially consumed stream
44 |         for (int attempt = 0; attempt < 2; attempt++) {
45 |             System.out.println(searcher.search(in));
46 |         }
47 |     }
48 | 
49 | }
50 | 


--------------------------------------------------------------------------------
/tool-runners/pdfbytes/src/test/resources/pdf-puzzle.pdf:
--------------------------------------------------------------------------------
  1 | %PDF-1.1
  2 | 
  3 | 1 0 obj
  4 | <<
  5 |  /Type /Catalog
  6 |  /Outlines 2 0 R
  7 |  /Pages 3 0 R
  8 | >>
  9 | endobj
 10 | 
 11 | 2 0 obj
 12 | <<
 13 |  /Type /Outlines
 14 |  /Count 0
 15 | >>
 16 | endobj
 17 | 
 18 | 3 0 obj
 19 | <<
 20 |  /Type /Pages
 21 |  /Kids [4 0 R]
 22 |  /Count 1
 23 | >>
 24 | endobj
 25 | 
 26 | 4 0 obj
 27 | <<
 28 |  /Type /Page
 29 |  /Parent 3 0 R
 30 |  /MediaBox [0 0 612 792]
 31 |  /Contents 5 0 R
 32 |  /Resources <<
 33 |              /ProcSet [/PDF /Text]
 34 |              /Font << /F1 6 0 R >>
 35 |             >>
 36 | >>
 37 | endobj
 38 | 
 39 | 5 0 obj
 40 | <<
 41 |  /Length 89
 42 |  /Filter /ASCII85Decode
 43 | >>
 44 | stream
 45 | 6<#'\7PQ#@1a#b0+>GQ(+?(u.+B2ko-rakk+E1b1F)Yf5@<6!&BlbCgDI[]uD.RU,@;I&dE+EC!ATK:C<,*OE;u~>
 46 | endstream
 47 | endobj
 48 | 
 49 | 6 0 obj
 50 | <<
 51 |  /Type /Font
 52 |  /Subtype /Type1
 53 |  /Name /F1
 54 |  /BaseFont /Helvetica
 55 |  /Encoding /MacRomanEncoding
 56 | >>
 57 | endobj
 58 | 
 59 | xref
 60 | 0 7
 61 | 0000000000 65535 f
 62 | 0000000012 00000 n
 63 | 0000000089 00000 n
 64 | 0000000145 00000 n
 65 | 0000000214 00000 n
 66 | 0000000419 00000 n
 67 | 0000000594 00000 n
 68 | trailer
 69 | <<
 70 |  /Size 7
 71 |  /Root 1 0 R
 72 | >>
 73 | startxref
 74 | 718
 75 | %%EOF
 76 | 
 77 | 5 0 obj
 78 | <<
 79 |  /Length 89
 80 |  /Filter /ASCII85Decode
 81 | >>
 82 | stream
 83 | 6<#'\7PQ#@1a#b0+>GQ(+?(u.+B2ko-rakk+E1b1F)Yf5@<6!&BlbD!=BJ[-=BJ[-=BJ[-=BJ[-=BI!p<,*OE;u~>
 84 | endstream
 85 | endobj
 86 | 
 87 | xref
 88 | 0 1
 89 | 0000000000 65535 f
 90 | 5 1
 91 | 0000000935 00000 n
 92 | trailer
 93 | <<
 94 |  /Size 7
 95 |  /Root 1 0 R
 96 |  /Prev 718
 97 | >>
 98 | startxref
 99 | 1110
100 | %%EOF
101 | 


--------------------------------------------------------------------------------
/tool-runners/pdfchecker/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM amd64/openjdk:11.0.8-slim-buster
 2 | 
 3 | #wrapper around: https://www.datalogics.com/products/pdf-tools/pdf-checker/
 4 | #need to accept license, install it on linux and then tgz the binary
 5 | #directory that is installed
 6 | 
 7 | #I'm not including pdf-checker.tgz in my repo because of license
 8 | #requirements
 9 | 
10 | RUN mkdir /pdfchecker-bin
11 | 
12 | COPY pdf-checker.tgz /pdfchecker-bin/pdf-checker.tgz
13 | RUN cd /pdfchecker-bin && tar -xzvf pdf-checker.tgz
14 | 
15 | COPY target/pdfchecker-1.0.0-SNAPSHOT.jar /pdfchecker-1.0.0-SNAPSHOT.jar
16 | # to run against a single file:
17 | #/pdfchecker-bin/PDF_Checker/pdfchecker -j /pdfchecker-bin/PDF_Checker/CheckerProfiles/everything.json -i <input_file.pdf> -s <output.json>
18 | ENTRYPOINT ["java","-jar","/pdfchecker-1.0.0-SNAPSHOT.jar"]


--------------------------------------------------------------------------------
/tool-runners/pdfchecker/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Root logger option
2 | log4j.rootLogger=INFO, stdout
3 | 
4 | # Direct log messages to stdout
5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
6 | log4j.appender.stdout.Target=System.out
7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
9 | 


--------------------------------------------------------------------------------
/tool-runners/pdfcpu/Dockerfile:
--------------------------------------------------------------------------------
 1 | # Dockerfile References: https://docs.docker.com/engine/reference/builder/
 2 | 
 3 | # Start from a golang base image
 4 | FROM golang:1.16.6 as builder
 5 | 
 6 | # install
 7 | 
 8 | #RUN go get github.com/pdfcpu/pdfcpu/cmd/...
 9 | RUN git clone -b v0.3.12 --depth 1 https://github.com/pdfcpu/pdfcpu /pdfcpu
10 | RUN cd /pdfcpu && git checkout tags/v0.3.12 -b v0.3.12-tag
11 | #WORKDIR $GOPATH/src/github.com/pdfcpu/pdfcpu/cmd/pdfcpu
12 | RUN cd /pdfcpu/cmd/pdfcpu && CGO_ENABLED=0 GOOS=linux go build -a -o pdfcpu .
13 | 
14 | ######## Start a new stage from scratch #######
15 | 
16 | FROM alpine:latest
17 | 
18 | RUN apk --no-cache add ca-certificates openjdk11
19 | 
20 | WORKDIR /root/
21 | 
22 | # Copy the Pre-built binary file from the previous stage
23 | COPY --from=builder /pdfcpu/cmd/pdfcpu .
24 | 
25 | # Command to run the executable
26 | #CMD ["./pdfcpu"]
27 | 
28 | COPY target/pdfcpu-1.0.0-SNAPSHOT.jar /pdfcpu-1.0.0-SNAPSHOT.jar
29 | 
30 | ENTRYPOINT ["java","-jar","/pdfcpu-1.0.0-SNAPSHOT.jar"]
31 | 


--------------------------------------------------------------------------------
/tool-runners/pdfcpu/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Root logger option
2 | log4j.rootLogger=INFO, stdout
3 | 
4 | # Direct log messages to stdout
5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
6 | log4j.appender.stdout.Target=System.out
7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
9 | 


--------------------------------------------------------------------------------
/tool-runners/pdffonts/Dockerfile:
--------------------------------------------------------------------------------
 1 | #slight modification from:
 2 | #https://stackoverflow.com/questions/61272431/installing-poppler-utils-of-version-0-82-in-docker
 3 | #See also: https://github.com/quarkness/docker-poppler/blob/master/Dockerfile
 4 | FROM debian:buster-20211220-slim as POPPLER_BUILDER
 5 | ## can't build w 22.01.0 with the current dependencies...need to figure out how to
 6 | # migrate to 22.x
 7 | ENV POPPLER_VERSION=21.12.0
 8 | ENV POPPLER_DATA_VERSION=0.4.11
 9 | RUN apt-get update && apt-get install bash wget build-essential cmake libfreetype6-dev pkg-config libfontconfig-dev libjpeg-dev libopenjp2-7-dev -y
10 | RUN wget https://poppler.freedesktop.org/poppler-data-${POPPLER_DATA_VERSION}.tar.gz \
11 |     && tar -xf poppler-data-${POPPLER_DATA_VERSION}.tar.gz \
12 |     && cd poppler-data-${POPPLER_DATA_VERSION} \
13 |     && make install \
14 |     && cd .. \
15 |     && wget https://poppler.freedesktop.org/poppler-${POPPLER_VERSION}.tar.xz \
16 |     && tar -xf poppler-${POPPLER_VERSION}.tar.xz \
17 |     && cd poppler-${POPPLER_VERSION} \
18 |     && mkdir build \
19 |     && cd build  \
20 |     && cmake -DENABLE_BOOST=OFF ..\
21 |     && make \
22 |     && make install \
23 |     && ldconfig
24 | 
25 | FROM amd64/openjdk:11.0.8-slim-buster
26 | COPY --from=POPPLER_BUILDER /usr/lib /usr/lib
27 | COPY --from=POPPLER_BUILDER /usr/local /usr/local
28 | 
29 | RUN apt-get update && apt-get install bash ca-certificates \
30 |                        libjpeg62-turbo libcairo2 libxml2 \
31 |                        fontconfig liblcms2-2 \
32 |                        libtiff5 -y
33 |                         # &&\
34 |                        #libopenjpeg5
35 |                        #libstdc++6 && \
36 |     #addgroup -S appgroup && \
37 |     #adduser -S appuser -G appgroup -h /work && \
38 |     #echo "/usr/local/lib64:/lib:/usr/local/lib:/usr/lib:/usr/lib/x86_64-linux-gnu:/lib64:/lib/x86_64-linux-gnu" > /etc/ld-musl-x86_64.path
39 | 
40 | COPY target/pdffonts-1.0.0-SNAPSHOT.jar /pdffonts-1.0.0-SNAPSHOT.jar
41 | 
42 | 
43 | ENTRYPOINT ["java","-jar","/pdffonts-1.0.0-SNAPSHOT.jar"]
44 | #WORKDIR /work
45 | 
46 | 


--------------------------------------------------------------------------------
/tool-runners/pdffonts/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Root logger option
2 | log4j.rootLogger=INFO, stdout
3 | 
4 | # Direct log messages to stdout
5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
6 | log4j.appender.stdout.Target=System.out
7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
9 | 


--------------------------------------------------------------------------------
/tool-runners/pdfid/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM python:3.9.1-slim-buster
 2 | 
 3 | #TODO make more efficient by factoring out a build w git, etc
 4 | RUN mkdir -p /usr/share/man/man1 /usr/share/man/man2 /pdfid
 5 | 
 6 | RUN apt-get update && \
 7 | apt-get install -y --no-install-recommends \
 8 |         openjdk-11-jre git
 9 | 
10 | RUN cd /pdfid && \
11 |     git clone https://github.com/DidierStevens/DidierStevensSuite.git didierstevens && \
12 |         cd /pdfid/didierstevens && \
13 |         git checkout 5f81a8f7a8aac15b580413f6f3a2ec3d72c5d10c
14 | 
15 | COPY target/pdfid-1.0.0-SNAPSHOT.jar /pdfid-1.0.0-SNAPSHOT.jar
16 | 
17 | ENTRYPOINT ["java","-jar","/pdfid-1.0.0-SNAPSHOT.jar"]
18 | 
19 | #for debugging
20 | #docker run -it --entrypoint /bin/bash
21 | 
22 | 


--------------------------------------------------------------------------------
/tool-runners/pdfid/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Root logger option
2 | log4j.rootLogger=INFO, stdout
3 | 
4 | # Direct log messages to stdout
5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
6 | log4j.appender.stdout.Target=System.out
7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
9 | 


--------------------------------------------------------------------------------
/tool-runners/pdfimages/Dockerfile:
--------------------------------------------------------------------------------
 1 | #slight modification from:
 2 | #https://stackoverflow.com/questions/61272431/installing-poppler-utils-of-version-0-82-in-docker
 3 | #See also: https://github.com/quarkness/docker-poppler/blob/master/Dockerfile
 4 | FROM debian:buster-20211220-slim as POPPLER_BUILDER
 5 | ## can't build w 22.01.0 with the current dependencies...need to figure out how to
 6 | # migrate to 22.x
 7 | ENV POPPLER_VERSION=21.12.0
 8 | ENV POPPLER_DATA_VERSION=0.4.11
 9 | RUN apt-get update && apt-get install bash wget build-essential cmake libfreetype6-dev pkg-config libfontconfig-dev libjpeg-dev libopenjp2-7-dev -y
10 | RUN wget https://poppler.freedesktop.org/poppler-data-${POPPLER_DATA_VERSION}.tar.gz \
11 |     && tar -xf poppler-data-${POPPLER_DATA_VERSION}.tar.gz \
12 |     && cd poppler-data-${POPPLER_DATA_VERSION} \
13 |     && make install \
14 |     && cd .. \
15 |     && wget https://poppler.freedesktop.org/poppler-${POPPLER_VERSION}.tar.xz \
16 |     && tar -xf poppler-${POPPLER_VERSION}.tar.xz \
17 |     && cd poppler-${POPPLER_VERSION} \
18 |     && mkdir build \
19 |     && cd build  \
20 |     && cmake -DENABLE_BOOST=OFF ..\
21 |     && make \
22 |     && make install \
23 |     && ldconfig
24 | 
25 | FROM amd64/openjdk:11.0.8-slim-buster
26 | COPY --from=POPPLER_BUILDER /usr/lib /usr/lib
27 | COPY --from=POPPLER_BUILDER /usr/local /usr/local
28 | 
29 | RUN apt-get update && apt-get install bash ca-certificates \
30 |                        libjpeg62-turbo libcairo2 libxml2 \
31 |                        fontconfig liblcms2-2 \
32 |                        libtiff5 -y
33 |                         # &&\
34 |                        #libopenjpeg5
35 |                        #libstdc++6 && \
36 |     #addgroup -S appgroup && \
37 |     #adduser -S appuser -G appgroup -h /work && \
38 |     #echo "/usr/local/lib64:/lib:/usr/local/lib:/usr/lib:/usr/lib/x86_64-linux-gnu:/lib64:/lib/x86_64-linux-gnu" > /etc/ld-musl-x86_64.path
39 | 
40 | COPY target/pdfimages-1.0.0-SNAPSHOT.jar /pdfimages-1.0.0-SNAPSHOT.jar
41 | 
42 | 
43 | ENTRYPOINT ["java","-jar","/pdfimages-1.0.0-SNAPSHOT.jar"]
44 | #WORKDIR /work
45 | 
46 | 


--------------------------------------------------------------------------------
/tool-runners/pdfimages/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Root logger option
2 | log4j.rootLogger=DEBUG, stdout
3 | 
4 | # Direct log messages to stdout
5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
6 | log4j.appender.stdout.Target=System.out
7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
9 | 


--------------------------------------------------------------------------------
/tool-runners/pdfinfo/Dockerfile:
--------------------------------------------------------------------------------
 1 | #slight modification from:
 2 | #https://stackoverflow.com/questions/61272431/installing-poppler-utils-of-version-0-82-in-docker
 3 | #See also: https://github.com/quarkness/docker-poppler/blob/master/Dockerfile
 4 | FROM debian:bullseye-20230227-slim as POPPLER_BUILDER
 5 | ## NOTE(review): the following note appears stale -- POPPLER_VERSION below is 23.03.0. Original note:
 6 | # can't build w 22.01.0 with the current dependencies...need to figure out how to migrate to 22.x
 7 | ENV POPPLER_VERSION=23.03.0
 8 | ENV POPPLER_DATA_VERSION=0.4.12
 9 | RUN apt-get update && apt-get install bash wget build-essential cmake libfreetype6-dev pkg-config  \
10 |     libfontconfig-dev libjpeg-dev libopenjp2-7-dev  \
11 |     #these are for temurin
12 |     apt-transport-https gnupg -y
13 | RUN wget https://poppler.freedesktop.org/poppler-data-${POPPLER_DATA_VERSION}.tar.gz \
14 |     && tar -xf poppler-data-${POPPLER_DATA_VERSION}.tar.gz \
15 |     && cd poppler-data-${POPPLER_DATA_VERSION} \
16 |     && make install \
17 |     && cd .. \
18 |     && wget https://poppler.freedesktop.org/poppler-${POPPLER_VERSION}.tar.xz \
19 |     && tar -xf poppler-${POPPLER_VERSION}.tar.xz \
20 |     && cd poppler-${POPPLER_VERSION} \
21 |     && mkdir build \
22 |     && cd build  \
23 |     && cmake -DENABLE_BOOST=OFF ..\
24 |     && make \
25 |     && make install \
26 |     && ldconfig
27 | 
28 | RUN wget -O - https://packages.adoptium.net/artifactory/api/gpg/key/public | apt-key add - \
29 |     && echo "deb https://packages.adoptium.net/artifactory/deb $(awk -F= '/^VERSION_CODENAME/{print$2}' /etc/os-release) main" | tee /etc/apt/sources.list.d/adoptium.list \
30 |     && apt-get update && apt-get install temurin-11-jre -y
31 | 
32 | COPY target/pdfinfo-1.0.0-SNAPSHOT.jar /pdfinfo-1.0.0-SNAPSHOT.jar
33 | 
34 | 
35 | ENTRYPOINT ["java","-jar","/pdfinfo-1.0.0-SNAPSHOT.jar"]
36 | #WORKDIR /work
37 | 
38 | 


--------------------------------------------------------------------------------
/tool-runners/pdfinfo/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Root logger option
2 | log4j.rootLogger=INFO, stdout
3 | 
4 | # Direct log messages to stdout
5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
6 | log4j.appender.stdout.Target=System.out
7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
9 | 


--------------------------------------------------------------------------------
/tool-runners/pdfminerdump/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM python:3.9.1-slim-buster
 2 | 
 3 | RUN pip install pdfminer.six==20201018
 4 | 
 5 | RUN mkdir -p /usr/share/man/man1 /usr/share/man/man2
 6 | 
 7 | RUN apt-get update && \
 8 | apt-get install -y --no-install-recommends \
 9 |         openjdk-11-jre
10 | 
11 | COPY target/pdfminerdump-1.0.0-SNAPSHOT.jar /pdfminerdump-1.0.0-SNAPSHOT.jar
12 | 
13 | ENTRYPOINT ["java","-jar","/pdfminerdump-1.0.0-SNAPSHOT.jar"]
14 | 
15 | 


--------------------------------------------------------------------------------
/tool-runners/pdfminerdump/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Root logger option
2 | log4j.rootLogger=INFO, stdout
3 | 
4 | # Direct log messages to stdout
5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
6 | log4j.appender.stdout.Target=System.out
7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
9 | 


--------------------------------------------------------------------------------
/tool-runners/pdfminertext/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM python:3.9.1-slim-buster
 2 | 
 3 | RUN pip install pdfminer.six==20201018
 4 | 
 5 | RUN mkdir -p /usr/share/man/man1 /usr/share/man/man2
 6 | 
 7 | RUN apt-get update && \
 8 | apt-get install -y --no-install-recommends \
 9 |         openjdk-11-jre
10 | 
11 | COPY target/pdfminertext-1.0.0-SNAPSHOT.jar /pdfminertext-1.0.0-SNAPSHOT.jar
12 | 
13 | ENTRYPOINT ["java","-jar","/pdfminertext-1.0.0-SNAPSHOT.jar"]
14 | 
15 | 


--------------------------------------------------------------------------------
/tool-runners/pdfminertext/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Root logger option
2 | log4j.rootLogger=INFO, stdout
3 | 
4 | # Direct log messages to stdout
5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
6 | log4j.appender.stdout.Target=System.out
7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
9 | 


--------------------------------------------------------------------------------
/tool-runners/pdfresurrect/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM amd64/openjdk:11.0.8-slim-buster
 2 | 
 3 | RUN apt-get update && \
 4 | apt-get install -y --no-install-recommends \
 5 |       pdfresurrect
 6 | 
 7 | COPY target/pdfresurrect-1.0.0-SNAPSHOT.jar /pdfresurrect-1.0.0-SNAPSHOT.jar
 8 | 
 9 | ENTRYPOINT ["java","-jar","/pdfresurrect-1.0.0-SNAPSHOT.jar"]
10 | 
11 | 


--------------------------------------------------------------------------------
/tool-runners/pdfresurrect/env.properties:
--------------------------------------------------------------------------------
1 | TIKA_CONFIG=/config/tika-tika-config.xml
2 | #if on Windows or Mac, use host.docker.internal instead of localhost
3 | #make sure to include the table name after the final ':' -- NOTE(review): the example value below does not appear to include one
4 | METADATA_WRITER_STRING=jdbc:postgresql://host.docker.internal:2345/somedb?user=qwertyuiop&password=qwertyuiop
5 | NUM_THREADS=20
6 | IS_DELTA=false


--------------------------------------------------------------------------------
/tool-runners/pdfresurrect/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Root logger option
2 | log4j.rootLogger=INFO, stdout
3 | 
4 | # Direct log messages to stdout
5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
6 | log4j.appender.stdout.Target=System.out
7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
9 | 


--------------------------------------------------------------------------------
/tool-runners/pdftoppm/Dockerfile:
--------------------------------------------------------------------------------
 1 | #slight modification from:
 2 | #https://stackoverflow.com/questions/61272431/installing-poppler-utils-of-version-0-82-in-docker
 3 | #See also: https://github.com/quarkness/docker-poppler/blob/master/Dockerfile
 4 | FROM debian:buster-20211220-slim as POPPLER_BUILDER
 5 | ## can't build w 22.01.0 with the current dependencies...need to figure out how to
 6 | # migrate to 22.x
 7 | ENV POPPLER_VERSION=21.12.0
 8 | ENV POPPLER_DATA_VERSION=0.4.11
 9 | RUN apt-get update && apt-get install bash wget build-essential cmake libfreetype6-dev pkg-config libfontconfig-dev libjpeg-dev libopenjp2-7-dev -y
10 | RUN wget https://poppler.freedesktop.org/poppler-data-${POPPLER_DATA_VERSION}.tar.gz \
11 |     && tar -xf poppler-data-${POPPLER_DATA_VERSION}.tar.gz \
12 |     && cd poppler-data-${POPPLER_DATA_VERSION} \
13 |     && make install \
14 |     && cd .. \
15 |     && wget https://poppler.freedesktop.org/poppler-${POPPLER_VERSION}.tar.xz \
16 |     && tar -xf poppler-${POPPLER_VERSION}.tar.xz \
17 |     && cd poppler-${POPPLER_VERSION} \
18 |     && mkdir build \
19 |     && cd build  \
20 |     && cmake -DENABLE_BOOST=OFF ..\
21 |     && make \
22 |     && make install \
23 |     && ldconfig
24 | 
25 | FROM amd64/openjdk:11.0.8-slim-buster
26 | COPY --from=POPPLER_BUILDER /usr/lib /usr/lib
27 | COPY --from=POPPLER_BUILDER /usr/local /usr/local
28 | 
29 | RUN apt-get update && apt-get install bash ca-certificates \
30 |                        libjpeg62-turbo libcairo2 libxml2 \
31 |                        fontconfig liblcms2-2 \
32 |                        libtiff5 -y
33 |                         # &&\
34 |                        #libopenjpeg5
35 |                        #libstdc++6 && \
36 |     #addgroup -S appgroup && \
37 |     #adduser -S appuser -G appgroup -h /work && \
38 |     #echo "/usr/local/lib64:/lib:/usr/local/lib:/usr/lib:/usr/lib/x86_64-linux-gnu:/lib64:/lib/x86_64-linux-gnu" > /etc/ld-musl-x86_64.path
39 | 
40 | COPY target/pdftoppm-1.0.0-SNAPSHOT.jar /pdftoppm-1.0.0-SNAPSHOT.jar
41 | 
42 | 
43 | ENTRYPOINT ["java","-jar","/pdftoppm-1.0.0-SNAPSHOT.jar"]
44 | #WORKDIR /work
45 | 


--------------------------------------------------------------------------------
/tool-runners/pdftoppm/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Root logger option
2 | log4j.rootLogger=DEBUG, stdout
3 | 
4 | # Direct log messages to stdout
5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
6 | log4j.appender.stdout.Target=System.out
7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
9 | 


--------------------------------------------------------------------------------
/tool-runners/pdftops/Dockerfile:
--------------------------------------------------------------------------------
 1 | #slight modification from:
 2 | #https://stackoverflow.com/questions/61272431/installing-poppler-utils-of-version-0-82-in-docker
 3 | #See also: https://github.com/quarkness/docker-poppler/blob/master/Dockerfile
 4 | FROM debian:buster-20211220-slim as POPPLER_BUILDER
 5 | ## can't build w 22.01.0 with the current dependencies...need to figure out how to
 6 | # migrate to 22.x
 7 | ENV POPPLER_VERSION=21.12.0
 8 | ENV POPPLER_DATA_VERSION=0.4.11
 9 | RUN apt-get update && apt-get install bash wget build-essential cmake libfreetype6-dev pkg-config libfontconfig-dev libjpeg-dev libopenjp2-7-dev -y
10 | RUN wget https://poppler.freedesktop.org/poppler-data-${POPPLER_DATA_VERSION}.tar.gz \
11 |     && tar -xf poppler-data-${POPPLER_DATA_VERSION}.tar.gz \
12 |     && cd poppler-data-${POPPLER_DATA_VERSION} \
13 |     && make install \
14 |     && cd .. \
15 |     && wget https://poppler.freedesktop.org/poppler-${POPPLER_VERSION}.tar.xz \
16 |     && tar -xf poppler-${POPPLER_VERSION}.tar.xz \
17 |     && cd poppler-${POPPLER_VERSION} \
18 |     && mkdir build \
19 |     && cd build  \
20 |     && cmake -DENABLE_BOOST=OFF ..\
21 |     && make \
22 |     && make install \
23 |     && ldconfig
24 | #CMD tail -f /dev/null
25 | 
26 | FROM amd64/openjdk:11.0.8-slim-buster
27 | COPY --from=POPPLER_BUILDER /usr/lib /usr/lib
28 | COPY --from=POPPLER_BUILDER /usr/local /usr/local
29 | 
30 | RUN apt-get update && apt-get install bash ca-certificates \
31 |                        libjpeg62-turbo libcairo2 libxml2 \
32 |                        fontconfig liblcms2-2 \
33 |                        libtiff5 -y
34 |                         # &&\
35 |                        #libopenjpeg5
36 |                        #libstdc++6 && \
37 |     #addgroup -S appgroup && \
38 |     #adduser -S appuser -G appgroup -h /work && \
39 |     #echo "/usr/local/lib64:/lib:/usr/local/lib:/usr/lib:/usr/lib/x86_64-linux-gnu:/lib64:/lib/x86_64-linux-gnu" > /etc/ld-musl-x86_64.path
40 | 
41 | COPY target/pdftops-1.0.0-SNAPSHOT.jar /pdftops-1.0.0-SNAPSHOT.jar
42 | 
43 | 
44 | ENTRYPOINT ["java","-jar","/pdftops-1.0.0-SNAPSHOT.jar"]
45 | #WORKDIR /work
46 | 
47 | 


--------------------------------------------------------------------------------
/tool-runners/pdftops/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Root logger option
2 | log4j.rootLogger=DEBUG, stdout
3 | 
4 | # Direct log messages to stdout
5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
6 | log4j.appender.stdout.Target=System.out
7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
9 | 


--------------------------------------------------------------------------------
/tool-runners/pdftotext/Dockerfile:
--------------------------------------------------------------------------------
 1 | #slight modification from:
 2 | #https://stackoverflow.com/questions/61272431/installing-poppler-utils-of-version-0-82-in-docker
 3 | #See also: https://github.com/quarkness/docker-poppler/blob/master/Dockerfile
 4 | FROM debian:buster-20211220-slim as POPPLER_BUILDER
 5 | ## can't build w 22.01.0 with the current dependencies...need to figure out how to
 6 | # migrate to 22.x
 7 | ENV POPPLER_VERSION=21.12.0
 8 | ENV POPPLER_DATA_VERSION=0.4.11
 9 | RUN apt-get update && apt-get install bash wget build-essential cmake libfreetype6-dev pkg-config libfontconfig-dev libjpeg-dev libopenjp2-7-dev -y
10 | RUN wget https://poppler.freedesktop.org/poppler-data-${POPPLER_DATA_VERSION}.tar.gz \
11 |     && tar -xf poppler-data-${POPPLER_DATA_VERSION}.tar.gz \
12 |     && cd poppler-data-${POPPLER_DATA_VERSION} \
13 |     && make install \
14 |     && cd .. \
15 |     && wget https://poppler.freedesktop.org/poppler-${POPPLER_VERSION}.tar.xz \
16 |     && tar -xf poppler-${POPPLER_VERSION}.tar.xz \
17 |     && cd poppler-${POPPLER_VERSION} \
18 |     && mkdir build \
19 |     && cd build  \
20 |     && cmake -DENABLE_BOOST=OFF ..\
21 |     && make \
22 |     && make install \
23 |     && ldconfig
24 | #CMD tail -f /dev/null
25 | 
26 | FROM amd64/openjdk:11.0.8-slim-buster
27 | COPY --from=POPPLER_BUILDER /usr/lib /usr/lib
28 | COPY --from=POPPLER_BUILDER /usr/local /usr/local
29 | 
30 | RUN apt-get update && apt-get install bash ca-certificates \
31 |                        libjpeg62-turbo libcairo2 libxml2 \
32 |                        fontconfig liblcms2-2 \
33 |                        libtiff5 -y
34 |                         # &&\
35 |                        #libopenjpeg5
36 |                        #libstdc++6 && \
37 |     #addgroup -S appgroup && \
38 |     #adduser -S appuser -G appgroup -h /work && \
39 |     #echo "/usr/local/lib64:/lib:/usr/local/lib:/usr/lib:/usr/lib/x86_64-linux-gnu:/lib64:/lib/x86_64-linux-gnu" > /etc/ld-musl-x86_64.path
40 | 
41 | COPY target/pdftotext-1.0.0-SNAPSHOT.jar /pdftotext-1.0.0-SNAPSHOT.jar
42 | 
43 | 
44 | ENTRYPOINT ["java","-jar","/pdftotext-1.0.0-SNAPSHOT.jar"]
45 | #WORKDIR /work
46 | 
47 | 


--------------------------------------------------------------------------------
/tool-runners/pdftotext/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Root logger option
2 | log4j.rootLogger=INFO, stdout
3 | 
4 | # Direct log messages to stdout
5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
6 | log4j.appender.stdout.Target=System.out
7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
9 | 


--------------------------------------------------------------------------------
/tool-runners/polyfile/Dockerfile:
--------------------------------------------------------------------------------
 1 | # this should be cleaned up dramatically
 2 | # I tried to build polyfile in a base container and then
 3 | # copy the right bits into the final image, but I couldn't figure
 4 | # out how to get all the dependencies...so this is backwards
 5 | # from the other docker files: build the java first, then
 6 | # copy that jar into the build container for polyfile.
 7 | 
 8 | FROM python:3.10.4-alpine3.15
 9 | RUN apk add --no-cache \
10 |  #       git \
11 |         bash \
12 |         libffi-dev \
13 |         zlib \
14 |         build-base py-pip jpeg-dev zlib-dev \
15 |         openjdk11-jre
16 | #	&& git clone -b v0.1.6 https://github.com/trailofbits/polyfile.git
17 | 
18 | 
19 | ENV LIBRARY_PATH=/lib:/usr/lib
20 | 
21 | #RUN cd polyfile && pip3 install -e .
22 | 
23 | RUN pip3 install polyfile==0.4.2
24 | 
25 | COPY target/polyfile-1.0.0-SNAPSHOT.jar /polyfile-1.0.0-SNAPSHOT.jar
26 | 
27 | ENTRYPOINT ["java","-jar","/polyfile-1.0.0-SNAPSHOT.jar"]
28 | 


--------------------------------------------------------------------------------
/tool-runners/polyfile/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Root logger option
2 | log4j.rootLogger=INFO, stdout
3 | 
4 | # Direct log messages to stdout
5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
6 | log4j.appender.stdout.Target=System.out
7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
9 | 


--------------------------------------------------------------------------------
/tool-runners/qpdf/Dockerfile:
--------------------------------------------------------------------------------
 1 | #fortunately the latest qpdf is available prebuilt (for now)
 2 | # for future reference, to build it from source instead, start with something like this:
 3 |    #curl g++ \
 4 |   ##	&& curl -o qpdf-10.0.1.tgz https://gigenet.dl.sourceforge.net/project/qpdf/qpdf/10.0.1/qpdf-10.0.1.tar.gz \
 5 |   # #   && tar -xzvf qpdf-10.0.1.tgz
 6 |   #
 7 |   ##RUN cd qpdf-10.0.1 && \
 8 |   # #   ./configure
 9 |   #
10 |   ##RUN make install
11 | 
12 | #alpine version dictates which qpdf version is available.
13 | #see e.g. https://pkgs.alpinelinux.org/packages?name=qpdf&branch=v3.13
14 | #to search for a match
15 | FROM alpine:edge
16 | RUN apk add --no-cache \
17 |     qpdf=11.1.1-r0 \
18 |     openjdk11-jre
19 | 
20 | 
21 | COPY target/qpdf-1.0.0-SNAPSHOT.jar /qpdf-1.0.0-SNAPSHOT.jar
22 | 
23 | ENTRYPOINT ["java","-jar","/qpdf-1.0.0-SNAPSHOT.jar"]
24 | 
25 | 
26 | # e.g.
27 | # docker build -t qpdf-image .
28 | 
29 | # docker run --name qpdf-container --network host --env-file env.properties -v /data/docs:/input -v /data/meta/qpdf/json:/output qpdf-image


--------------------------------------------------------------------------------
/tool-runners/qpdf/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Root logger option
2 | log4j.rootLogger=INFO, stdout
3 | 
4 | # Direct log messages to stdout
5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
6 | log4j.appender.stdout.Target=System.out
7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
9 | 


--------------------------------------------------------------------------------
/tool-runners/tika-client/src/main/resources/log4j2.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8" standalone="no" ?>
 2 | 
 3 | <!--
 4 |   Licensed to the Apache Software Foundation (ASF) under one
 5 |   or more contributor license agreements.  See the NOTICE file
 6 |   distributed with this work for additional information
 7 |   regarding copyright ownership.  The ASF licenses this file
 8 |   to you under the Apache License, Version 2.0 (the
 9 |   "License"); you may not use this file except in compliance
10 |   with the License.  You may obtain a copy of the License at
11 | 
12 |     http://www.apache.org/licenses/LICENSE-2.0
13 | 
14 |   Unless required by applicable law or agreed to in writing,
15 |   software distributed under the License is distributed on an
16 |   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17 |   KIND, either express or implied.  See the License for the
18 |   specific language governing permissions and limitations
19 |   under the License.
20 | -->
21 | <Configuration status="WARN">
22 |   <Appenders>
23 |     <Console name="Console" target="SYSTEM_ERR">
24 |       <PatternLayout pattern="%-5p [%t] %d{HH:mm:ss,SSS} %c %m%n"/>
25 |     </Console>
26 |   </Appenders>
27 |   <Loggers>
28 |     <Root level="debug">
29 |       <AppenderRef ref="Console"/>
30 |     </Root>
31 |   </Loggers>
32 | </Configuration>


--------------------------------------------------------------------------------
/tool-runners/tika/src/main/resources/log4j2.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <Configuration>
 3 |     <Appenders>
 4 |         <CONSOLE name="STDOUT">
 5 |             <PatternLayout pattern="%d{yyyy-MM-dd HH:mm:ss} %t %-5p %c{1}:%L - %m%n"/>
 6 |         </CONSOLE>
 7 |     </Appenders>
 8 |     <Loggers>
 9 |         <Root level="debug">
10 |             <AppenderRef ref="STDOUT"/>
11 |         </Root>
12 |     </Loggers>
13 | </Configuration>


--------------------------------------------------------------------------------
/tool-runners/xpdffonts/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM amd64/openjdk:11.0.8-slim-buster
 2 | 
 3 | 
 4 | RUN apt-get update && apt-get install -y wget ghostscript
 5 | 
 6 | RUN mkdir /pkg && cd /pkg && \
 7 |     wget https://dl.xpdfreader.com/xpdf-tools-linux-4.03.tar.gz && \
 8 |     tar -xzvf xpdf-tools-linux-4.03.tar.gz && \
 9 |     mv xpdf-tools-linux-4.03 /opt/xpdf-tools-linux-4.03
10 | 
11 | RUN mkdir /usr/local/share/ghostscript && \
12 |     mkdir /usr/local/share/ghostscript/fonts
13 | 
14 | COPY tgzs/xpdf-t1fonts/*.pfb /usr/local/share/ghostscript/fonts/
15 | 
16 | COPY xpdfrc /usr/local/etc/xpdfrc
17 | 
18 | COPY xpdf /usr/local/share/xpdf
19 | 
20 | 
21 | ENV PATH "${PATH}:/opt/xpdf-tools-linux-4.03/bin64"
22 | 
23 | 
24 | COPY target/xpdffonts-1.0.0-SNAPSHOT.jar /xpdffonts-1.0.0-SNAPSHOT.jar
25 | 
26 | 
27 | ENTRYPOINT ["java","-jar","/xpdffonts-1.0.0-SNAPSHOT.jar"]
28 | #WORKDIR /work
29 | 
30 | 


--------------------------------------------------------------------------------
/tool-runners/xpdffonts/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Root logger option
2 | log4j.rootLogger=INFO, stdout
3 | 
4 | # Direct log messages to stdout
5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
6 | log4j.appender.stdout.Target=System.out
7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
9 | 


--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-arabic.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tballison/file-observatory/5677da227b1a70b1220ae05005b6064210858504/tool-runners/xpdffonts/tgzs/xpdf-arabic.tar.gz


--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-arabic/README:
--------------------------------------------------------------------------------
 1 | Xpdf: Arabic support package
 2 | ============================
 3 | 
 4 | Xpdf project: http://www.foolabs.com/xpdf/
 5 | 2011-aug-15
 6 | 
 7 | If this package includes CMap files, they contain their own copyright
 8 | notices and distribution conditions.  All other files in the package
 9 | are Copyright 2002 Glyph & Cog, LLC, and are licensed under the GNU
10 | General Public License (GPL), version 2 or 3.
11 | 
12 | This package provides support files needed to use the Xpdf tools with
13 | Arabic PDF files.
14 | 
15 | Contents:
16 | - ISO-8859-6 encoding
17 | 
18 | Place all of these files in a directory, typically:
19 | 
20 |     Unix - /usr/local/share/xpdf/arabic
21 |     Win32 - C:\Program Files\xpdf\arabic
22 | 
23 | Add the contents of the "add-to-xpdfrc" file to your system-wide
24 | xpdfrc config file, which is typically:
25 | 
26 |     Unix - /usr/local/etc/xpdfrc
27 |     Win32 - C:\Program Files\xpdf\xpdfrc
28 | 
29 | Alternatively, on Unix systems you can add these lines to your
30 | personal xpdfrc file in $HOME/.xpdfrc.
31 | 
32 | Make sure to edit the added lines to use the actual directory where
33 | the files were installed.
34 | 


--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-arabic/add-to-xpdfrc:
--------------------------------------------------------------------------------
1 | #----- begin Arabic support package (2011-aug-15)
2 | unicodeMap	ISO-8859-6	/usr/local/share/xpdf/arabic/ISO-8859-6.unicodeMap
3 | #----- end Arabic support package
4 | 


--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-chinese-simplified.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tballison/file-observatory/5677da227b1a70b1220ae05005b6064210858504/tool-runners/xpdffonts/tgzs/xpdf-chinese-simplified.tar.gz


--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-chinese-simplified/README:
--------------------------------------------------------------------------------
 1 | Xpdf: Chinese Simplified support package
 2 | ========================================
 3 | 
 4 | Xpdf project: http://www.foolabs.com/xpdf/
 5 | 2020-dec-22
 6 | 
 7 | If this package includes CMap files, they contain their own copyright
 8 | notices and distribution conditions.  All other files in the package
 9 | are Copyright 2002-2004 Glyph & Cog, LLC, and are licensed under the
10 | GNU General Public License (GPL), version 2 or 3.
11 | 
12 | This package provides support files needed to use the Xpdf tools with
13 | Chinese (Simplified) PDF files.
14 | 
15 | Contents:
16 | - Adobe-GB1 character collection support
17 | - ISO-2022-CN encoding
18 | - EUC-CN encoding
19 | - GBK encoding
20 | 
21 | Place all of these files in a directory, typically:
22 | 
23 |     Unix - /usr/local/share/xpdf/chinese-simplified
24 |     Win32 - C:\Program Files\xpdf\chinese-simplified
25 | 
26 | Add the contents of the "add-to-xpdfrc" file to your system-wide
27 | xpdfrc config file, which is typically:
28 | 
29 |     Unix - /usr/local/etc/xpdfrc
30 |     Win32 - C:\Program Files\xpdf\xpdfrc
31 | 
32 | Alternatively, on Unix systems you can add these lines to your
33 | personal xpdfrc file in $HOME/.xpdfrc.
34 | 
35 | Make sure to edit the added lines to use the actual directory where
36 | the files were installed.
37 | 
38 | To display PDF files that refer to non-embedded Chinese fonts, you
39 | will need to install a Chinese font.  Free TrueType/OpenType fonts are
40 | available:
41 | 
42 |     http://ftp.gnu.org/gnu/non-gnu/chinese-fonts-truetype/gkai00mp.ttf.gz
43 |     http://ftp.gnu.org/gnu/non-gnu/chinese-fonts-truetype/gbsn00lp.ttf.gz
44 |     https://www.google.com/get/noto/
45 | 
46 | After installing a Chinese font, add an appropriate "fontFileCC" line
47 | to your xpdfrc file (see the sample in "add-to-xpdfrc").
48 | 


--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-chinese-simplified/add-to-xpdfrc:
--------------------------------------------------------------------------------
 1 | #----- begin Chinese Simplified support package (2011-sep-02)
 2 | cidToUnicode	Adobe-GB1	/usr/local/share/xpdf/chinese-simplified/Adobe-GB1.cidToUnicode
 3 | unicodeMap	ISO-2022-CN	/usr/local/share/xpdf/chinese-simplified/ISO-2022-CN.unicodeMap
 4 | unicodeMap	EUC-CN		/usr/local/share/xpdf/chinese-simplified/EUC-CN.unicodeMap
 5 | unicodeMap	GBK		/usr/local/share/xpdf/chinese-simplified/GBK.unicodeMap
 6 | cMapDir		Adobe-GB1	/usr/local/share/xpdf/chinese-simplified/CMap
 7 | toUnicodeDir			/usr/local/share/xpdf/chinese-simplified/CMap
 8 | #fontFileCC	Adobe-GB1	/usr/..../NotoSansCJKsc-Regular.otf
 9 | #----- end Chinese Simplified support package
10 | 


--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-chinese-traditional.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tballison/file-observatory/5677da227b1a70b1220ae05005b6064210858504/tool-runners/xpdffonts/tgzs/xpdf-chinese-traditional.tar.gz


--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-chinese-traditional/README:
--------------------------------------------------------------------------------
 1 | Xpdf: Chinese Traditional support package
 2 | =========================================
 3 | 
 4 | Xpdf project: http://www.foolabs.com/xpdf/
 5 | 2020-dec-22
 6 | 
 7 | If this package includes CMap files, they contain their own copyright
 8 | notices and distribution conditions.  All other files in the package
 9 | are Copyright 2002-2004 Glyph & Cog, LLC, and are licensed under the
10 | GNU General Public License (GPL), version 2 or 3.
11 | 
12 | This package provides support files needed to use the Xpdf tools with
13 | Chinese (Traditional) PDF files.
14 | 
15 | Contents:
16 | - Adobe-CNS1 character collection support
17 | - Big5 encoding
18 | - Big5ascii encoding (same as Big5, but includes 7-bit ASCII)
19 | 
20 | Place all of these files in a directory, typically:
21 | 
22 |     Unix - /usr/local/share/xpdf/chinese-traditional
23 |     Win32 - C:\Program Files\xpdf\chinese-traditional
24 | 
25 | Add the contents of the "add-to-xpdfrc" file to your system-wide
26 | xpdfrc config file, which is typically:
27 | 
28 |     Unix - /usr/local/etc/xpdfrc
29 |     Win32 - C:\Program Files\xpdf\xpdfrc
30 | 
31 | Alternatively, on Unix systems you can add these lines to your
32 | personal xpdfrc file in $HOME/.xpdfrc.
33 | 
34 | Make sure to edit the added lines to use the actual directory where
35 | the files were installed.
36 | 
37 | To display PDF files that refer to non-embedded Chinese fonts, you
38 | will need to install a Chinese font.  Free TrueType/OpenType fonts are
39 | available:
40 | 
41 |     http://ftp.gnu.org/gnu/non-gnu/chinese-fonts-truetype/bkai00mp.ttf.gz
42 |     http://ftp.gnu.org/gnu/non-gnu/chinese-fonts-truetype/bsmi00lp.ttf.gz
43 |     https://www.google.com/get/noto/
44 | 
45 | After installing a Chinese font, add an appropriate "fontFileCC" line
46 | to your xpdfrc file (see the sample in "add-to-xpdfrc").
47 | 


--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-chinese-traditional/add-to-xpdfrc:
--------------------------------------------------------------------------------
1 | #----- begin Chinese Traditional support package (2011-sep-02)
2 | cidToUnicode	Adobe-CNS1	/usr/local/share/xpdf/chinese-traditional/Adobe-CNS1.cidToUnicode
3 | unicodeMap	Big5		/usr/local/share/xpdf/chinese-traditional/Big5.unicodeMap
4 | unicodeMap	Big5ascii	/usr/local/share/xpdf/chinese-traditional/Big5ascii.unicodeMap
5 | cMapDir		Adobe-CNS1	/usr/local/share/xpdf/chinese-traditional/CMap
6 | toUnicodeDir			/usr/local/share/xpdf/chinese-traditional/CMap
7 | #fontFileCC	Adobe-CNS1	/usr/..../NotoSansCJKtc-Regular.otf
8 | #----- end Chinese Traditional support package
9 | 


--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-cyrillic.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tballison/file-observatory/5677da227b1a70b1220ae05005b6064210858504/tool-runners/xpdffonts/tgzs/xpdf-cyrillic.tar.gz


--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-cyrillic/README:
--------------------------------------------------------------------------------
 1 | Xpdf: Cyrillic support package
 2 | ==============================
 3 | 
 4 | Xpdf project: http://www.foolabs.com/xpdf/
 5 | 2011-aug-15
 6 | 
 7 | If this package includes CMap files, they contain their own copyright
 8 | notices and distribution conditions.  All other files in the package
 9 | are Copyright 2002 Glyph & Cog, LLC, and are licensed under the GNU
10 | General Public License (GPL), version 2 or 3.
11 | 
12 | This package provides support files needed to use the Xpdf tools with
13 | Cyrillic PDF files.
14 | 
15 | Contents:
16 | - Bulgarian character names
17 | - KOI8-R encoding
18 | 
19 | Place all of these files in a directory, typically:
20 | 
21 |     Unix - /usr/local/share/xpdf/cyrillic
22 |     Win32 - C:\Program Files\xpdf\cyrillic
23 | 
24 | Add the contents of the "add-to-xpdfrc" file to your system-wide
25 | xpdfrc config file, which is typically:
26 | 
27 |     Unix - /usr/local/etc/xpdfrc
28 |     Win32 - C:\Program Files\xpdf\xpdfrc
29 | 
30 | Alternatively, on Unix systems you can add these lines to your
31 | personal xpdfrc file in $HOME/.xpdfrc.
32 | 
33 | Make sure to edit the added lines to use the actual directory where
34 | the files were installed.
35 | 


--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-cyrillic/add-to-xpdfrc:
--------------------------------------------------------------------------------
1 | #----- begin Cyrillic support package (2011-aug-15)
2 | nameToUnicode			/usr/local/share/xpdf/cyrillic/Bulgarian.nameToUnicode
3 | unicodeMap	KOI8-R		/usr/local/share/xpdf/cyrillic/KOI8-R.unicodeMap
4 | #----- end Cyrillic support package
5 | 


--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-greek.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tballison/file-observatory/5677da227b1a70b1220ae05005b6064210858504/tool-runners/xpdffonts/tgzs/xpdf-greek.tar.gz


--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-greek/README:
--------------------------------------------------------------------------------
 1 | Xpdf: Greek support package
 2 | ===========================
 3 | 
 4 | Xpdf project: http://www.foolabs.com/xpdf/
 5 | 2011-aug-15
 6 | 
 7 | If this package includes CMap files, they contain their own copyright
 8 | notices and distribution conditions.  All other files in the package
 9 | are Copyright 2002 Glyph & Cog, LLC, and are licensed under the GNU
10 | General Public License (GPL), version 2 or 3.
11 | 
12 | This package provides support files needed to use the Xpdf tools with
13 | Greek PDF files.
14 | 
15 | Contents:
16 | - Greek character names (alternates)
17 | - ISO-8859-7 encoding
18 | 
19 | Place all of these files in a directory, typically:
20 | 
21 |     Unix - /usr/local/share/xpdf/greek
22 |     Win32 - C:\Program Files\xpdf\greek
23 | 
24 | Add the contents of the "add-to-xpdfrc" file to your system-wide
25 | xpdfrc config file, which is typically:
26 | 
27 |     Unix - /usr/local/etc/xpdfrc
28 |     Win32 - C:\Program Files\xpdf\xpdfrc
29 | 
30 | Alternatively, on Unix systems you can add these lines to your
31 | personal xpdfrc file in $HOME/.xpdfrc.
32 | 
33 | Make sure to edit the added lines to use the actual directory where
34 | the files were installed.
35 | 


--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-greek/add-to-xpdfrc:
--------------------------------------------------------------------------------
1 | #----- begin Greek support package (2011-aug-15)
2 | nameToUnicode			/usr/local/share/xpdf/greek/Greek.nameToUnicode
3 | unicodeMap	ISO-8859-7	/usr/local/share/xpdf/greek/ISO-8859-7.unicodeMap
4 | #----- end Greek support package
5 | 


--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-hebrew.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tballison/file-observatory/5677da227b1a70b1220ae05005b6064210858504/tool-runners/xpdffonts/tgzs/xpdf-hebrew.tar.gz


--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-hebrew/README:
--------------------------------------------------------------------------------
 1 | Xpdf: Hebrew support package
 2 | ============================
 3 | 
 4 | Xpdf project: http://www.foolabs.com/xpdf/
 5 | 2011-aug-15
 6 | 
 7 | If this package includes CMap files, they contain their own copyright
 8 | notices and distribution conditions.  All other files in the package
 9 | are Copyright 2002 Glyph & Cog, LLC, and are licensed under the GNU
10 | General Public License (GPL), version 2 or 3.
11 | 
12 | This package provides support files needed to use the Xpdf tools with
13 | Hebrew PDF files.
14 | 
15 | Contents:
16 | - ISO-8859-8 encoding
17 | - Windows-1255 encoding
18 | 
19 | Place all of these files in a directory, typically:
20 | 
21 |     Unix - /usr/local/share/xpdf/hebrew
22 |     Win32 - C:\Program Files\xpdf\hebrew
23 | 
24 | Add the contents of the "add-to-xpdfrc" file to your system-wide
25 | xpdfrc config file, which is typically:
26 | 
27 |     Unix - /usr/local/etc/xpdfrc
28 |     Win32 - C:\Program Files\xpdf\xpdfrc
29 | 
30 | Alternatively, on Unix systems you can add these lines to your
31 | personal xpdfrc file in $HOME/.xpdfrc.
32 | 
33 | Make sure to edit the added lines to use the actual directory where
34 | the files were installed.
35 | 


--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-hebrew/add-to-xpdfrc:
--------------------------------------------------------------------------------
1 | #----- begin Hebrew support package (2011-aug-15)
2 | unicodeMap	ISO-8859-8	/usr/local/share/xpdf/hebrew/ISO-8859-8.unicodeMap
3 | unicodeMap	Windows-1255	/usr/local/share/xpdf/hebrew/Windows-1255.unicodeMap
4 | #----- end Hebrew support package
5 | 


--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-japanese.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tballison/file-observatory/5677da227b1a70b1220ae05005b6064210858504/tool-runners/xpdffonts/tgzs/xpdf-japanese.tar.gz


--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-japanese/README:
--------------------------------------------------------------------------------
 1 | Xpdf: Japanese support package
 2 | ==============================
 3 | 
 4 | Xpdf project: http://www.foolabs.com/xpdf/
 5 | 2020-dec-22
 6 | 
 7 | If this package includes CMap files, they contain their own copyright
 8 | notices and distribution conditions.  All other files in the package
 9 | are Copyright 2002-2004 Glyph & Cog, LLC, and are licensed under the
10 | GNU General Public License (GPL), version 2 or 3.
11 | 
12 | This package provides support files needed to use the Xpdf tools with
13 | Japanese PDF files.
14 | 
15 | Contents:
16 | - Adobe-Japan1 character collection support
17 | - ISO-2022-JP encoding
18 | - EUC-JP encoding
19 | - Shift-JIS encoding
20 | 
21 | Place all of these files in a directory, typically:
22 | 
23 |     Unix - /usr/local/share/xpdf/japanese
24 |     Win32 - C:\Program Files\xpdf\japanese
25 | 
26 | Add the contents of the "add-to-xpdfrc" file to your system-wide
27 | xpdfrc config file, which is typically:
28 | 
29 |     Unix - /usr/local/etc/xpdfrc
30 |     Win32 - C:\Program Files\xpdf\xpdfrc
31 | 
32 | Alternatively, on Unix systems you can add these lines to your
33 | personal xpdfrc file in $HOME/.xpdfrc.
34 | 
35 | Make sure to edit the added lines to use the actual directory where
36 | the files were installed.
37 | 
38 | To display PDF files that refer to non-embedded Japanese fonts, you
39 | will need to install a Japanese font.  Free TrueType/OpenType fonts
40 | are available:
41 | 
42 |     http://packages.debian.org/stable/x11/ttf-kochi-mincho
43 |     http://packages.debian.org/stable/x11/ttf-kochi-gothic
44 |     https://www.google.com/get/noto/
45 | 
46 | After installing a Japanese font, add an appropriate "fontFileCC" line
47 | to your xpdfrc file (see the sample in "add-to-xpdfrc").
48 | 


--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-japanese/add-to-xpdfrc:
--------------------------------------------------------------------------------
 1 | #----- begin Japanese support package (2011-sep-02)
 2 | cidToUnicode	Adobe-Japan1	/usr/local/share/xpdf/japanese/Adobe-Japan1.cidToUnicode
 3 | unicodeMap	ISO-2022-JP	/usr/local/share/xpdf/japanese/ISO-2022-JP.unicodeMap
 4 | unicodeMap	EUC-JP		/usr/local/share/xpdf/japanese/EUC-JP.unicodeMap
 5 | unicodeMap	Shift-JIS	/usr/local/share/xpdf/japanese/Shift-JIS.unicodeMap
 6 | cMapDir		Adobe-Japan1	/usr/local/share/xpdf/japanese/CMap
 7 | toUnicodeDir			/usr/local/share/xpdf/japanese/CMap
 8 | #fontFileCC	Adobe-Japan1	/usr/..../NotoSansCJKjp-Regular.otf
 9 | #----- end Japanese support package
10 | 


--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-korean.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tballison/file-observatory/5677da227b1a70b1220ae05005b6064210858504/tool-runners/xpdffonts/tgzs/xpdf-korean.tar.gz


--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-korean/README:
--------------------------------------------------------------------------------
 1 | Xpdf: Korean support package
 2 | ============================
 3 | 
 4 | Xpdf project: http://www.foolabs.com/xpdf/
 5 | 2020-dec-22
 6 | 
 7 | If this package includes CMap files, they contain their own copyright
 8 | notices and distribution conditions.  All other files in the package
 9 | are Copyright 2002-2005 Glyph & Cog, LLC, and are licensed under the
10 | GNU General Public License (GPL), version 2 or 3.
11 | 
12 | This package provides support files needed to use the Xpdf tools with
13 | Korean PDF files.
14 | 
15 | Contents:
16 | - Adobe-Korea1 character collection support
17 | - Adobe-KR character collection support
18 | - ISO-2022-KR encoding
19 | 
20 | Place all of these files in a directory, typically:
21 | 
22 |     Unix - /usr/local/share/xpdf/korean
23 |     Win32 - C:\Program Files\xpdf\korean
24 | 
25 | Add the contents of the "add-to-xpdfrc" file to your system-wide
26 | xpdfrc config file, which is typically:
27 | 
28 |     Unix - /usr/local/etc/xpdfrc
29 |     Win32 - C:\Program Files\xpdf\xpdfrc
30 | 
31 | Alternatively, on Unix systems you can add these lines to your
32 | personal xpdfrc file in $HOME/.xpdfrc.
33 | 
34 | Make sure to edit the added lines to use the actual directory where
35 | the files were installed.
36 | 
37 | To display PDF files that refer to non-embedded Korean fonts, you will
38 | need to install a Korean font.  Free TrueType/OpenType fonts are
39 | available:
40 | 
41 |     ftp://ftp.mizi.com/pub/baekmuk/baekmuk-ttf-2.1.tar.gz
42 |     https://www.google.com/get/noto/
43 | 
44 | After installing a Korean font, add appropriate "fontFileCC"
45 | lines to your xpdfrc file (see the sample in "add-to-xpdfrc").
46 | 


--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-korean/add-to-xpdfrc:
--------------------------------------------------------------------------------
 1 | #----- begin Korean support package (2011-sep-02)
 2 | cidToUnicode	Adobe-Korea1	/usr/local/share/xpdf/korean/Adobe-Korea1.cidToUnicode
 3 | cidToUnicode	Adobe-KR	/usr/local/share/xpdf/korean/Adobe-KR.cidToUnicode
 4 | unicodeMap	ISO-2022-KR	/usr/local/share/xpdf/korean/ISO-2022-KR.unicodeMap
 5 | cMapDir		Adobe-Korea1	/usr/local/share/xpdf/korean/CMap
 6 | cMapDir		Adobe-KR	/usr/local/share/xpdf/korean/CMap
 7 | toUnicodeDir			/usr/local/share/xpdf/korean/CMap
 8 | #fontFileCC	Adobe-Korea1	/usr/..../NotoSansCJKkr-Regular.otf
 9 | #fontFileCC	Adobe-KR	/usr/..../NotoSansCJKkr-Regular.otf
10 | #----- end Korean support package
11 | 


--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-latin2.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tballison/file-observatory/5677da227b1a70b1220ae05005b6064210858504/tool-runners/xpdffonts/tgzs/xpdf-latin2.tar.gz


--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-latin2/README:
--------------------------------------------------------------------------------
 1 | Xpdf: Latin2 support package
 2 | ============================
 3 | 
 4 | Xpdf project: http://www.foolabs.com/xpdf/
 5 | 2011-aug-15
 6 | 
 7 | If this package includes CMap files, they contain their own copyright
 8 | notices and distribution conditions.  All other files in the package
 9 | are Copyright 2002 Glyph & Cog, LLC, and are licensed under the GNU
10 | General Public License (GPL), version 2 or 3.
11 | 
12 | This package provides support files needed to use the Xpdf tools with
13 | Latin2 PDF files.
14 | 
15 | Contents:
16 | - Latin2 encoding
17 | 
18 | Place all of these files in a directory, typically:
19 | 
20 |     Unix - /usr/local/share/xpdf/latin2
21 |     Win32 - C:\Program Files\xpdf\latin2
22 | 
23 | Add the contents of the "add-to-xpdfrc" file to your system-wide
24 | xpdfrc config file, which is typically:
25 | 
26 |     Unix - /usr/local/etc/xpdfrc
27 |     Win32 - C:\Program Files\xpdf\xpdfrc
28 | 
29 | Alternatively, on Unix systems you can add these lines to your
30 | personal xpdfrc file in $HOME/.xpdfrc.
31 | 
32 | Make sure to edit the added lines to use the actual directory where
33 | the files were installed.
34 | 


--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-latin2/add-to-xpdfrc:
--------------------------------------------------------------------------------
1 | #----- begin Latin2 support package (2011-aug-15)
2 | unicodeMap	Latin2	/usr/local/share/xpdf/latin2/Latin2.unicodeMap
3 | #----- end Latin2 support package
4 | 


--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-t1fonts.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tballison/file-observatory/5677da227b1a70b1220ae05005b6064210858504/tool-runners/xpdffonts/tgzs/xpdf-t1fonts.tar.gz


--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-t1fonts/README:
--------------------------------------------------------------------------------
 1 | This package contains two fonts:
 2 | 
 3 |     s050000l.pfb -- Symbol
 4 |     d050000l.pfb -- Zapf Dingbats
 5 | 
 6 | These fonts are substitutes for the corresponding Base-14 fonts.  They
 7 | are part of the font set contributed to the ghostscript project by
 8 | URW++ Design and Development Incorporated of Hamburg, Germany
 9 | (http://www.urwpp.de/).  They have been released under the GNU General
10 | Public License (GPL) v2 -- see the "COPYING" file.
11 | 


--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-t1fonts/d050000l.pfb:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tballison/file-observatory/5677da227b1a70b1220ae05005b6064210858504/tool-runners/xpdffonts/tgzs/xpdf-t1fonts/d050000l.pfb


--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-t1fonts/s050000l.pfb:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tballison/file-observatory/5677da227b1a70b1220ae05005b6064210858504/tool-runners/xpdffonts/tgzs/xpdf-t1fonts/s050000l.pfb


--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-thai.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tballison/file-observatory/5677da227b1a70b1220ae05005b6064210858504/tool-runners/xpdffonts/tgzs/xpdf-thai.tar.gz


--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-thai/README:
--------------------------------------------------------------------------------
 1 | Xpdf: Thai support package
 2 | ==========================
 3 | 
 4 | Xpdf project: http://www.foolabs.com/xpdf/
 5 | 2011-aug-15
 6 | 
 7 | If this package includes CMap files, they contain their own copyright
 8 | notices and distribution conditions.  All other files in the package
 9 | are Copyright 2002 Glyph & Cog, LLC, and are licensed under the GNU
10 | General Public License (GPL), version 2 or 3.
11 | 
12 | This package provides support files needed to use the Xpdf tools with
13 | Thai PDF files.
14 | 
15 | Contents:
16 | - Thai character names
17 | - TIS-620 encoding
18 | 
19 | Place all of these files in a directory, typically:
20 | 
21 |     Unix - /usr/local/share/xpdf/thai
22 |     Win32 - C:\Program Files\xpdf\thai
23 | 
24 | Add the contents of the "add-to-xpdfrc" file to your system-wide
25 | xpdfrc config file, which is typically:
26 | 
27 |     Unix - /usr/local/etc/xpdfrc
28 |     Win32 - C:\Program Files\xpdf\xpdfrc
29 | 
30 | Alternatively, on Unix systems you can add these lines to your
31 | personal xpdfrc file in $HOME/.xpdfrc.
32 | 
33 | Make sure to edit the added lines to use the actual directory where
34 | the files were installed.
35 | 


--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-thai/add-to-xpdfrc:
--------------------------------------------------------------------------------
1 | #----- begin Thai support package (2011-aug-15)
2 | nameToUnicode			/usr/local/share/xpdf/thai/Thai.nameToUnicode
3 | unicodeMap	TIS-620		/usr/local/share/xpdf/thai/TIS-620.unicodeMap
4 | #----- end Thai support package
5 | 


--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-turkish.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tballison/file-observatory/5677da227b1a70b1220ae05005b6064210858504/tool-runners/xpdffonts/tgzs/xpdf-turkish.tar.gz


--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-turkish/README:
--------------------------------------------------------------------------------
 1 | Xpdf: Turkish support package
 2 | =============================
 3 | 
 4 | Xpdf project: http://www.foolabs.com/xpdf/
 5 | 2011-aug-15
 6 | 
 7 | If this package includes CMap files, they contain their own copyright
 8 | notices and distribution conditions.  All other files in the package
 9 | are Copyright 2002 Glyph & Cog, LLC, and are licensed under the GNU
10 | General Public License (GPL), version 2 or 3.
11 | 
12 | This package provides support files needed to use the Xpdf tools with
13 | Turkish PDF files.
14 | 
15 | Contents:
16 | - ISO-8859-9 encoding
17 | 
18 | Place all of these files in a directory, typically:
19 | 
20 |     Unix - /usr/local/share/xpdf/turkish
21 |     Win32 - C:\Program Files\xpdf\turkish
22 | 
23 | Add the contents of the "add-to-xpdfrc" file to your system-wide
24 | xpdfrc config file, which is typically:
25 | 
26 |     Unix - /usr/local/etc/xpdfrc
27 |     Win32 - C:\Program Files\xpdf\xpdfrc
28 | 
29 | Alternatively, on Unix systems you can add these lines to your
30 | personal xpdfrc file in $HOME/.xpdfrc.
31 | 
32 | Make sure to edit the added lines to use the actual directory where
33 | the files were installed.
34 | 


--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-turkish/add-to-xpdfrc:
--------------------------------------------------------------------------------
1 | #----- begin Turkish support package (2011-aug-15)
2 | unicodeMap	ISO-8859-9	/usr/local/share/xpdf/turkish/ISO-8859-9.unicodeMap
3 | #----- end Turkish support package
4 | 


--------------------------------------------------------------------------------
/tool-runners/xpdffonts/xpdf/arabic/ISO-8859-6.unicodeMap:
--------------------------------------------------------------------------------
 1 | 000a 000a 0a
 2 | 000c 000d 0c
 3 | 0020 007e 20
 4 | 00a0 00a0 20
 5 | 00a4 a4
 6 | 00ad ad
 7 | 02c6 5e
 8 | 02dc 7e
 9 | 060c 060c ac
10 | 061b 061b bb
11 | 061f 061f bf
12 | 0621 063a c1
13 | 0640 0652 e0
14 | 2013 2013 ad
15 | 2014 2014 2d2d
16 | 2018 2018 60
17 | 2019 2019 27
18 | 201a 201a 2c
19 | 201c 201c 22
20 | 201d 201d 22
21 | 201e 201e 2c2c
22 | 2026 2026 2e2e2e
23 | 2039 2039 3c
24 | 203a 203a 3e
25 | 2044 2044 2f
26 | 2122 2122 544d
27 | 2212 2212 2d
28 | f6f9 f6f9 4c
29 | f6fe f6fe 7e
30 | f721 f721 21
31 | f724 f724 24
32 | f726 f726 26
33 | f730 f739 30
34 | f73f f73f 3f
35 | f761 f77a 41
36 | fb00 fb00 6666
37 | fb01 fb01 6669
38 | fb02 fb02 666c
39 | fb03 fb03 666669
40 | fb04 fb04 66666c
41 | 


--------------------------------------------------------------------------------
/tool-runners/xpdffonts/xpdf/chinese-simplified/CMap/GBpc-EUC-UCS2:
--------------------------------------------------------------------------------
1 | %!PS-Adobe-3.0 Resource-CMap%%DocumentNeededResources: ProcSet (CIDInit)%%DocumentNeededResources: CMap (GBpc-EUC-UCS2C)%%IncludeResource: ProcSet (CIDInit)%%IncludeResource: CMap (GBpc-EUC-UCS2C)%%BeginResource: CMap (GBpc-EUC-UCS2)%%Title: (GBpc-EUC-UCS2)%%Version: 4.002%%Copyright: -----------------------------------------------------------%%Copyright: Copyright 1990-1997 Adobe Systems Incorporated.%%Copyright: All Rights Reserved.%%Copyright:%%Copyright: Patents Pending%%Copyright:%%Copyright: NOTICE: All information contained herein is the property%%Copyright: of Adobe Systems Incorporated.%%Copyright:%%Copyright: Permission is granted for redistribution of this file%%Copyright: provided this copyright notice is maintained intact and%%Copyright: that the contents of this file are not altered in any%%Copyright: way from its original form.%%Copyright:%%Copyright: PostScript and Display PostScript are trademarks of%%Copyright: Adobe Systems Incorporated which may be registered in%%Copyright: certain jurisdictions.%%Copyright: -----------------------------------------------------------%%EndComments/CIDInit /ProcSet findresource begin12 dict beginbegincmap/GBpc-EUC-UCS2C usecmap/CIDSystemInfo 3 dict dup begin  /Registry (Adobe) def  /Ordering (GBpc_EUC_UCS2) def  /Supplement 2 defend def/CMapName /GBpc-EUC-UCS2 def/CMapVersion 4.002 def/CMapType 1 def/WMode 0 def1 beginbfrange<a8bf>	<a8bf>	<006e0300>endbfrangeendcmapCMapName currentdict /CMap defineresource popendend%%EndResource%%EOF


--------------------------------------------------------------------------------
/tool-runners/xpdffonts/xpdf/chinese-simplified/CMap/LICENSE.md:
--------------------------------------------------------------------------------
 1 | Copyright 1990-2019 Adobe. All rights reserved.
 2 | 
 3 | Redistribution and use in source and binary forms, with or without
 4 | modification, are permitted provided that the following conditions are
 5 | met:
 6 | 
 7 | Redistributions of source code must retain the above copyright notice,
 8 | this list of conditions and the following disclaimer.
 9 | 
10 | Redistributions in binary form must reproduce the above copyright
11 | notice, this list of conditions and the following disclaimer in the
12 | documentation and/or other materials provided with the distribution.
13 | 
14 | Neither the name of Adobe nor the names of its contributors may be
15 | used to endorse or promote products derived from this software without
16 | specific prior written permission.
17 | 
18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 | 


--------------------------------------------------------------------------------
/tool-runners/xpdffonts/xpdf/chinese-traditional/CMap/B5pc-UCS2:
--------------------------------------------------------------------------------
1 | %!PS-Adobe-3.0 Resource-CMap%%DocumentNeededResources: ProcSet (CIDInit)%%DocumentNeededResources: CMap (B5pc-UCS2)%%IncludeResource: ProcSet (CIDInit)%%IncludeResource: CMap (B5pc-UCS2C)%%BeginResource: CMap (B5pc-UCS2)%%Title: (B5pc-UCS2)%%Version: 4.002%%Copyright: -----------------------------------------------------------%%Copyright: Copyright 1990-1997 Adobe Systems Incorporated.%%Copyright: All Rights Reserved.%%Copyright:%%Copyright: Patents Pending%%Copyright:%%Copyright: NOTICE: All information contained herein is the property%%Copyright: of Adobe Systems Incorporated.%%Copyright:%%Copyright: Permission is granted for redistribution of this file%%Copyright: provided this copyright notice is maintained intact and%%Copyright: that the contents of this file are not altered in any%%Copyright: way from its original form.%%Copyright:%%Copyright: PostScript and Display PostScript are trademarks of%%Copyright: Adobe Systems Incorporated which may be registered in%%Copyright: certain jurisdictions.%%Copyright: -----------------------------------------------------------%%EndComments/CIDInit /ProcSet findresource begin12 dict beginbegincmap/B5pc-UCS2C usecmap/CIDSystemInfo 3 dict dup begin  /Registry (Adobe) def  /Ordering (B5pc_UCS2) def  /Supplement 0 defend def/CMapName /B5pc-UCS2 def/CMapVersion 4.002 def/CMapType 1 def/WMode 0 defendcmapCMapName currentdict /CMap defineresource popendend%%EndResource%%EOF


--------------------------------------------------------------------------------
/tool-runners/xpdffonts/xpdf/chinese-traditional/CMap/LICENSE.md:
--------------------------------------------------------------------------------
 1 | Copyright 1990-2019 Adobe. All rights reserved.
 2 | 
 3 | Redistribution and use in source and binary forms, with or without
 4 | modification, are permitted provided that the following conditions are
 5 | met:
 6 | 
 7 | Redistributions of source code must retain the above copyright notice,
 8 | this list of conditions and the following disclaimer.
 9 | 
10 | Redistributions in binary form must reproduce the above copyright
11 | notice, this list of conditions and the following disclaimer in the
12 | documentation and/or other materials provided with the distribution.
13 | 
14 | Neither the name of Adobe nor the names of its contributors may be
15 | used to endorse or promote products derived from this software without
16 | specific prior written permission.
17 | 
18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 | 


--------------------------------------------------------------------------------
/tool-runners/xpdffonts/xpdf/cyrillic/Bulgarian.nameToUnicode:
--------------------------------------------------------------------------------
 1 | 0410 As
 2 | 0411 Buki
 3 | 0412 Wjedi
 4 | 0413 Glagol
 5 | 0414 Dobro
 6 | 0415 Jest
 7 | 0416 Schiwete
 8 | 0417 Selmja
 9 | 0418 Ische
10 | 0419 Ischebreve
11 | 041a Kako
12 | 041b Ljudi
13 | 041c Muislete
14 | 041d Nasche
15 | 041e On
16 | 041f Pakoj
17 | 0420 Rzui
18 | 0421 Slovo
19 | 0422 Twerdo
20 | 0423 Uk
21 | 0424 Fert
22 | 0425 Cherr
23 | 0426 Zui
24 | 0427 Tscherw
25 | 0428 Scha
26 | 0429 Schtscha
27 | 042a Jerr
28 | 042e Ju
29 | 042f Ja
30 | 0430 as
31 | 0431 buki
32 | 0432 wjedi
33 | 0433 glagol
34 | 0434 dobro
35 | 0435 jest
36 | 0436 schiwete
37 | 0437 selmja
38 | 0438 ische
39 | 0439 ischebreve
40 | 043a kako
41 | 043b ljudi
42 | 043c muislete
43 | 043d nasche
44 | 043e on
45 | 043f pakoj
46 | 0440 rzui
47 | 0441 slovo
48 | 0442 twerdo
49 | 0443 uk
50 | 0444 fert
51 | 0445 cherr
52 | 0446 zui
53 | 0447 tscherw
54 | 0448 scha
55 | 0449 schtscha
56 | 044a jerr
57 | 044e ju
58 | 044f ja
59 | 


--------------------------------------------------------------------------------
/tool-runners/xpdffonts/xpdf/cyrillic/KOI8-R.unicodeMap:
--------------------------------------------------------------------------------
  1 | 000a 0a
  2 | 000c 000d 0c
  3 | 0020 007e 20
  4 | 00a0 9a
  5 | 00a9 bf
  6 | 00b0 9c
  7 | 00b2 9d
  8 | 00b7 9e
  9 | 00f7 9f
 10 | 02c6 5e
 11 | 02da 9c
 12 | 02dc 7e
 13 | 0401 b3
 14 | 0410 0411 e1
 15 | 0412 f7
 16 | 0413 e7
 17 | 0414 0415 e4
 18 | 0416 f6
 19 | 0417 fa
 20 | 0418 041f e9
 21 | 0420 0423 f2
 22 | 0424 e6
 23 | 0425 e8
 24 | 0426 e3
 25 | 0427 fe
 26 | 0428 fb
 27 | 0429 fd
 28 | 042a ff
 29 | 042b f9
 30 | 042c f8
 31 | 042d fc
 32 | 042e e0
 33 | 042f f1
 34 | 0430 0431 c1
 35 | 0432 d7
 36 | 0433 c7
 37 | 0434 0435 c4
 38 | 0436 d6
 39 | 0437 da
 40 | 0438 c9
 41 | 0439 043f ca
 42 | 0440 0443 d2
 43 | 0444 c6
 44 | 0445 c8
 45 | 0446 c3
 46 | 0447 de
 47 | 0448 db
 48 | 0449 dd
 49 | 044a df
 50 | 044b d9
 51 | 044c d8
 52 | 044d dc
 53 | 044e c0
 54 | 044f d1
 55 | 0451 a3
 56 | 2013 2d
 57 | 2014 2d2d
 58 | 2018 60
 59 | 2019 27
 60 | 201a 2c
 61 | 201c 22
 62 | 201d 22
 63 | 201e 2c2c
 64 | 2022 9e
 65 | 2026 2e2e2e
 66 | 2039 3c
 67 | 203a 3e
 68 | 2044 2f
 69 | 2122 544d
 70 | 2212 2d
 71 | 2219 221a 95
 72 | 2248 97
 73 | 2264 2265 98
 74 | 2320 93
 75 | 2321 9b
 76 | 2500 80
 77 | 2502 81
 78 | 250c 82
 79 | 2510 83
 80 | 2514 84
 81 | 2518 85
 82 | 251c 86
 83 | 2524 87
 84 | 252c 88
 85 | 2534 89
 86 | 253c 8a
 87 | 2550 2552 a0
 88 | 2553 2561 a4
 89 | 2562 256c b4
 90 | 2580 8b
 91 | 2584 8c
 92 | 2588 8d
 93 | 258c 8e
 94 | 2590 2593 8f
 95 | 25a0 94
 96 | fb00 6666
 97 | fb01 6669
 98 | fb02 666c
 99 | fb03 666669
100 | fb04 66666c
101 | 


--------------------------------------------------------------------------------
/tool-runners/xpdffonts/xpdf/greek/Greek.nameToUnicode:
--------------------------------------------------------------------------------
 1 | 0396 Dzeta
 2 | 039e Ksi
 3 | 039f Omikron
 4 | 03a7 Khi
 5 | 03b2 betatwo
 6 | 03b6 dzeta
 7 | 03be ksi
 8 | 03bf omikron
 9 | 03c3 sigmafinal
10 | 03c6 phitwo
11 | 03c7 khi
12 | 


--------------------------------------------------------------------------------
/tool-runners/xpdffonts/xpdf/greek/ISO-8859-7.unicodeMap:
--------------------------------------------------------------------------------
 1 | 000a 0a
 2 | 000c 000d 0c
 3 | 0020 007e 20
 4 | 00a0 a0
 5 | 00a3 a3
 6 | 00a6 00a9 a6
 7 | 00ab 00ad ab
 8 | 00b0 00b4 b0
 9 | 00b5 ec
10 | 00b7 b7
11 | 00bb bb
12 | 00bd bd
13 | 02c6 5e
14 | 02da b0
15 | 02dc 7e
16 | 0374 b4
17 | 037e 3b
18 | 0384 038a b4
19 | 038c bc
20 | 038e 03a1 be
21 | 03a3 03ce d3
22 | 03d0 e2
23 | 03d1 e8
24 | 03d2 d5
25 | 03d3 be
26 | 03d4 db
27 | 03d5 f6
28 | 03d6 f0
29 | 03d7 eae1e9
30 | 03da d3d4
31 | 03db f3f4
32 | 03f0 ea
33 | 03f1 f1
34 | 03f2 63
35 | 03f3 6a
36 | 03f4 c8
37 | 03f5 e5
38 | 2013 ad
39 | 2014 af
40 | 2018 60
41 | 2019 a2
42 | 201a 2c
43 | 201b a1
44 | 201c 22
45 | 201d 22
46 | 201e 2c2c
47 | 2022 b7
48 | 2026 2e2e2e
49 | 2039 3c
50 | 203a 3e
51 | 2044 2f
52 | 20ac c5f5f1fe
53 | 20af c4f1f7
54 | 2122 544d
55 | 2126 d9
56 | 2206 c4
57 | 2212 2d
58 | 2219 b7
59 | fb00 6666
60 | fb01 6669
61 | fb02 666c
62 | fb03 666669
63 | fb04 66666c
64 | 


--------------------------------------------------------------------------------
/tool-runners/xpdffonts/xpdf/hebrew/ISO-8859-8.unicodeMap:
--------------------------------------------------------------------------------
 1 | 000a 000a 0a
 2 | 000c 000d 0c
 3 | 0020 007e 20
 4 | 00a0 00a0 20
 5 | 00a2 00a9 a2
 6 | 00ab 00b9 ab
 7 | 00bb 00be bb
 8 | 010c 43
 9 | 010d 63
10 | 0131 69
11 | 0141 4c
12 | 0142 6c
13 | 0152 4f45
14 | 0153 6f65
15 | 0160 53
16 | 0161 73
17 | 0178 59
18 | 017d 5a
19 | 017e 7a
20 | 02c6 5e
21 | 02da b0
22 | 02dc 7e
23 | 05d0 05ea e0
24 | 05f0 e5e5
25 | 05f1 e5e9
26 | 05f2 e9e9
27 | 2013 ad
28 | 2014 2d2d
29 | 2018 60
30 | 2019 27
31 | 201a 2c
32 | 201c 22
33 | 201d 22
34 | 201e 2c2c
35 | 2022 b7
36 | 2026 2e2e2e
37 | 2039 3c
38 | 203a 3e
39 | 2044 2f
40 | 2122 544d
41 | 2212 2d
42 | f6f9 4c
43 | f6fa 4f45
44 | f6fc b0
45 | f6fd 53
46 | f6fe 7e
47 | f6ff 5a
48 | f721 21
49 | f724 24
50 | f726 26
51 | f730 f739 30
52 | f73f 3f
53 | f761 f77a 41
54 | f7a1 f7a2 a1
55 | f7bf bf
56 | f7e0 f7f6 c0
57 | f7f8 f7fe d8
58 | f7ff 59
59 | fb00 6666
60 | fb01 6669
61 | fb02 666c
62 | fb03 666669
63 | fb04 66666c
64 | 


--------------------------------------------------------------------------------
/tool-runners/xpdffonts/xpdf/hebrew/Windows-1255.unicodeMap:
--------------------------------------------------------------------------------
 1 | 000a 000a 0a
 2 | 000c 000d 0c
 3 | 0020 007e 20
 4 | 00a0 00a3 a0
 5 | 00a5 00a9 a5
 6 | 00ab 00b9 ab
 7 | 00bb 00bf bb
 8 | 00d7 aa
 9 | 00f7 ba
10 | 010c 43
11 | 010d 63
12 | 0131 69
13 | 0141 4c
14 | 0142 6c
15 | 0152 4f45
16 | 0153 6f65
17 | 0160 53
18 | 0161 73
19 | 0178 59
20 | 017d 5a
21 | 017e 7a
22 | 0192 83
23 | 02c6 88
24 | 02da b0
25 | 02dc 98
26 | 05b0 05b9 c0
27 | 05bb 05c3 cb
28 | 05f0 05f4 d4
29 | 05d0 05ea e0
30 | 200e 200f fd
31 | 2013 2014 96
32 | 2018 2019 91
33 | 201a 82
34 | 201c 201d 93
35 | 201e 84
36 | 2020 86
37 | 2021 87
38 | 2022 95
39 | 2026 85
40 | 2030 89
41 | 2039 8b
42 | 203a 9b
43 | 2044 2f
44 | 20aa a4
45 | 20ac 80
46 | 2122 99
47 | 2212 2d
48 | f6f9 4c
49 | f6fa 4f45
50 | f6fc b0
51 | f6fd 53
52 | f6fe 7e
53 | f6ff 5a
54 | f721 21
55 | f724 24
56 | f726 26
57 | f730 f739 30
58 | f73f 3f
59 | f761 f77a 41
60 | f7a1 f7a2 a1
61 | f7bf bf
62 | fb00 6666
63 | fb01 6669
64 | fb02 666c
65 | fb03 666669
66 | fb04 66666c
67 | 


--------------------------------------------------------------------------------
/tool-runners/xpdffonts/xpdf/japanese/CMap/90pv-RKSJ-UCS2:
--------------------------------------------------------------------------------
1 | %!PS-Adobe-3.0 Resource-CMap%%DocumentNeededResources: ProcSet (CIDInit)%%DocumentNeededResources: CMap (90pv-RKSJ-UCS2C)%%IncludeResource: ProcSet (CIDInit)%%IncludeResource: CMap (90pv-RKSJ-UCS2C)%%BeginResource: CMap (90pv-RKSJ-UCS2)%%Title: (90pv-RKSJ-UCS2)%%Version: 4.002%%Copyright: -----------------------------------------------------------%%Copyright: Copyright 1990-1997 Adobe Systems Incorporated.%%Copyright: All Rights Reserved.%%Copyright:%%Copyright: Patents Pending%%Copyright:%%Copyright: NOTICE: All information contained herein is the property%%Copyright: of Adobe Systems Incorporated.%%Copyright:%%Copyright: Permission is granted for redistribution of this file%%Copyright: provided this copyright notice is maintained intact and%%Copyright: that the contents of this file are not altered in any%%Copyright: way from its original form.%%Copyright:%%Copyright: PostScript and Display PostScript are trademarks of%%Copyright: Adobe Systems Incorporated which may be registered in%%Copyright: certain jurisdictions.
%%Copyright: -----------------------------------------------------------%%EndComments/CIDInit /ProcSet findresource begin12 dict beginbegincmap/90pv-RKSJ-UCS2C usecmap/CIDSystemInfo 3 dict dup begin  /Registry (Adobe) def  /Ordering (90pv_RKSJ_UCS2) def  /Supplement 2 defend def/CMapName /90pv-RKSJ-UCS2 def/CMapVersion 4.002 def/CMapType 1 def/WMode 0 def18 beginbfrange<8591>	<8591>	<f8600030002e><85ab>	<85ab>	<f8620058004900490049><85ac>	<85ac>	<f861005800490056><85ad>	<85ad>	<f86000580056><85bf>	<85bf>	<f8620078006900690069><85c0>	<85c0>	<f861007800690076><85c1>	<85c1>	<f86000780076><865d>	<865d>	<f86000540042><869e>	<869e>	<f861004600410058><86d4>	<86d4>	<21e6f87a><86d5>	<86d5>	<21e7f87a><86d6>	<86d6>	<21e9f87a><86ce>	<86ce>	<f86021932191><8791>	<8791>	<592720dd><8792>	<8792>	<5c0f20dd><879d>	<879d>	<63a720dd><87fb>	<87fb>	<f862670996504f1a793e><87fc>	<87fc>	<f8628ca156e36cd54eba>endbfrangeendcmapCMapName currentdict /CMap defineresource popendend%%EndResource%%EOF


--------------------------------------------------------------------------------
/tool-runners/xpdffonts/xpdf/japanese/CMap/LICENSE.md:
--------------------------------------------------------------------------------
 1 | Copyright 1990-2019 Adobe. All rights reserved.
 2 | 
 3 | Redistribution and use in source and binary forms, with or without
 4 | modification, are permitted provided that the following conditions are
 5 | met:
 6 | 
 7 | Redistributions of source code must retain the above copyright notice,
 8 | this list of conditions and the following disclaimer.
 9 | 
10 | Redistributions in binary form must reproduce the above copyright
11 | notice, this list of conditions and the following disclaimer in the
12 | documentation and/or other materials provided with the distribution.
13 | 
14 | Neither the name of Adobe nor the names of its contributors may be
15 | used to endorse or promote products derived from this software without
16 | specific prior written permission.
17 | 
18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 | 


--------------------------------------------------------------------------------
/tool-runners/xpdffonts/xpdf/korean/CMap/LICENSE.md:
--------------------------------------------------------------------------------
 1 | Copyright 1990-2019 Adobe. All rights reserved.
 2 | 
 3 | Redistribution and use in source and binary forms, with or without
 4 | modification, are permitted provided that the following conditions are
 5 | met:
 6 | 
 7 | Redistributions of source code must retain the above copyright notice,
 8 | this list of conditions and the following disclaimer.
 9 | 
10 | Redistributions in binary form must reproduce the above copyright
11 | notice, this list of conditions and the following disclaimer in the
12 | documentation and/or other materials provided with the distribution.
13 | 
14 | Neither the name of Adobe nor the names of its contributors may be
15 | used to endorse or promote products derived from this software without
16 | specific prior written permission.
17 | 
18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 | 


--------------------------------------------------------------------------------
/tool-runners/xpdffonts/xpdf/latin2/Latin2.unicodeMap:
--------------------------------------------------------------------------------
  1 | 000a 000a 0a
  2 | 000c 000d 0c
  3 | 0020 007e 20
  4 | 00a0 00a0 20
  5 | 00a4 a4
  6 | 00a7 00a8 a7
  7 | 00ad ad
  8 | 00b0 b0
  9 | 00b4 b4
 10 | 00b8 b8
 11 | 00c1 00c2 c1
 12 | 00c4 c4
 13 | 00c7 c7
 14 | 00c9 c9
 15 | 00cb cb
 16 | 00cd 00ce cd
 17 | 00d3 00d4 d3
 18 | 00d6 00d7 d6
 19 | 00da da
 20 | 00dc 00dd dc
 21 | 00df df
 22 | 00e1 00e2 e1
 23 | 00e4 e4
 24 | 00e7 e7
 25 | 00e9 e9
 26 | 00eb eb
 27 | 00ed 00ee ed
 28 | 00f3 00f4 f3
 29 | 00f6 00f7 f6
 30 | 00fa fa
 31 | 00fc 00fd fc
 32 | 0102 c3
 33 | 0103 e3
 34 | 0104 a1
 35 | 0105 b1
 36 | 0106 c6
 37 | 0107 e6
 38 | 010c c8
 39 | 010d e8
 40 | 010e cf
 41 | 010f ef
 42 | 0110 d0
 43 | 0111 f0
 44 | 0118 ca
 45 | 0119 ea
 46 | 011a cc
 47 | 011b ec
 48 | 0131 69
 49 | 0139 c5
 50 | 013a e5
 51 | 013d a5
 52 | 013e b5
 53 | 0141 a3
 54 | 0142 b3
 55 | 0143 d1
 56 | 0144 f1
 57 | 0147 d2
 58 | 0148 f2
 59 | 0150 d5
 60 | 0151 f5
 61 | 0152 4f45
 62 | 0153 6f65
 63 | 0154 c0
 64 | 0155 e0
 65 | 0158 d8
 66 | 0159 f8
 67 | 015a a6
 68 | 015b b6
 69 | 015e aa
 70 | 015f ba
 71 | 0160 a9
 72 | 0161 b9
 73 | 0162 de
 74 | 0163 fe
 75 | 0164 ab
 76 | 0165 bb
 77 | 016e d9
 78 | 016f f9
 79 | 0170 db
 80 | 0171 fb
 81 | 0178 59
 82 | 0179 ac
 83 | 017a bc
 84 | 017b af
 85 | 017c bf
 86 | 017d ae
 87 | 017e be
 88 | 02c6 5e
 89 | 02c7 b7
 90 | 02d8 a2
 91 | 02d9 ff
 92 | 02da b0
 93 | 02db b2
 94 | 02dc 7e
 95 | 02dd bd
 96 | 2013 2013 ad
 97 | 2014 2014 2d2d
 98 | 2018 2018 60
 99 | 2019 2019 27
100 | 201a 201a 2c
101 | 201c 201c 22
102 | 201d 201d 22
103 | 201e 201e 2c2c
104 | 2022 2022 b7
105 | 2026 2026 2e2e2e
106 | 2039 2039 3c
107 | 203a 203a 3e
108 | 2044 2044 2f
109 | 2122 2122 544d
110 | 2212 2212 2d
111 | f6f9 f6f9 4c
112 | f6fa f6fa 4f45
113 | f6fc f6fc b0
114 | f6fd f6fd 53
115 | f6fe f6fe 7e
116 | f6ff f6ff 5a
117 | f721 f721 21
118 | f724 f724 24
119 | f726 f726 26
120 | f730 f739 30
121 | f73f f73f 3f
122 | f761 f77a 41
123 | f7a1 f7a2 a1
124 | f7bf f7bf bf
125 | f7e0 f7f6 c0
126 | f7f8 f7fe d8
127 | f7ff f7ff 59
128 | fb00 fb00 6666
129 | fb01 fb01 6669
130 | fb02 fb02 666c
131 | fb03 fb03 666669
132 | fb04 fb04 66666c
133 | 


--------------------------------------------------------------------------------
/tool-runners/xpdffonts/xpdf/thai/TIS-620.unicodeMap:
--------------------------------------------------------------------------------
 1 | 000a 0a
 2 | 000c 000d 0c
 3 | 0020 007e 20
 4 | 00a0 20
 5 | 0131 69
 6 | 0141 4c
 7 | 0142 6c
 8 | 0152 4f45
 9 | 0153 6f65
10 | 0160 53
11 | 0161 73
12 | 0178 59
13 | 017d 5a
14 | 017e 7a
15 | 02c6 5e
16 | 02dc 7e
17 | 0e01 0e3a a1
18 | 0e3f 0e5b df
19 | 2013 2d2d
20 | 2014 2d2d
21 | 2018 60
22 | 2019 27
23 | 201a 2c
24 | 201c 22
25 | 201d 22
26 | 201e 2c2c
27 | 2022 2a
28 | 2026 2e2e2e
29 | 2039 3c
30 | 203a 3e
31 | 2044 2f
32 | 2122 544d
33 | 2212 2d
34 | f700 b0
35 | f701 f704 d4
36 | f705 f709 e8
37 | f70a f70e e8
38 | f70f ad
39 | f710 d1
40 | f711 ed
41 | f712 f717 e7
42 | f718 f71a d8
43 | fb00 6666
44 | fb01 6669
45 | fb02 666c
46 | fb03 666669
47 | fb04 66666c
48 | 


--------------------------------------------------------------------------------
/tool-runners/xpdffonts/xpdf/turkish/ISO-8859-9.unicodeMap:
--------------------------------------------------------------------------------
 1 | 000a 0a
 2 | 000c 000d 0c
 3 | 0020 007e 20
 4 | 00a0 20
 5 | 00a1 00ac a1
 6 | 00ae 00cf ae
 7 | 00d1 00dc d1
 8 | 00df 00ef df
 9 | 00f1 00fc f1
10 | 00ff ff
11 | 010c 43
12 | 010d 63
13 | 011e d0
14 | 011f f0
15 | 0130 dd
16 | 0131 fd
17 | 0141 4c
18 | 0142 6c
19 | 0152 4f45
20 | 0153 6f65
21 | 015e de
22 | 015f fe
23 | 0160 53
24 | 0161 73
25 | 0178 59
26 | 017d 5a
27 | 017e 7a
28 | 02c6 5e
29 | 02da b0
30 | 02dc 7e
31 | 2013 ad
32 | 2014 2d2d
33 | 2018 60
34 | 2019 27
35 | 201a 2c
36 | 201c 22
37 | 201d 22
38 | 201e 2c2c
39 | 2022 b7
40 | 2026 2e2e2e
41 | 2039 3c
42 | 203a 3e
43 | 2044 2f
44 | 2122 544d
45 | 2212 2d
46 | f6f9 4c
47 | f6fa 4f45
48 | f6fc b0
49 | f6fd 53
50 | f6fe 7e
51 | f6ff 5a
52 | f721 21
53 | f724 24
54 | f726 26
55 | f730 f739 30
56 | f73f 3f
57 | f761 f77a 41
58 | f7a1 f7a2 a1
59 | f7bf bf
60 | f7e0 f7f6 c0
61 | f7f8 f7fe d8
62 | f7ff 59
63 | fb00 6666
64 | fb01 6669
65 | fb02 666c
66 | fb03 666669
67 | fb04 66666c
68 | 


--------------------------------------------------------------------------------
/utils-general/src/main/java/org/tallison/db/ExtractsToDB.java:
--------------------------------------------------------------------------------
 1 | package org.tallison.db;
 2 | 
 3 | import java.io.IOException;
 4 | import java.nio.file.Path;
 5 | import java.nio.file.Paths;
 6 | 
 7 | import org.apache.tika.exception.TikaConfigException;
 8 | import org.apache.tika.pipes.pipesiterator.PipesIterator;
 9 | 
10 | /**
11 |  * Command-line stub that builds a {@link PipesIterator} from a Tika config file.
12 |  *
13 |  * <p>The iterator is built and then discarded; nothing is extracted or written yet.
14 |  */
15 | public class ExtractsToDB {
16 | 
17 |     /**
18 |      * @param args {@code args[0]} is the path to the Tika config file
19 |      * @throws IllegalArgumentException if no config path is supplied
20 |      * @throws Exception whatever {@code PipesIterator.build} throws for the given config
21 |      */
22 |     public static void main(String[] args) throws Exception {
23 |         if (args.length < 1) {
24 |             // fail with a usage hint instead of a bare ArrayIndexOutOfBoundsException
25 |             throw new IllegalArgumentException("usage: ExtractsToDB <tika-config-file>");
26 |         }
27 |         Path tikaConfigFile = Paths.get(args[0]);
28 | 
29 |         // NOTE(review): the iterator is never consumed -- this class looks unfinished
30 |         PipesIterator it = PipesIterator.build(tikaConfigFile);
31 | 
32 |     }
33 | }
34 | 


--------------------------------------------------------------------------------
/utils-general/src/main/java/org/tallison/db/FetchFilesFromDBPaths.java:
--------------------------------------------------------------------------------
 1 | package org.tallison.db;
 2 | 
 3 | import java.io.InputStream;
 4 | import java.nio.file.Files;
 5 | import java.nio.file.Path;
 6 | import java.nio.file.Paths;
 7 | import java.nio.file.StandardCopyOption;
 8 | import java.util.regex.Matcher;
 9 | import java.util.regex.Pattern;
10 | 
11 | import org.apache.tika.metadata.Metadata;
12 | import org.apache.tika.pipes.FetchEmitTuple;
13 | import org.apache.tika.pipes.fetcher.Fetcher;
14 | import org.apache.tika.pipes.fetcher.FetcherManager;
15 | import org.apache.tika.pipes.pipesiterator.PipesIterator;
16 | 
17 | /**
18 |  * Downloads the files listed by a Tika {@link PipesIterator} through the "s3f" fetcher and
19 |  * stores each one locally as {@code <outputRoot>/<clamav_detect>/<hex digest from key>}.
20 |  */
21 | public class FetchFilesFromDBPaths {
22 | 
23 |     // first run of 10 or more lowercase hex characters in a fetch key
24 |     private static final Pattern HEX_DIGEST = Pattern.compile("([0-9a-f]{10,})");
25 | 
26 |     public static void main(String[] args) throws Exception {
27 |         Path tikaConfigFile = Paths.get("/Users/allison/Desktop/tika-config.xml");
28 |         PipesIterator pipesIterator = PipesIterator.build(tikaConfigFile);
29 |         Fetcher fetcher = FetcherManager.load(tikaConfigFile).getFetcher("s3f");
30 |         Path outputRoot = Paths.get("/Users/allison/Desktop/clam-pdfs");
31 | 
32 |         for (FetchEmitTuple t : pipesIterator) {
33 |             String fetchKey = t.getFetchKey().getFetchKey();
34 |             String clamav = t.getMetadata().get("clamav_detect");
35 |             Matcher m = HEX_DIGEST.matcher(fetchKey);
36 |             if (clamav == null || !m.find()) {
37 |                 // a null label makes resolve() throw, and an empty digest would make the
38 |                 // copy target the label directory itself
39 |                 System.err.println("skipping, no clamav_detect or no digest: " + fetchKey);
40 |                 continue;
41 |             }
42 |             Path targ = outputRoot.resolve(clamav).resolve(m.group(1));
43 |             if (Files.isRegularFile(targ)) {
44 |                 // already fetched on an earlier run
45 |                 continue;
46 |             }
47 |             Files.createDirectories(targ.getParent());
48 |             try (InputStream is = fetcher.fetch(fetchKey, new Metadata())) {
49 |                 Files.copy(is, targ, StandardCopyOption.REPLACE_EXISTING);
50 |             }
51 |         }
52 |     }
53 | }
54 | 


--------------------------------------------------------------------------------
/utils-general/src/main/java/org/tallison/digest/CSVLineCounter.java:
--------------------------------------------------------------------------------
 1 | package org.tallison.digest;
 2 | 
 3 | import java.nio.charset.StandardCharsets;
 4 | import java.nio.file.Path;
 5 | import java.nio.file.Paths;
 6 | 
 7 | import org.apache.commons.csv.CSVFormat;
 8 | import org.apache.commons.csv.CSVParser;
 9 | import org.apache.commons.csv.CSVRecord;
10 | 
11 | /**
12 |  * Prints the number of records in a hard-coded CSV file, parsed as UTF-8 with
13 |  * {@code CSVFormat.EXCEL}.
14 |  */
15 | public class CSVLineCounter {
16 | 
17 |     public static void main(String[] args) throws Exception {
18 |         Path path = Paths.get("/Users/allison/Desktop/size-pages-full.csv");
19 |         int c = 0;
20 |         // the parser keeps the file open, so close it once the count is done
21 |         try (CSVParser parser = CSVParser.parse(path, StandardCharsets.UTF_8, CSVFormat.EXCEL)) {
22 |             for (CSVRecord r : parser) {
23 |                 c++;
24 |             }
25 |         }
26 |         System.out.println(c);
27 | 
28 |     }
29 | }
30 | 


--------------------------------------------------------------------------------
/utils-general/src/main/java/org/tallison/digest/DigestChecker.java:
--------------------------------------------------------------------------------
 1 | package org.tallison.digest;
 2 | 
 3 | import java.io.BufferedWriter;
 4 | import java.io.File;
 5 | import java.io.IOException;
 6 | import java.io.InputStream;
 7 | import java.nio.charset.StandardCharsets;
 8 | import java.nio.file.Files;
 9 | import java.nio.file.Path;
10 | import java.nio.file.Paths;
11 | import java.util.concurrent.atomic.AtomicInteger;
12 | 
13 | import org.apache.commons.codec.digest.DigestUtils;
14 | 
15 | /**
16 |  * Walks a directory tree and reports every file whose name is not the SHA-256 hex
17 |  * digest of its own contents.
18 |  *
19 |  * <p>Each mismatch is written as a {@code name<TAB>digest} line; progress and errors go
20 |  * to stderr.
21 |  */
22 | public class DigestChecker {
23 | 
24 |     AtomicInteger totalChecked = new AtomicInteger(0);
25 | 
26 |     /**
27 |      * @param args {@code args[0]} is the root directory, {@code args[1]} the report file
28 |      * @throws IllegalArgumentException if an argument is missing or the root is no directory
29 |      */
30 |     public static void main(String[] args) throws Exception {
31 |         if (args.length < 2) {
32 |             throw new IllegalArgumentException("usage: DigestChecker <root-dir> <report-file>");
33 |         }
34 |         Path dir = Paths.get(args[0]);
35 |         if (!Files.isDirectory(dir)) {
36 |             throw new IllegalArgumentException("not a directory: " + dir);
37 |         }
38 |         try (BufferedWriter writer =
39 |                      Files.newBufferedWriter(Paths.get(args[1]), StandardCharsets.UTF_8)) {
40 |             DigestChecker digestChecker = new DigestChecker();
41 |             digestChecker.execute(dir, writer);
42 |         }
43 |     }
44 | 
45 |     private void execute(Path rootDir, BufferedWriter writer) {
46 |         processDir(rootDir, writer);
47 |         System.err.println("completed successfully");
48 |     }
49 | 
50 |     /** Recursively checks every regular file under {@code path}. */
51 |     private void processDir(Path path, BufferedWriter writer) {
52 |         File[] children = path.toFile().listFiles();
53 |         if (children == null) {
54 |             // listFiles() returns null when path is not a directory (every entry that is not
55 |             // a regular file is sent here) or cannot be read; report it instead of an NPE
56 |             System.err.println("couldn't list directory: " + path);
57 |             return;
58 |         }
59 |         for (File f : children) {
60 |             if (f.isFile()) {
61 |                 processFile(f, writer);
62 |             } else {
63 |                 processDir(f.toPath(), writer);
64 |             }
65 |         }
66 |     }
67 | 
68 |     /**
69 |      * Digests one file and records it when its name differs from the digest.
70 |      * A file that cannot be read keeps a {@code null} digest and is therefore reported too.
71 |      */
72 |     private void processFile(File f, BufferedWriter writer) {
73 |         String name = f.getName();
74 |         String digest = null;
75 |         try (InputStream is = Files.newInputStream(f.toPath())) {
76 |             digest = DigestUtils.sha256Hex(is);
77 |         } catch (IOException e) {
78 |             System.err.println("couldn't digest: " + f);
79 |             e.printStackTrace();
80 |         }
81 |         if (! name.equals(digest)) {
82 |             try {
83 |                 writer.write(name + "\t" + digest + "\n");
84 |             } catch (IOException e) {
85 |                 System.err.println("couldn't record mismatch for: " + f);
86 |                 e.printStackTrace();
87 |             }
88 |         }
89 |         int checked = totalChecked.incrementAndGet();
90 |         if (checked % 1000 == 0) {
91 |             System.err.println(checked + " files processed");
92 |         }
93 |     }
94 | }
95 | 


--------------------------------------------------------------------------------
/utils-general/src/main/java/org/tallison/digest/FileListNormalizer.java:
--------------------------------------------------------------------------------
 1 | package org.tallison.digest;
 2 | 
 3 | import java.io.BufferedReader;
 4 | import java.io.BufferedWriter;
 5 | import java.io.File;
 6 | import java.nio.charset.StandardCharsets;
 7 | import java.nio.file.Files;
 8 | import java.nio.file.Path;
 9 | import java.nio.file.Paths;
10 | import java.util.regex.Matcher;
11 | import java.util.regex.Pattern;
12 | 
13 | public class FileListNormalizer {
14 | 
15 |     public static void main(String[] args) throws Exception {
16 |         Path dir = Paths.get("PATH");
17 |         for (File f : dir.toFile().listFiles()) {
18 |             if (f.getName().endsWith("-normed.txt")) {
19 |                 continue;
20 |             }
21 |             Path output = dir.resolve(f.getName().replace(".txt", "-normed.txt"));
22 |             try (BufferedWriter w = Files.newBufferedWriter(output, StandardCharsets.UTF_8)) {
23 |                 try (BufferedReader r = Files.newBufferedReader(f.toPath(), StandardCharsets.UTF_8)) {
24 |                     String line = r.readLine();
25 |                     Matcher m =
26 |                             Pattern.compile("([a-f0-9]{2,2}/[a-f0-9]{2,2}/[a-f0-9]+)").matcher("");
27 |                     while (line != null) {
28 |                         m.reset(line);
29 |                         if (m.find()) {
30 |                             System.out.println(m.group(1));
31 |                             w.write(m.group(1) + "\n");
32 |                         } else {
33 |                             System.err.println("wtf: "+line);
34 |                         }
35 |                         line = r.readLine();
36 |                     }
37 |                 }
38 |             }
39 |         }
40 |     }
41 | }
42 | 


--------------------------------------------------------------------------------
/utils-general/src/main/java/org/tallison/digest/S3ListCompare.java:
--------------------------------------------------------------------------------
 1 | package org.tallison.digest;
 2 | 
 3 | import java.io.BufferedReader;
 4 | import java.io.Writer;
 5 | import java.nio.file.Files;
 6 | import java.nio.file.Path;
 7 | import java.nio.file.Paths;
 8 | import java.util.HashSet;
 9 | import java.util.Set;
10 | 
11 | import com.amazonaws.auth.AWSCredentialsProvider;
12 | import com.amazonaws.auth.profile.ProfileCredentialsProvider;
13 | import com.amazonaws.services.s3.AmazonS3;
14 | import com.amazonaws.services.s3.AmazonS3ClientBuilder;
15 | import com.amazonaws.services.s3.iterable.S3Objects;
16 | import com.amazonaws.services.s3.model.S3ObjectSummary;
17 | 
18 | /**
19 |  * Compares two key listings and prints every key of the first that is absent from the
20 |  * second ({@code s3-files.txt}), followed by the number of missing keys.
21 |  */
22 | public class S3ListCompare {
23 |     public static void main(String[] args) throws Exception {
24 |         Path pwd = Paths.get("");
25 |         // NOTE(review): resolve("") yields the working directory itself, not a file;
26 |         // presumably the real list name was blanked out -- fill in before running
27 |         Path oneMillion = pwd.resolve("");
28 |         Path s3 = pwd.resolve("s3-files.txt");
29 |         Set<String> eval = load(oneMillion);
30 |         Set<String> s3list = load(s3);
31 |         System.out.println(eval.size());
32 |         System.out.println(s3list.size());
33 |         int missing = 0;
34 |         for (String k : eval) {
35 |             if (! s3list.contains(k)) {
36 |                 System.out.println("file missing in s3: "+ k);
37 |                 missing++;
38 |             }
39 |         }
40 | 
41 |         System.out.println("missing: " + missing);
42 |     }
43 | 
44 |     /**
45 |      * Reads the first whitespace-delimited token of every non-blank line into a set.
46 |      *
47 |      * @param p listing file to read
48 |      * @return the distinct keys found in {@code p}
49 |      */
50 |     private static Set<String> load(Path p) throws Exception {
51 |         Set<String> set = new HashSet<>();
52 |         try (BufferedReader r = Files.newBufferedReader(p)) {
53 |             for (String line = r.readLine(); line != null; line = r.readLine()) {
54 |                 String trimmed = line.trim();
55 |                 if (trimmed.isEmpty()) {
56 |                     // a blank line used to register "" as a key and skew sizes and counts
57 |                     continue;
58 |                 }
59 |                 // trimming first keeps leading whitespace from producing an empty first token
60 |                 String k = trimmed.split("\\s+")[0];
61 |                 // NOTE(review): both calls have an empty pattern and replacement, so they
62 |                 // change nothing; presumably placeholders for prefixes to strip -- confirm
63 |                 k = k.replaceFirst("", "");
64 |                 k = k.replaceFirst("", "");
65 |                 k = k.trim();
66 |                 set.add(k);
67 |             }
68 |         }
69 |         return set;
70 |     }
71 | }
72 | 


--------------------------------------------------------------------------------
/utils-general/src/main/java/org/tallison/filter/CopyByMime.java:
--------------------------------------------------------------------------------
 1 | package org.tallison.filter;
 2 | 
 3 | import java.io.File;
 4 | import java.io.IOException;
 5 | import java.nio.file.Files;
 6 | import java.nio.file.Path;
 7 | import java.nio.file.Paths;
 8 | 
 9 | import org.apache.tika.Tika;
10 | 
11 | public class CopyByMime {
12 | 
13 |     public static void main(String[] args) {
14 |         Path src = Paths.get(args[0]);
15 |         Path target = Paths.get(args[1]);
16 |         String mimePart = "nitf";
17 |         Tika tika = new Tika();
18 |         processDirectory(mimePart, src, src, target, tika);
19 | 
20 |     }
21 | 
22 |     private static void processDirectory(String mimePart, Path root, Path path, Path targetRoot,
23 |                                          Tika tika) {
24 |         for (File f : path.toFile().listFiles()) {
25 |             if (f.isDirectory()) {
26 |                 processDirectory(mimePart, root, f.toPath(), targetRoot, tika);
27 |             } else {
28 |                 processFile(mimePart, root, f.toPath(), targetRoot, tika);
29 |             }
30 |         }
31 |     }
32 | 
33 |     private static void processFile(String mimePart, Path root, Path path, Path targetRoot,
34 |                                     Tika tika) {
35 | 
36 |         try {
37 |             String type = tika.detect(path);
38 |             if (type.contains(mimePart)) {
39 |                 Path rel = root.relativize(path);
40 |                 Path target= targetRoot.resolve(rel);
41 |                 System.out.println(type + " : " + path);
42 |                 System.out.println(path + "-> " + target);
43 |                 if (!Files.isDirectory(target.getParent())) {
44 |                     Files.createDirectories(target.getParent());
45 |                 }
46 |                 Files.copy(path, target);
47 |             }
48 |         } catch (IOException e) {
49 |             e.printStackTrace();
50 |         }
51 | 
52 |     }
53 | 
54 | }
55 | 


--------------------------------------------------------------------------------
/utils-general/src/test/java/org/tallison/pdf/utils/TestPDFSplitter.java:
--------------------------------------------------------------------------------
 1 | package org.tallison.pdf.utils;
 2 | 
 3 | 
 4 | import java.nio.file.Path;
 5 | import java.nio.file.Paths;
 6 | 
 7 | import org.junit.Ignore;
 8 | import org.junit.Test;
 9 | 
10 | 
11 | public class TestPDFSplitter {
12 | 
13 |     @Test
14 |     @Ignore
15 |     public void testSimple() throws Exception {
16 | 
17 |         PDFSplitter.main(new String[]{
18 |                 "/docs",
19 |                 "/single-pages",
20 |                 "10"});
21 |     }
22 | }
23 | 


--------------------------------------------------------------------------------