├── .gitignore
├── LICENSE.txt
├── README.md
├── batchlite
├── pom.xml
└── src
│ └── main
│ ├── java
│ └── org
│ │ └── tallison
│ │ └── batchlite
│ │ ├── AbstractDirectoryProcessor.java
│ │ ├── AbstractFileProcessor.java
│ │ ├── CommandlineFileProcessor.java
│ │ ├── CommandlineFileToFileProcessor.java
│ │ ├── CommandlineStdoutToFileProcessor.java
│ │ ├── ConfigSrc.java
│ │ ├── FileProcessResult.java
│ │ ├── FileProcessor.java
│ │ ├── FileToFileProcessor.java
│ │ ├── MetadataWriter.java
│ │ ├── ProcessExecutor.java
│ │ ├── StreamEater.java
│ │ ├── example
│ │ ├── FileCommandExample.java
│ │ ├── PDFChecker.java
│ │ └── PDFStdoutChecker.java
│ │ └── writer
│ │ ├── CSVMetadataWriter.java
│ │ ├── JDBCMetadataWriter.java
│ │ ├── JSONMetadataWriter.java
│ │ ├── MetadataWriterFactory.java
│ │ ├── PathResultPair.java
│ │ └── WriterResult.java
│ └── resources
│ └── log4j2.xml
├── commoncrawl-fetcher
├── pom.xml
└── src
│ ├── README.txt
│ ├── main
│ ├── java
│ │ └── org
│ │ │ └── tallison
│ │ │ ├── cc
│ │ │ ├── CCFileFetcher.java
│ │ │ ├── CCIndexReaderCounter.java
│ │ │ ├── CCIndexWGetter.java
│ │ │ ├── IndexGrep.java
│ │ │ ├── Refetcher.java
│ │ │ ├── S3IndexGetter.java
│ │ │ ├── fetcherlite
│ │ │ │ ├── CCFileFetcherLiteCLI.java
│ │ │ │ ├── FetchLiteRecordProcessor.java
│ │ │ │ ├── FetcherLiteConfig.java
│ │ │ │ └── FileFromCCWarcFetcher.java
│ │ │ ├── index
│ │ │ │ ├── AbstractRecordProcessor.java
│ │ │ │ ├── CCIndexRecord.java
│ │ │ │ ├── CCIndexWGetter.java
│ │ │ │ ├── CompositeRecordFilter.java
│ │ │ │ ├── IndexFileChecker.java
│ │ │ │ ├── IndexRecordProcessor.java
│ │ │ │ ├── LatLongAdder.java
│ │ │ │ ├── MimeCounter.java
│ │ │ │ ├── RecordFilter.java
│ │ │ │ └── db
│ │ │ │ │ ├── DBIndexer.java
│ │ │ │ │ └── DBIndexerCLI.java
│ │ │ └── pipes
│ │ │ │ └── CCIndexPipesIterator.java
│ │ │ └── util
│ │ │ ├── DBUtil.java
│ │ │ ├── HTTPFetchWrapper.java
│ │ │ ├── HostUpsert.java
│ │ │ ├── MapUtil.java
│ │ │ └── ReloadFetchStatusTable.java
│ └── resources
│ │ ├── log4j2.xml
│ │ ├── selectFetchAndFetchStatus.sql
│ │ ├── selectFilesToFetchFromCC.sql
│ │ ├── selectFilesToFetchPerWarcId.sql
│ │ ├── selectIndexedAndFetchedData.sql
│ │ ├── selectIndexedData.sql
│ │ └── selectWarcFileIdsToFetchFromCC.sql
│ └── test
│ ├── java
│ ├── CCIndexRecordTest.java
│ ├── CompositeRecordFilterTest.java
│ └── FetcherTest.java
│ └── resources
│ ├── examples
│ ├── mpeg-filters.json
│ ├── tika-config-fetch-fs.xml
│ ├── tika-config-index-fs.xml
│ ├── tika-config-index-s3.xml
│ └── tika-config-refetch-fs.xml
│ └── test-documents
│ ├── mime-filters-av.json
│ ├── mime-filters.json
│ ├── pdf-filter-sample.json
│ ├── pdf-filter.json
│ ├── status-filter.json
│ └── status-sample-filter.json
├── ingest-jdbc
├── pom.xml
└── src
│ └── main
│ └── java
│ └── org
│ └── tallison
│ └── ingest
│ └── arlington
│ └── ArlingtonIngest.java
├── ingest
├── pom.xml
└── src
│ ├── main
│ ├── java
│ │ └── org
│ │ │ └── tallison
│ │ │ └── ingest
│ │ │ ├── CompositeFeatureMapper.java
│ │ │ ├── FeatureMapper.java
│ │ │ ├── IngesterCLI.java
│ │ │ ├── IngesterToCSVCLI.java
│ │ │ ├── mappers
│ │ │ ├── ArlingtonMapper.java
│ │ │ ├── CPUMapper.java
│ │ │ ├── CaradocMapper.java
│ │ │ ├── ClamAVMapper.java
│ │ │ ├── ESUtil.java
│ │ │ ├── MultiCompareMapper.java
│ │ │ ├── MutoolMapper.java
│ │ │ ├── PDFBytesMapper.java
│ │ │ ├── PDFCheckerMapper.java
│ │ │ ├── PDFFontsMapper.java
│ │ │ ├── PDFInfoFeatureMapper.java
│ │ │ ├── PDFMinerMapper.java
│ │ │ ├── PDFResurrectMapper.java
│ │ │ ├── ProfileFeatureMapper.java
│ │ │ ├── QPDFFeatureMapper.java
│ │ │ ├── StatusFeatureMapper.java
│ │ │ ├── TikaFeatureMapper.java
│ │ │ ├── UniverseMapper.java
│ │ │ └── XPDFFontsMapper.java
│ │ │ ├── qpdf
│ │ │ ├── QPDFJsonExtractor.java
│ │ │ └── QPDFResults.java
│ │ │ ├── qpdf10
│ │ │ └── qpdf
│ │ │ │ ├── QPDFJsonExtractor.java
│ │ │ │ └── QPDFResults.java
│ │ │ └── utils
│ │ │ ├── CSVsToPostgres.java
│ │ │ ├── ESToCSV.java
│ │ │ └── FindMissing.java
│ └── resources
│ │ ├── META-INF
│ │ └── services
│ │ │ └── org.tallison.ingest.FeatureMapper
│ │ ├── common-keys.txt
│ │ ├── important-int-keys.txt
│ │ ├── log4j.properties
│ │ ├── observatory-mappings.json
│ │ ├── selectStar-dev.sql
│ │ ├── selectStar-lite.sql
│ │ ├── selectStar-minimal.sql
│ │ ├── selectStar-sample.sql
│ │ └── selectStar.sql
│ └── test
│ ├── java
│ └── org
│ │ └── tallison
│ │ └── ingest
│ │ └── mappers
│ │ ├── ArlingtonMapperTest.java
│ │ ├── MapperTest.java
│ │ ├── PDFCheckerMapperTest.java
│ │ ├── PDFFontsMapperTest.java
│ │ ├── PDFInfoMapperTest.java
│ │ ├── QPDF10JsonExtractorTest.java
│ │ ├── QPDFJsonExtractorTest.java
│ │ └── XPDFFontsMapperTest.java
│ └── resources
│ └── test-documents
│ ├── GHOSTSCRIPT-687771-0.pdf.json
│ ├── GHOSTSCRIPT-690371-0.pdf.json
│ ├── GHOSTSCRIPT-702993-0.pdf.json
│ ├── arlington
│ ├── GHOSTSCRIPT-687499-0.pdf.txt
│ ├── GHOSTSCRIPT-687647-0.pdf.txt
│ └── GHOSTSCRIPT-688076-1.pdf.txt
│ ├── pdfchecker
│ ├── GHOSTSCRIPT-696838-0.zip-0.pdf.json
│ └── fonts-PDFBOX-1002-2.pdf.json
│ ├── pdffonts
│ └── test-basic.txt
│ ├── qpdfv11
│ └── qpdf.json
│ ├── simple.json
│ ├── types.json
│ └── xpdffonts
│ └── test-basic.txt
├── pom.xml
├── simple-ingester
├── pom.xml
└── src
│ └── main
│ └── java
│ └── org
│ └── tallison
│ ├── ingester
│ └── IngesterCLI.java
│ └── tika
│ └── parser
│ ├── ConcatenatingParser.java
│ └── TikaServerClient.java
├── tika-addons
├── pom.xml
├── tika-eval-multicomparer
│ ├── pom.xml
│ └── src
│ │ └── main
│ │ └── java
│ │ └── org
│ │ └── tallison
│ │ └── tika
│ │ └── eval
│ │ └── multi
│ │ ├── ListGenerator.java
│ │ ├── MultiCompareWorker.java
│ │ └── MultiComparerCLI.java
├── tika-pipes-reporter
│ ├── pom.xml
│ └── src
│ │ └── main
│ │ └── java
│ │ └── org
│ │ └── tallison
│ │ └── tika
│ │ └── pipes
│ │ └── TikaPipesReporter.java
└── tika-server-fuzzer
│ ├── pom.xml
│ └── src
│ └── main
│ └── java
│ └── FuzzClient.java
├── tika-containers
├── pom.xml
├── tika-arlington
│ ├── Dockerfile
│ ├── my-tika-config.xml
│ ├── pom.xml
│ └── src
│ │ └── main
│ │ └── java
│ │ └── org
│ │ └── tallison
│ │ └── observatory
│ │ └── RegexCaptureParser.java
├── tika-exiftool
│ ├── Dockerfile
│ ├── my-tika-config.xml
│ └── pom.xml
├── tika-pdfchecker
│ ├── Dockerfile
│ ├── my-tika-config.xml
│ ├── pom.xml
│ ├── src
│ │ ├── main
│ │ │ ├── java
│ │ │ │ └── org
│ │ │ │ │ └── tallison
│ │ │ │ │ └── tika
│ │ │ │ │ └── parsers
│ │ │ │ │ └── pdfchecker
│ │ │ │ │ └── PDFChecker.java
│ │ │ └── resources
│ │ │ │ └── META-INF
│ │ │ │ └── services
│ │ │ │ └── org.apache.tika.parser.Parser
│ │ └── test
│ │ │ ├── java
│ │ │ └── TikaPDFToTextTest.java
│ │ │ └── resources
│ │ │ └── test-documents
│ │ │ └── testPDF.pdf
│ └── tika-server-core-2.0.0-SNAPSHOT.jar
├── tika-pdfium
│ └── my-args.gn
├── tika-pdfjs-selenium
│ ├── pom.xml
│ └── src
│ │ └── main
│ │ └── java
│ │ └── FirefoxSeleniumExample.java
├── tika-pdfjs
│ ├── Dockerfile
│ ├── js
│ │ └── my-getinfo.js
│ ├── my-tika-config.xml
│ ├── pom.xml
│ └── src
│ │ ├── main
│ │ └── java
│ │ │ └── org
│ │ │ └── tallison
│ │ │ └── observatory
│ │ │ └── pdfjs
│ │ │ └── PDFJSOutputParser.java
│ │ └── test
│ │ ├── java
│ │ └── org
│ │ │ └── tallison
│ │ │ └── observatory
│ │ │ └── pdfjs
│ │ │ └── PDFJSOutputParserTest.java
│ │ └── resources
│ │ └── test-documents
│ │ ├── test-basic.txt
│ │ ├── test-xmp.txt
│ │ └── test-xmp2.txt
├── tika-pdfspelunker
│ ├── Dockerfile
│ ├── my-tika-config.xml
│ ├── pom.xml
│ └── src
│ │ ├── main
│ │ ├── java
│ │ │ └── org
│ │ │ │ └── tallison
│ │ │ │ └── tika
│ │ │ │ ├── parsers
│ │ │ │ ├── image
│ │ │ │ │ ├── ICCImageParser.java
│ │ │ │ │ └── IccMaxParser.java
│ │ │ │ └── pdf
│ │ │ │ │ ├── ImageGraphicsEngine.java
│ │ │ │ │ ├── PDFImageStreamUtil.java
│ │ │ │ │ ├── PDFSpelunker.java
│ │ │ │ │ └── ParseState.java
│ │ │ │ └── spelunker
│ │ │ │ └── tools
│ │ │ │ └── ExtractICCs.java
│ │ └── resources
│ │ │ ├── META-INF
│ │ │ └── services
│ │ │ │ └── org.apache.tika.parser.Parser
│ │ │ └── org
│ │ │ └── apache
│ │ │ └── tika
│ │ │ └── mime
│ │ │ └── custom-mimetypes.xml
│ │ └── test
│ │ ├── java
│ │ └── org
│ │ │ └── tallison
│ │ │ └── tika
│ │ │ └── parsers
│ │ │ ├── image
│ │ │ └── ICCImageParserTest.java
│ │ │ └── pdf
│ │ │ └── PDFSpelunkerTest.java
│ │ └── resources
│ │ ├── config
│ │ └── my-tika-config.xml
│ │ └── test-documents
│ │ ├── baseball.jpg
│ │ ├── icc-reports
│ │ ├── non-compliant1.txt
│ │ ├── not-icc1.txt
│ │ └── not-icc2.txt
│ │ └── testPDF.pdf
├── tika-pdftotext
│ ├── Dockerfile
│ ├── my-tika-config.xml
│ └── pom.xml
├── tika-pipes-pdfinfo
│ ├── Dockerfile
│ ├── log4j2.xml
│ ├── my-tika-config.xml
│ ├── pipes-log4j2.xml
│ └── pom.xml
├── tika-pipes-siegfried
│ ├── Dockerfile
│ ├── log4j2.xml
│ ├── my-tika-config.xml
│ ├── pipes-log4j2.xml
│ └── pom.xml
└── tika-pypdf2
│ ├── Dockerfile
│ ├── my-tika-config.xml
│ ├── pom.xml
│ └── scripts
│ └── PyPDF2Cli.py
├── tool-runners
├── arlington
│ ├── Dockerfile
│ ├── env.properties
│ ├── pom.xml
│ └── src
│ │ └── main
│ │ ├── java
│ │ └── org
│ │ │ └── tallison
│ │ │ └── fileutils
│ │ │ └── arlington
│ │ │ └── TestGrammarRunner.java
│ │ └── resources
│ │ └── log4j.properties
├── caradoc
│ ├── Dockerfile
│ ├── pom.xml
│ └── src
│ │ └── main
│ │ ├── java
│ │ └── org
│ │ │ └── tallison
│ │ │ └── fileutils
│ │ │ └── caradoc
│ │ │ └── Caradoc.java
│ │ └── resources
│ │ └── log4j.properties
├── clamav
│ ├── Dockerfile
│ ├── conf
│ │ ├── clam.conf
│ │ └── freshclam.conf
│ ├── exec.sh
│ ├── pom.xml
│ └── src
│ │ └── main
│ │ ├── java
│ │ └── org
│ │ │ └── tallison
│ │ │ └── fileutils
│ │ │ └── clamav
│ │ │ └── ClamAVRunner.java
│ │ └── resources
│ │ └── log4j.properties
├── env.properties
├── fileprofiler
│ ├── Dockerfile
│ ├── README.txt
│ ├── pom.xml
│ └── src
│ │ └── main
│ │ ├── java
│ │ └── org
│ │ │ └── tallison
│ │ │ └── fileutils
│ │ │ └── profiler
│ │ │ └── FileProfiler.java
│ │ └── resources
│ │ └── log4j2.xml
├── gstotext
│ ├── Dockerfile
│ ├── pom.xml
│ └── src
│ │ └── main
│ │ ├── java
│ │ └── org
│ │ │ └── tallison
│ │ │ └── fileutils
│ │ │ └── gstotext
│ │ │ └── GhostScriptToTextRunner.java
│ │ └── resources
│ │ └── log4j.properties
├── itext
│ ├── README.md
│ ├── pom.xml
│ └── src
│ │ ├── main
│ │ ├── java
│ │ │ └── org
│ │ │ │ └── tallison
│ │ │ │ └── tika
│ │ │ │ └── parser
│ │ │ │ └── itext
│ │ │ │ └── ITextParser.java
│ │ └── resources
│ │ │ └── META-INF
│ │ │ └── services
│ │ │ └── org.apache.tika.parser.Parser
│ │ └── test
│ │ ├── java
│ │ └── org
│ │ │ └── tallison
│ │ │ └── tika
│ │ │ └── parser
│ │ │ └── itext
│ │ │ └── ITextParserTest.java
│ │ └── resources
│ │ └── test-documents
│ │ └── testPDF.pdf
├── mutoolclean
│ ├── Dockerfile
│ ├── pom.xml
│ └── src
│ │ └── main
│ │ ├── java
│ │ └── org
│ │ │ └── tallison
│ │ │ └── fileutils
│ │ │ └── mutool
│ │ │ └── MutoolClean.java
│ │ └── resources
│ │ └── log4j.properties
├── mutooltext
│ ├── Dockerfile
│ ├── pom.xml
│ └── src
│ │ └── main
│ │ ├── java
│ │ └── org
│ │ │ └── tallison
│ │ │ └── fileutils
│ │ │ └── mutool
│ │ │ └── MutoolTextRunner.java
│ │ └── resources
│ │ └── log4j.properties
├── pdfbytes
│ ├── Dockerfile
│ ├── pom.xml
│ └── src
│ │ ├── main
│ │ └── java
│ │ │ └── org
│ │ │ └── tallison
│ │ │ └── pdfutils
│ │ │ ├── PDFByteSniffer.java
│ │ │ ├── PDFVersionator.java
│ │ │ └── StreamSearcher.java
│ │ └── test
│ │ ├── java
│ │ └── org
│ │ │ └── tallison
│ │ │ └── pdfutils
│ │ │ └── TestVersionUnpacker.java
│ │ └── resources
│ │ └── pdf-puzzle.pdf
├── pdfchecker
│ ├── Dockerfile
│ ├── pom.xml
│ └── src
│ │ └── main
│ │ ├── java
│ │ └── org
│ │ │ └── tallison
│ │ │ └── fileutils
│ │ │ └── pdfchecker
│ │ │ └── PDFCheckerRunner.java
│ │ └── resources
│ │ └── log4j.properties
├── pdfcpu
│ ├── Dockerfile
│ ├── pom.xml
│ └── src
│ │ └── main
│ │ ├── java
│ │ └── org
│ │ │ └── tallison
│ │ │ └── fileutils
│ │ │ └── pdfcpu
│ │ │ └── PDFCPURunner.java
│ │ └── resources
│ │ └── log4j.properties
├── pdffonts
│ ├── Dockerfile
│ ├── pom.xml
│ └── src
│ │ └── main
│ │ ├── java
│ │ └── org
│ │ │ └── tallison
│ │ │ └── fileutils
│ │ │ └── pdffonts
│ │ │ └── PDFFontsRunner.java
│ │ └── resources
│ │ └── log4j.properties
├── pdfid
│ ├── Dockerfile
│ ├── pom.xml
│ └── src
│ │ └── main
│ │ ├── java
│ │ └── org
│ │ │ └── tallison
│ │ │ └── fileutils
│ │ │ └── pdfid
│ │ │ └── PDFIdRunner.java
│ │ └── resources
│ │ └── log4j.properties
├── pdfimages
│ ├── Dockerfile
│ ├── pom.xml
│ └── src
│ │ └── main
│ │ ├── java
│ │ └── org
│ │ │ └── tallison
│ │ │ └── fileutils
│ │ │ └── pdfimages
│ │ │ └── PDFImagesRunner.java
│ │ └── resources
│ │ └── log4j.properties
├── pdfinfo
│ ├── Dockerfile
│ ├── pom.xml
│ └── src
│ │ └── main
│ │ ├── java
│ │ └── org
│ │ │ └── tallison
│ │ │ └── fileutils
│ │ │ └── pdfinfo
│ │ │ └── PDFInfo.java
│ │ └── resources
│ │ └── log4j.properties
├── pdfminerdump
│ ├── Dockerfile
│ ├── pom.xml
│ └── src
│ │ └── main
│ │ ├── java
│ │ └── org
│ │ │ └── tallison
│ │ │ └── fileutils
│ │ │ └── pdfminer
│ │ │ └── PDFMinerDump.java
│ │ └── resources
│ │ └── log4j.properties
├── pdfminertext
│ ├── Dockerfile
│ ├── pom.xml
│ └── src
│ │ └── main
│ │ ├── java
│ │ └── org
│ │ │ └── tallison
│ │ │ └── fileutils
│ │ │ └── pdfminer
│ │ │ └── PDFMinerText.java
│ │ └── resources
│ │ └── log4j.properties
├── pdfresurrect
│ ├── Dockerfile
│ ├── env.properties
│ ├── pom.xml
│ └── src
│ │ └── main
│ │ ├── java
│ │ └── org
│ │ │ └── tallison
│ │ │ └── fileutils
│ │ │ └── pdfresurrect
│ │ │ └── PDFResurrect.java
│ │ └── resources
│ │ └── log4j.properties
├── pdftoppm
│ ├── Dockerfile
│ ├── pom.xml
│ └── src
│ │ └── main
│ │ ├── java
│ │ └── org
│ │ │ └── tallison
│ │ │ └── fileutils
│ │ │ └── pdftoppm
│ │ │ └── PDFToPPMRunner.java
│ │ └── resources
│ │ └── log4j.properties
├── pdftops
│ ├── Dockerfile
│ ├── pom.xml
│ └── src
│ │ └── main
│ │ ├── java
│ │ └── org
│ │ │ └── tallison
│ │ │ └── fileutils
│ │ │ └── pdftops
│ │ │ └── PDFToPSRunner.java
│ │ └── resources
│ │ └── log4j.properties
├── pdftotext
│ ├── Dockerfile
│ ├── pom.xml
│ └── src
│ │ └── main
│ │ ├── java
│ │ └── org
│ │ │ └── tallison
│ │ │ └── fileutils
│ │ │ └── pdftotext
│ │ │ └── PDFToTextRunner.java
│ │ └── resources
│ │ └── log4j.properties
├── polyfile
│ ├── Dockerfile
│ ├── pom.xml
│ └── src
│ │ └── main
│ │ ├── java
│ │ └── org
│ │ │ └── tallison
│ │ │ └── fileutils
│ │ │ └── polyfile
│ │ │ ├── PolyFile.java
│ │ │ └── PolyFilePolyglot.java
│ │ └── resources
│ │ └── log4j.properties
├── pom.xml
├── qpdf
│ ├── Dockerfile
│ ├── pom.xml
│ └── src
│ │ └── main
│ │ ├── java
│ │ └── org
│ │ │ └── tallison
│ │ │ └── fileutils
│ │ │ └── qpdf
│ │ │ └── QPDFToJson.java
│ │ └── resources
│ │ └── log4j.properties
├── tika-client
│ ├── pom.xml
│ └── src
│ │ └── main
│ │ ├── java
│ │ └── org
│ │ │ └── tallison
│ │ │ └── tika
│ │ │ └── client
│ │ │ ├── TikaClient.java
│ │ │ └── TikaLoadTester.java
│ │ └── resources
│ │ └── log4j2.xml
├── tika
│ ├── pom.xml
│ └── src
│ │ └── main
│ │ ├── java
│ │ └── org
│ │ │ └── tallison
│ │ │ └── fileutils
│ │ │ └── tika
│ │ │ └── TikaBatch.java
│ │ └── resources
│ │ └── log4j2.xml
└── xpdffonts
│ ├── Dockerfile
│ ├── pom.xml
│ ├── src
│ └── main
│ │ ├── java
│ │ └── org
│ │ │ └── tallison
│ │ │ └── fileutils
│ │ │ └── pdffonts
│ │ │ └── PDFFontsRunner.java
│ │ └── resources
│ │ └── log4j.properties
│ ├── tgzs
│ ├── xpdf-arabic.tar.gz
│ ├── xpdf-arabic
│ │ ├── README
│ │ └── add-to-xpdfrc
│ ├── xpdf-chinese-simplified.tar.gz
│ ├── xpdf-chinese-simplified
│ │ ├── README
│ │ └── add-to-xpdfrc
│ ├── xpdf-chinese-traditional.tar.gz
│ ├── xpdf-chinese-traditional
│ │ ├── README
│ │ └── add-to-xpdfrc
│ ├── xpdf-cyrillic.tar.gz
│ ├── xpdf-cyrillic
│ │ ├── README
│ │ └── add-to-xpdfrc
│ ├── xpdf-greek.tar.gz
│ ├── xpdf-greek
│ │ ├── README
│ │ └── add-to-xpdfrc
│ ├── xpdf-hebrew.tar.gz
│ ├── xpdf-hebrew
│ │ ├── README
│ │ └── add-to-xpdfrc
│ ├── xpdf-japanese.tar.gz
│ ├── xpdf-japanese
│ │ ├── README
│ │ └── add-to-xpdfrc
│ ├── xpdf-korean.tar.gz
│ ├── xpdf-korean
│ │ ├── README
│ │ └── add-to-xpdfrc
│ ├── xpdf-latin2.tar.gz
│ ├── xpdf-latin2
│ │ ├── README
│ │ └── add-to-xpdfrc
│ ├── xpdf-t1fonts.tar.gz
│ ├── xpdf-t1fonts
│ │ ├── COPYING
│ │ ├── README
│ │ ├── d050000l.pfb
│ │ └── s050000l.pfb
│ ├── xpdf-thai.tar.gz
│ ├── xpdf-thai
│ │ ├── README
│ │ └── add-to-xpdfrc
│ ├── xpdf-turkish.tar.gz
│ └── xpdf-turkish
│ │ ├── README
│ │ └── add-to-xpdfrc
│ ├── xpdf
│ ├── arabic
│ │ └── ISO-8859-6.unicodeMap
│ ├── chinese-simplified
│ │ ├── Adobe-GB1.cidToUnicode
│ │ ├── CMap
│ │ │ ├── Adobe-GB1-0
│ │ │ ├── Adobe-GB1-1
│ │ │ ├── Adobe-GB1-2
│ │ │ ├── Adobe-GB1-3
│ │ │ ├── Adobe-GB1-4
│ │ │ ├── Adobe-GB1-5
│ │ │ ├── Adobe-GB1-UCS2
│ │ │ ├── GB-EUC-H
│ │ │ ├── GB-EUC-V
│ │ │ ├── GB-H
│ │ │ ├── GB-V
│ │ │ ├── GBK-EUC-H
│ │ │ ├── GBK-EUC-UCS2
│ │ │ ├── GBK-EUC-V
│ │ │ ├── GBK2K-H
│ │ │ ├── GBK2K-V
│ │ │ ├── GBKp-EUC-H
│ │ │ ├── GBKp-EUC-V
│ │ │ ├── GBT-EUC-H
│ │ │ ├── GBT-EUC-V
│ │ │ ├── GBT-H
│ │ │ ├── GBT-V
│ │ │ ├── GBTpc-EUC-H
│ │ │ ├── GBTpc-EUC-V
│ │ │ ├── GBpc-EUC-H
│ │ │ ├── GBpc-EUC-UCS2
│ │ │ ├── GBpc-EUC-UCS2C
│ │ │ ├── GBpc-EUC-V
│ │ │ ├── LICENSE.md
│ │ │ ├── UniGB-UCS2-H
│ │ │ ├── UniGB-UCS2-V
│ │ │ ├── UniGB-UTF16-H
│ │ │ ├── UniGB-UTF16-V
│ │ │ ├── UniGB-UTF32-H
│ │ │ ├── UniGB-UTF32-V
│ │ │ ├── UniGB-UTF8-H
│ │ │ └── UniGB-UTF8-V
│ │ ├── EUC-CN.unicodeMap
│ │ ├── GBK.unicodeMap
│ │ └── ISO-2022-CN.unicodeMap
│ ├── chinese-traditional
│ │ ├── Adobe-CNS1.cidToUnicode
│ │ ├── Big5.unicodeMap
│ │ ├── Big5ascii.unicodeMap
│ │ └── CMap
│ │ │ ├── Adobe-CNS1-0
│ │ │ ├── Adobe-CNS1-1
│ │ │ ├── Adobe-CNS1-2
│ │ │ ├── Adobe-CNS1-3
│ │ │ ├── Adobe-CNS1-4
│ │ │ ├── Adobe-CNS1-5
│ │ │ ├── Adobe-CNS1-6
│ │ │ ├── Adobe-CNS1-7
│ │ │ ├── Adobe-CNS1-UCS2
│ │ │ ├── B5-H
│ │ │ ├── B5-V
│ │ │ ├── B5pc-H
│ │ │ ├── B5pc-UCS2
│ │ │ ├── B5pc-UCS2C
│ │ │ ├── B5pc-V
│ │ │ ├── CNS-EUC-H
│ │ │ ├── CNS-EUC-V
│ │ │ ├── CNS1-H
│ │ │ ├── CNS1-V
│ │ │ ├── CNS2-H
│ │ │ ├── CNS2-V
│ │ │ ├── ETHK-B5-H
│ │ │ ├── ETHK-B5-V
│ │ │ ├── ETen-B5-H
│ │ │ ├── ETen-B5-UCS2
│ │ │ ├── ETen-B5-V
│ │ │ ├── ETenms-B5-H
│ │ │ ├── ETenms-B5-V
│ │ │ ├── HKdla-B5-H
│ │ │ ├── HKdla-B5-V
│ │ │ ├── HKdlb-B5-H
│ │ │ ├── HKdlb-B5-V
│ │ │ ├── HKgccs-B5-H
│ │ │ ├── HKgccs-B5-V
│ │ │ ├── HKm314-B5-H
│ │ │ ├── HKm314-B5-V
│ │ │ ├── HKm471-B5-H
│ │ │ ├── HKm471-B5-V
│ │ │ ├── HKscs-B5-H
│ │ │ ├── HKscs-B5-V
│ │ │ ├── LICENSE.md
│ │ │ ├── UniCNS-UCS2-H
│ │ │ ├── UniCNS-UCS2-V
│ │ │ ├── UniCNS-UTF16-H
│ │ │ ├── UniCNS-UTF16-V
│ │ │ ├── UniCNS-UTF32-H
│ │ │ ├── UniCNS-UTF32-V
│ │ │ ├── UniCNS-UTF8-H
│ │ │ └── UniCNS-UTF8-V
│ ├── cyrillic
│ │ ├── Bulgarian.nameToUnicode
│ │ └── KOI8-R.unicodeMap
│ ├── greek
│ │ ├── Greek.nameToUnicode
│ │ └── ISO-8859-7.unicodeMap
│ ├── hebrew
│ │ ├── ISO-8859-8.unicodeMap
│ │ └── Windows-1255.unicodeMap
│ ├── japanese
│ │ ├── Adobe-Japan1.cidToUnicode
│ │ ├── CMap
│ │ │ ├── 78-EUC-H
│ │ │ ├── 78-EUC-V
│ │ │ ├── 78-H
│ │ │ ├── 78-RKSJ-H
│ │ │ ├── 78-RKSJ-V
│ │ │ ├── 78-V
│ │ │ ├── 78ms-RKSJ-H
│ │ │ ├── 78ms-RKSJ-V
│ │ │ ├── 83pv-RKSJ-H
│ │ │ ├── 90ms-RKSJ-H
│ │ │ ├── 90ms-RKSJ-UCS2
│ │ │ ├── 90ms-RKSJ-V
│ │ │ ├── 90msp-RKSJ-H
│ │ │ ├── 90msp-RKSJ-V
│ │ │ ├── 90pv-RKSJ-H
│ │ │ ├── 90pv-RKSJ-UCS2
│ │ │ ├── 90pv-RKSJ-UCS2C
│ │ │ ├── 90pv-RKSJ-V
│ │ │ ├── Add-H
│ │ │ ├── Add-RKSJ-H
│ │ │ ├── Add-RKSJ-V
│ │ │ ├── Add-V
│ │ │ ├── Adobe-Japan1-0
│ │ │ ├── Adobe-Japan1-1
│ │ │ ├── Adobe-Japan1-2
│ │ │ ├── Adobe-Japan1-3
│ │ │ ├── Adobe-Japan1-4
│ │ │ ├── Adobe-Japan1-5
│ │ │ ├── Adobe-Japan1-6
│ │ │ ├── Adobe-Japan1-7
│ │ │ ├── Adobe-Japan1-UCS2
│ │ │ ├── EUC-H
│ │ │ ├── EUC-V
│ │ │ ├── Ext-H
│ │ │ ├── Ext-RKSJ-H
│ │ │ ├── Ext-RKSJ-V
│ │ │ ├── Ext-V
│ │ │ ├── H
│ │ │ ├── Hankaku
│ │ │ ├── Hiragana
│ │ │ ├── Katakana
│ │ │ ├── LICENSE.md
│ │ │ ├── NWP-H
│ │ │ ├── NWP-V
│ │ │ ├── RKSJ-H
│ │ │ ├── RKSJ-V
│ │ │ ├── Roman
│ │ │ ├── UniJIS-UCS2-H
│ │ │ ├── UniJIS-UCS2-HW-H
│ │ │ ├── UniJIS-UCS2-HW-V
│ │ │ ├── UniJIS-UCS2-V
│ │ │ ├── UniJIS-UTF16-H
│ │ │ ├── UniJIS-UTF16-V
│ │ │ ├── UniJIS-UTF32-H
│ │ │ ├── UniJIS-UTF32-V
│ │ │ ├── UniJIS-UTF8-H
│ │ │ ├── UniJIS-UTF8-V
│ │ │ ├── UniJIS2004-UTF16-H
│ │ │ ├── UniJIS2004-UTF16-V
│ │ │ ├── UniJIS2004-UTF32-H
│ │ │ ├── UniJIS2004-UTF32-V
│ │ │ ├── UniJIS2004-UTF8-H
│ │ │ ├── UniJIS2004-UTF8-V
│ │ │ ├── UniJISPro-UCS2-HW-V
│ │ │ ├── UniJISPro-UCS2-V
│ │ │ ├── UniJISPro-UTF8-V
│ │ │ ├── UniJISX0213-UTF32-H
│ │ │ ├── UniJISX0213-UTF32-V
│ │ │ ├── UniJISX02132004-UTF32-H
│ │ │ ├── UniJISX02132004-UTF32-V
│ │ │ ├── V
│ │ │ └── WP-Symbol
│ │ ├── EUC-JP.unicodeMap
│ │ ├── ISO-2022-JP.unicodeMap
│ │ └── Shift-JIS.unicodeMap
│ ├── korean
│ │ ├── Adobe-KR.cidToUnicode
│ │ ├── Adobe-Korea1.cidToUnicode
│ │ ├── CMap
│ │ │ ├── Adobe-KR-0
│ │ │ ├── Adobe-KR-1
│ │ │ ├── Adobe-KR-2
│ │ │ ├── Adobe-KR-3
│ │ │ ├── Adobe-KR-4
│ │ │ ├── Adobe-KR-5
│ │ │ ├── Adobe-KR-6
│ │ │ ├── Adobe-KR-7
│ │ │ ├── Adobe-KR-8
│ │ │ ├── Adobe-KR-9
│ │ │ ├── Adobe-Korea1-0
│ │ │ ├── Adobe-Korea1-1
│ │ │ ├── Adobe-Korea1-2
│ │ │ ├── Adobe-Korea1-UCS2
│ │ │ ├── KSC-EUC-H
│ │ │ ├── KSC-EUC-V
│ │ │ ├── KSC-H
│ │ │ ├── KSC-Johab-H
│ │ │ ├── KSC-Johab-V
│ │ │ ├── KSC-V
│ │ │ ├── KSCms-UHC-H
│ │ │ ├── KSCms-UHC-HW-H
│ │ │ ├── KSCms-UHC-HW-V
│ │ │ ├── KSCms-UHC-UCS2
│ │ │ ├── KSCms-UHC-V
│ │ │ ├── KSCpc-EUC-H
│ │ │ ├── KSCpc-EUC-UCS2
│ │ │ ├── KSCpc-EUC-UCS2C
│ │ │ ├── KSCpc-EUC-V
│ │ │ ├── LICENSE.md
│ │ │ ├── UniAKR-UTF16-H
│ │ │ ├── UniAKR-UTF32-H
│ │ │ ├── UniAKR-UTF8-H
│ │ │ ├── UniKS-UCS2-H
│ │ │ ├── UniKS-UCS2-V
│ │ │ ├── UniKS-UTF16-H
│ │ │ ├── UniKS-UTF16-V
│ │ │ ├── UniKS-UTF32-H
│ │ │ ├── UniKS-UTF32-V
│ │ │ ├── UniKS-UTF8-H
│ │ │ └── UniKS-UTF8-V
│ │ └── ISO-2022-KR.unicodeMap
│ ├── latin2
│ │ └── Latin2.unicodeMap
│ ├── thai
│ │ ├── TIS-620.unicodeMap
│ │ └── Thai.nameToUnicode
│ └── turkish
│ │ └── ISO-8859-9.unicodeMap
│ └── xpdfrc
└── utils-general
├── pom.xml
└── src
├── main
└── java
│ └── org
│ └── tallison
│ ├── db
│ ├── CustomCSVToPG.java
│ ├── ExtractsToDB.java
│ ├── FetchFilesFromDBPaths.java
│ └── PGToCSV.java
│ ├── digest
│ ├── CSVLineCounter.java
│ ├── CompareLists.java
│ ├── DigestChecker.java
│ ├── FileListNormalizer.java
│ ├── RemoveExtras.java
│ ├── S3Compare.java
│ ├── S3DigestChecker.java
│ └── S3ListCompare.java
│ ├── filter
│ ├── CopyByMime.java
│ └── CopyFilterDigest.java
│ └── pdf
│ └── utils
│ └── PDFSplitter.java
└── test
└── java
└── org
└── tallison
└── pdf
└── utils
└── TestPDFSplitter.java
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea/
2 | target/
3 | *.iml
4 | /tool-runners/pdfchecker/pdf-checker.tgz
5 | /tool-runners/arlington
6 | /tool-runners/arlington/grammar/
7 | /tika-containers/tika-pdfchecker/pdf-checker.tgz
8 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # File Observatory
2 | This repo hosts development code used on the backend to support data ingestion into
3 | an Elasticsearch index for the [SafeDocs File Observatory app](https://github.com/jpl-safedocs).
4 |
5 | This repo contains pre-ALPHA grade code for demonstration purposes only.
6 |
7 | Some capabilities demonstrated within have been integrated into Apache Tika.
8 | Some have been spun off into standalone projects, e.g. [commoncrawl-fetcher-lite](https://github.com/tballison/commoncrawl-fetcher-lite).
9 |
10 | # Attribution
11 | The commoncrawl-fetcher module includes code that relies on GeoLite2 data created by MaxMind, available from
12 | [https://www.maxmind.com](https://www.maxmind.com).
--------------------------------------------------------------------------------
/batchlite/src/main/java/org/tallison/batchlite/writer/JSONMetadataWriter.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package org.tallison.batchlite.writer;
18 |
19 | import com.google.gson.Gson;
20 | import org.tallison.batchlite.MetadataWriter;
21 |
22 | import java.io.IOException;
23 | import java.nio.charset.StandardCharsets;
24 | import java.nio.file.Files;
25 | import java.nio.file.Path;
26 |
27 | public class JSONMetadataWriter extends MetadataWriter {
28 |
29 | private final static Gson GSON = new Gson();
30 |
31 | private final Path metadataRootDir;
32 |
33 | public JSONMetadataWriter(String name,
34 | Path metadataRootDir, int stdoutLimit, int stderrLimit) {
35 | super(name, stdoutLimit, stderrLimit);
36 | this.metadataRootDir = metadataRootDir;
37 | }
38 |
39 | @Override
40 | protected void write(PathResultPair pair) throws IOException {
41 | Path target = metadataRootDir.resolve(pair.getRelPath() + ".json");
42 | if (! Files.isDirectory(target.getParent())) {
43 | Files.createDirectories(target.getParent());
44 | }
45 | Files.write(target, GSON.toJson(pair.getResult()).getBytes(StandardCharsets.UTF_8));
46 | }
47 |
48 | @Override
49 | public void close() throws IOException {
50 | //no-op
51 | }
52 | }
53 |
--------------------------------------------------------------------------------
/batchlite/src/main/java/org/tallison/batchlite/writer/MetadataWriterFactory.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package org.tallison.batchlite.writer;
18 |
19 | import org.tallison.batchlite.MetadataWriter;
20 |
21 | import java.io.IOException;
22 | import java.nio.file.Paths;
23 |
24 | public class MetadataWriterFactory {
25 |
26 | public static MetadataWriter build(String name, String writerString,
27 | boolean isDelta,
28 | int maxStdout, int maxStderr) throws IOException {
29 | if (writerString.startsWith("jdbc:")) {
30 | return new JDBCMetadataWriter(name, writerString, isDelta, maxStdout, maxStderr);
31 | } else if (writerString.endsWith(".csv") || writerString.endsWith(".tsv")) {
32 | return new CSVMetadataWriter(name, Paths.get(writerString), maxStdout, maxStderr);
33 | } else {
34 | return new JSONMetadataWriter(name, Paths.get(writerString), maxStdout, maxStderr);
35 | }
36 | }
37 | }
38 |
--------------------------------------------------------------------------------
/batchlite/src/main/java/org/tallison/batchlite/writer/PathResultPair.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package org.tallison.batchlite.writer;
18 |
19 | import org.tallison.batchlite.FileProcessResult;
20 |
21 | import java.nio.file.Path;
22 |
23 | public class PathResultPair {
24 |
25 |
26 |
27 | private final String relPath;
28 | private final FileProcessResult result;
29 |
30 | public PathResultPair(String relPath, FileProcessResult result) {
31 | this.relPath = relPath;
32 | this.result = result;
33 | }
34 |
35 | public String getRelPath() {
36 | return relPath;
37 | }
38 |
39 | public FileProcessResult getResult() {
40 | return result;
41 | }
42 |
43 | @Override
44 | public String toString() {
45 | return "PathResultPair{" +
46 | "relPath='" + relPath + '\'' +
47 | ", result=" + result +
48 | '}';
49 | }
50 | }
51 |
--------------------------------------------------------------------------------
/batchlite/src/main/java/org/tallison/batchlite/writer/WriterResult.java:
--------------------------------------------------------------------------------
1 | package org.tallison.batchlite.writer;
2 |
/**
 * Immutable value carrying the number of records written.
 */
public class WriterResult {

    private final int recordsWritten;

    /**
     * @param recordsWritten number of records written
     */
    public WriterResult(int recordsWritten) {
        this.recordsWritten = recordsWritten;
    }

    /**
     * @return the count supplied at construction
     */
    public int getRecordsWritten() {
        return this.recordsWritten;
    }
}
13 |
--------------------------------------------------------------------------------
/batchlite/src/main/resources/log4j2.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
--------------------------------------------------------------------------------
/commoncrawl-fetcher/src/README.txt:
--------------------------------------------------------------------------------
1 | This is a set of utilities for extracting files from Common Crawl.
2 |
3 | The assumption is that you don't have direct access to S3 and you
4 | need to pull data.
5 |
6 | Step 1:
7 | * Download the 300 index .gz files (this is normally ~1 TB)
8 |
9 | Step 2:
10 | * Read through the .gz files and index into Postgres those files that
11 | meet certain criteria (e.g., only PDFs)
12 |
13 | Step 3:
14 | * Based on the records in the database, request the warc file from AWS for
15 | each file
16 | * Extract the literal bytes from that file and index some more data from the warc
17 |
18 | Step 4:
19 | * For each file that CC identified as truncated, go back to the original URL and try
20 | to retrieve the file from there.
--------------------------------------------------------------------------------
/commoncrawl-fetcher/src/main/java/org/tallison/cc/CCIndexReaderCounter.java:
--------------------------------------------------------------------------------
1 | package org.tallison.cc;
2 |
3 | import java.util.concurrent.atomic.AtomicLong;
4 |
/**
 * Bundles three {@link AtomicLong} counters: records read, files extracted
 * and truncated written. Each counter starts at zero and is exposed directly,
 * so callers update the returned {@link AtomicLong} themselves.
 */
public class CCIndexReaderCounter {
    AtomicLong recordsRead = new AtomicLong(0);
    AtomicLong filesExtracted = new AtomicLong(0);
    AtomicLong truncatedWritten = new AtomicLong(0);

    /**
     * @return the live counter for records read
     */
    public AtomicLong getRecordsRead() {
        return this.recordsRead;
    }

    /**
     * @return the live counter for files extracted
     */
    public AtomicLong getFilesExtracted() {
        return this.filesExtracted;
    }

    /**
     * @return the live counter for truncated written
     */
    public AtomicLong getTruncatedWritten() {
        return this.truncatedWritten;
    }

    /**
     * @return the current counter values, formatted as
     * {@code CCIndexReaderCounter{recordsRead=N, filesExtracted=N, truncatedWritten=N}}
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("CCIndexReaderCounter{");
        text.append("recordsRead=").append(recordsRead);
        text.append(", filesExtracted=").append(filesExtracted);
        text.append(", truncatedWritten=").append(truncatedWritten);
        text.append('}');
        return text.toString();
    }
}
31 |
--------------------------------------------------------------------------------
/commoncrawl-fetcher/src/main/java/org/tallison/cc/index/IndexRecordProcessor.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.tallison.cc.index;
19 |
20 |
21 | import java.io.IOException;
22 |
/**
 * Contract for a component that is initialised from command-line style
 * arguments, is handed index records one JSON string at a time, and is
 * closed when processing is finished.
 */
public interface IndexRecordProcessor {

    /**
     * Prepares the processor for use.
     *
     * @param args implementation-specific arguments
     * @throws Exception if initialisation fails
     */
    void init(String[] args) throws Exception;

    /**
     * Handles a single record.
     *
     * @param json the record as a JSON string
     * @return implementation-defined flag; NOTE(review): presumably signals
     *         whether processing should continue -- confirm against callers
     * @throws IOException if the record cannot be processed
     */
    boolean process(String json) throws IOException;

    /**
     * Releases whatever the processor holds.
     *
     * @throws IOException if cleanup fails
     */
    void close() throws IOException;
}
31 |
--------------------------------------------------------------------------------
/commoncrawl-fetcher/src/main/java/org/tallison/cc/index/RecordFilter.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package org.tallison.cc.index;
18 |
19 | public interface RecordFilter {
20 |
21 | boolean accept(CCIndexRecord record);
22 | }
23 |
--------------------------------------------------------------------------------
/commoncrawl-fetcher/src/main/java/org/tallison/util/MapUtil.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package org.tallison.util;
18 |
19 | import java.util.Collections;
20 | import java.util.Comparator;
21 | import java.util.LinkedHashMap;
22 | import java.util.LinkedList;
23 | import java.util.List;
24 | import java.util.Map;
25 |
/**
 * Map helper methods.
 */
public class MapUtil {

    /**
     * Returns a new insertion-ordered map holding the entries of {@code map}
     * sorted by value, largest first. Entries with equal values are ordered
     * by ascending key so the result is deterministic. The input map is not
     * modified.
     *
     * @param map the map to sort; keys and values must be non-null and comparable
     * @param <K> key type
     * @param <V> value type
     * @return a {@link LinkedHashMap} whose iteration order is the sorted order
     */
    public static <K extends Comparable<? super K>, V extends Comparable<? super V>> Map<K, V>
            sortByDescendingValue(Map<K, V> map) {
        List<Map.Entry<K, V>> entries = new LinkedList<>(map.entrySet());
        Collections.sort(entries, new Comparator<Map.Entry<K, V>>() {
            @Override
            public int compare(Map.Entry<K, V> o1, Map.Entry<K, V> o2) {
                // operands are swapped on purpose: larger values sort first
                int c = o2.getValue().compareTo(o1.getValue());
                if (c == 0) {
                    // tie-break on key, ascending
                    return o1.getKey().compareTo(o2.getKey());
                }
                return c;
            }
        });

        // LinkedHashMap keeps the sorted order for later iteration
        Map<K, V> result = new LinkedHashMap<>();
        for (Map.Entry<K, V> entry : entries) {
            result.put(entry.getKey(), entry.getValue());
        }
        return result;
    }
}
51 |
--------------------------------------------------------------------------------
/commoncrawl-fetcher/src/main/java/org/tallison/util/ReloadFetchStatusTable.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package org.tallison.util;
18 |
19 | import java.sql.Connection;
20 | import java.sql.DriverManager;
21 | import java.sql.Statement;
22 |
23 | import org.tallison.cc.CCFileFetcher;
24 |
25 | /**
26 | * For dev use only. This loads a new status table for when there are changes
27 | * to CCFileFetcher.STATUS
28 | */
29 | public class ReloadFetchStatusTable {
30 |
31 | public static void main(String[] args) throws Exception {
32 | Connection connection = DriverManager.getConnection(args[0]);
33 | try (Statement st = connection.createStatement()) {
34 | String sql = "drop table if exists cc_fetch_status";
35 | st.execute(sql);
36 |
37 | sql = "create table cc_fetch_status " + "(id integer primary key, status varchar(64));";
38 | st.execute(sql);
39 |
40 |
41 | for (CCFileFetcher.FETCH_STATUS status : CCFileFetcher.FETCH_STATUS.values()) {
42 |
43 | sql = "insert into cc_fetch_status values (" + status.ordinal() + ",'" +
44 | status.name() + "');";
45 | st.execute(sql);
46 | }
47 | }
48 | }
49 | }
50 |
--------------------------------------------------------------------------------
/commoncrawl-fetcher/src/main/resources/log4j2.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
--------------------------------------------------------------------------------
/commoncrawl-fetcher/src/main/resources/selectFetchAndFetchStatus.sql:
--------------------------------------------------------------------------------
-- fetch results together with the name of their fetch status
SELECT f.id,
       f.fetched_digest,
       f.fetched_length,
       s.status
FROM cc_fetch f
INNER JOIN cc_fetch_status s ON f.status_id = s.id
--------------------------------------------------------------------------------
/commoncrawl-fetcher/src/main/resources/selectFilesToFetchFromCC.sql:
--------------------------------------------------------------------------------
-- Limited query that pulls back just enough information to
-- extract the right files from Common Crawl's warc files.
-- Returns only rows with status 200, an empty truncated name,
-- and no existing cc_fetch row.
SELECT u.id,
       digest AS cc_index_digest,
       w.name AS warc_file_name,
       warc_offset,
       warc_length
FROM cc_urls u
INNER JOIN cc_warc_file_name w ON u.warc_file_name = w.id
INNER JOIN cc_truncated t ON u.truncated = t.id
LEFT OUTER JOIN cc_fetch f ON f.id = u.id
WHERE f.id IS NULL
  AND u.status = 200
  AND length(t.name) = 0
ORDER BY w.name, warc_offset
--------------------------------------------------------------------------------
/commoncrawl-fetcher/src/main/resources/selectFilesToFetchPerWarcId.sql:
--------------------------------------------------------------------------------
-- Rows with status 200 that have no cc_fetch row yet, including the
-- truncated name, ordered by warc file id and offset.
SELECT u.id,
       digest AS cc_index_digest,
       w.name AS warc_file_name,
       warc_offset,
       warc_length,
       t.name AS cc_truncated
FROM cc_urls u
INNER JOIN cc_warc_file_name w ON u.warc_file_name = w.id
INNER JOIN cc_truncated t ON u.truncated = t.id
LEFT OUTER JOIN cc_fetch f ON f.id = u.id
WHERE f.id IS NULL
  AND u.status = 200
ORDER BY w.id, warc_offset
--------------------------------------------------------------------------------
/commoncrawl-fetcher/src/main/resources/selectIndexedAndFetchedData.sql:
--------------------------------------------------------------------------------
-- Full query of the useful information gathered from the indices,
-- joined with the fetch results where a fetch row exists.
SELECT u.id,
       url,
       digest AS cc_index_digest,
       f.fetched_digest,
       u.status AS http_status,
       m.name AS mime,
       dm.name AS detected_mime,
       t.name AS truncated,
       w.name AS warc_file_name,
       warc_offset,
       warc_length,
       l.name AS languages,
       f.fetched_length,
       s.status AS fetched_status
FROM cc_urls u
INNER JOIN cc_warc_file_name w ON u.warc_file_name = w.id
INNER JOIN cc_mimes m ON u.mime = m.id
INNER JOIN cc_detected_mimes dm ON u.detected_mime = dm.id
INNER JOIN cc_truncated t ON u.truncated = t.id
INNER JOIN cc_languages l ON u.languages = l.id
LEFT OUTER JOIN cc_fetch f ON f.id = u.id
LEFT OUTER JOIN cc_fetch_status s ON f.status_id = s.id
WHERE u.status = 200
  AND length(t.name) = 0
ORDER BY w.name, warc_offset
--------------------------------------------------------------------------------
/commoncrawl-fetcher/src/main/resources/selectIndexedData.sql:
--------------------------------------------------------------------------------
-- Full query of the useful information gathered from the indices
-- (no fetch results).
SELECT u.id,
       url,
       digest AS cc_index_digest,
       status AS http_status,
       m.name AS mime,
       dm.name AS detected_mime,
       t.name AS truncated,
       w.name AS warc_file_name,
       warc_offset,
       warc_length,
       l.name AS languages
FROM cc_urls u
INNER JOIN cc_warc_file_name w ON u.warc_file_name = w.id
INNER JOIN cc_mimes m ON u.mime = m.id
INNER JOIN cc_detected_mimes dm ON u.detected_mime = dm.id
INNER JOIN cc_truncated t ON u.truncated = t.id
INNER JOIN cc_languages l ON u.languages = l.id
WHERE status = 200
  AND length(t.name) = 0
ORDER BY w.name, warc_offset
--------------------------------------------------------------------------------
/commoncrawl-fetcher/src/main/resources/selectWarcFileIdsToFetchFromCC.sql:
--------------------------------------------------------------------------------
-- every warc file id, in ascending order
SELECT w.id
FROM cc_warc_file_name w
ORDER BY w.id
--------------------------------------------------------------------------------
/commoncrawl-fetcher/src/test/resources/examples/mpeg-filters.json:
--------------------------------------------------------------------------------
1 | {
2 | "status": [200,300,400],
3 | "exact" : {
4 | "detected_mimes": [
5 | "video/mp4",
6 | "video/quicktime"
7 | ],
8 | "case_sensitive" : false
9 | }
10 | }
--------------------------------------------------------------------------------
/commoncrawl-fetcher/src/test/resources/examples/tika-config-fetch-fs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | id
5 | warc_file_name
6 | warc_offset
7 | warc_end_offset
8 | hf
9 | fse
10 | jdbc:sqlite:/Users/allison/Desktop/demo-backup.db
11 |
13 |
16 |
31 |
32 |
33 |
34 |
35 |
36 | hf
37 |
38 |
39 |
40 |
41 |
42 |
43 | fse
44 | /Users/allison/data/cc/docs
45 | skip
46 |
47 |
48 |
49 |
--------------------------------------------------------------------------------
/commoncrawl-fetcher/src/test/resources/examples/tika-config-index-fs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | fs1
5 | /Users/allison/data/cc/CC-MAIN-2022-27
6 |
7 |
8 |
9 |
10 |
11 | fs1
12 | /Users/allison/data/cc/CC-MAIN-2022-27
13 |
14 |
15 |
16 |
--------------------------------------------------------------------------------
/commoncrawl-fetcher/src/test/resources/examples/tika-config-index-s3.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | fs1
5 |
6 | https://data.commoncrawl.org/crawl-data/CC-MAIN-2022-27/cc-index.paths.gz
7 | https://data.commoncrawl.org/crawl-data/CC-MAIN-2022-21/cc-index.paths.gz
8 | https://data.commoncrawl.org/crawl-data/CC-MAIN-2022-05/cc-index.paths.gz
9 | https://data.commoncrawl.org/crawl-data/CC-MAIN-2021-49/cc-index.paths.gz
10 | https://data.commoncrawl.org/crawl-data/CC-MAIN-2021-43/cc-index.paths.gz
11 | https://data.commoncrawl.org/crawl-data/CC-MAIN-2021-39/cc-index.paths.gz
12 | https://data.commoncrawl.org/crawl-data/CC-MAIN-2021-31/cc-index.paths.gz
13 | https://data.commoncrawl.org/crawl-data/CC-MAIN-2021-25/cc-index.paths.gz
14 | https://data.commoncrawl.org/crawl-data/CC-MAIN-2021-21/cc-index.paths.gz
15 | https://data.commoncrawl.org/crawl-data/CC-MAIN-2021-17/cc-index.paths.gz
16 | https://data.commoncrawl.org/crawl-data/CC-MAIN-2021-10/cc-index.paths.gz
17 | https://data.commoncrawl.org/crawl-data/CC-MAIN-2021-04/cc-index.paths.gz
18 |
19 |
20 |
21 |
22 |
23 |
24 | fs1
25 | commoncrawl
26 | profile
27 | saml-pub
28 | us-east-1
29 |
30 |
31 |
32 |
--------------------------------------------------------------------------------
/commoncrawl-fetcher/src/test/resources/examples/tika-config-refetch-fs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | hf
6 | 10
7 |
8 | 10000000000
9 | 300000
10 |
11 |
12 |
13 |
14 |
15 |
16 | fse
17 | /Users/allison/data/cc/docs/CC-MAIN-2022-27
18 | skip
19 |
20 |
21 |
22 |
--------------------------------------------------------------------------------
/commoncrawl-fetcher/src/test/resources/test-documents/mime-filters-av.json:
--------------------------------------------------------------------------------
1 | {
2 | "status": [200,300,400],
3 | "regex" : {
4 | "mimes": [
5 | "\\Aaudio",
6 | "\\Avideo"
7 | ],
8 | "detected_mimes": [
9 | "\\Aaudio",
10 | "\\Avideo"
11 | ]
12 | }
13 | }
--------------------------------------------------------------------------------
/commoncrawl-fetcher/src/test/resources/test-documents/mime-filters.json:
--------------------------------------------------------------------------------
1 | {
2 | "status": [200,300,400],
3 | "exact" : {
4 | "mimes": [
5 | "application/pdf"
6 | ],
7 | "detected_mimes": [
8 | "application/pdf"
9 | ],
10 | "case_sensitive" : false
11 | },
12 | "regex" : {
13 | "mimes": [
14 | "(?i)pdf\\Z"
15 | ],
16 | "detected_mimes": [
17 | "(?i)pdf\\Z"
18 | ]
19 | }
20 | }
--------------------------------------------------------------------------------
/commoncrawl-fetcher/src/test/resources/test-documents/pdf-filter-sample.json:
--------------------------------------------------------------------------------
1 | {
2 | "exact" : {
3 | "mimes": [
4 | {"pattern": "application/pdf", "probability": 0.1}
5 | ],
6 | "detected_mimes": [
7 | {"pattern": "application/pdf", "probability": 0.1}
8 | ],
9 | "case_sensitive" : false
10 | },
11 | "status": 200
12 | }
--------------------------------------------------------------------------------
/commoncrawl-fetcher/src/test/resources/test-documents/pdf-filter.json:
--------------------------------------------------------------------------------
1 | {
2 | "exact" : {
3 | "mimes": [
4 | "application/pdf"
5 | ],
6 | "detected_mimes": [
7 | "application/pdf"
8 | ],
9 | "case_sensitive" : false
10 | },
11 | "defaultInclude": false
12 | }
--------------------------------------------------------------------------------
/commoncrawl-fetcher/src/test/resources/test-documents/status-filter.json:
--------------------------------------------------------------------------------
1 | {
2 | "status": 200
3 | }
--------------------------------------------------------------------------------
/commoncrawl-fetcher/src/test/resources/test-documents/status-sample-filter.json:
--------------------------------------------------------------------------------
1 | {
2 | "status": 200,
3 | "probability": 0.001
4 | }
--------------------------------------------------------------------------------
/ingest/src/main/java/org/tallison/ingest/CompositeFeatureMapper.java:
--------------------------------------------------------------------------------
1 | package org.tallison.ingest;
2 |
3 | import org.apache.tika.config.ServiceLoader;
4 | import org.apache.tika.pipes.fetcher.Fetcher;
5 | import org.tallison.quaerite.core.StoredDocument;
6 |
7 | import java.nio.file.Path;
8 | import java.sql.ResultSet;
9 | import java.sql.SQLException;
10 | import java.util.List;
11 | import java.util.Map;
12 |
13 | public class CompositeFeatureMapper implements FeatureMapper {
14 | private static final ServiceLoader DEFAULT_LOADER =
15 | new ServiceLoader(FeatureMapper.class.getClassLoader());
16 |
17 | List mappers;
18 |
19 | public CompositeFeatureMapper() {
20 | this(DEFAULT_LOADER.loadServiceProviders(FeatureMapper.class));
21 | }
22 |
23 | public CompositeFeatureMapper(List mappers) {
24 | this.mappers = mappers;
25 | }
26 |
27 | @Override
28 | public void addFeatures(Map row, Fetcher fetcher,
29 | StoredDocument storedDocument) throws SQLException {
30 | for (FeatureMapper mapper : mappers) {
31 | mapper.addFeatures(row, fetcher, storedDocument);
32 | }
33 | }
34 | }
35 |
--------------------------------------------------------------------------------
/ingest/src/main/java/org/tallison/ingest/FeatureMapper.java:
--------------------------------------------------------------------------------
1 | package org.tallison.ingest;
2 |
3 | import org.apache.tika.pipes.fetcher.Fetcher;
4 | import org.tallison.quaerite.core.StoredDocument;
5 |
6 | import java.nio.file.Path;
7 | import java.sql.ResultSet;
8 | import java.sql.SQLException;
9 | import java.util.Map;
10 |
11 | public interface FeatureMapper {
12 |
13 | public static final String REL_PATH_KEY = "relpath";
14 | public static final String ID_KEY = "id";
15 | void addFeatures(Map row, Fetcher fetcher, StoredDocument storedDocument) throws SQLException;
16 | }
17 |
--------------------------------------------------------------------------------
/ingest/src/main/java/org/tallison/ingest/mappers/CPUMapper.java:
--------------------------------------------------------------------------------
1 | package org.tallison.ingest.mappers;
2 |
3 | import org.apache.tika.pipes.fetcher.Fetcher;
4 | import org.tallison.ingest.FeatureMapper;
5 | import org.tallison.quaerite.core.StoredDocument;
6 |
7 | import java.nio.file.Path;
8 | import java.sql.ResultSet;
9 | import java.sql.SQLException;
10 | import java.util.Map;
11 |
12 | /**
13 | *
14 | */
15 | public class CPUMapper implements FeatureMapper {
16 |
17 | @Override
18 | public void addFeatures(Map row, Fetcher fetcher,
19 | StoredDocument storedDocument) throws SQLException {
20 | String val = row.get("cpu_warn");
21 | storedDocument.addNonBlankField("cpu_warn", val);
22 |
23 | }
24 | }
25 |
--------------------------------------------------------------------------------
/ingest/src/main/java/org/tallison/ingest/mappers/CaradocMapper.java:
--------------------------------------------------------------------------------
1 | package org.tallison.ingest.mappers;
2 |
3 | import org.apache.tika.pipes.fetcher.Fetcher;
4 | import org.tallison.ingest.FeatureMapper;
5 | import org.tallison.quaerite.core.StoredDocument;
6 |
7 | import java.nio.file.Path;
8 | import java.sql.ResultSet;
9 | import java.sql.SQLException;
10 | import java.util.Map;
11 |
12 | public class CaradocMapper implements FeatureMapper {
13 |
14 | @Override
15 | public void addFeatures(Map row, Fetcher fetcher,
16 | StoredDocument storedDocument) throws SQLException {
17 | String val = row.get("cd");
18 | storedDocument.addNonBlankField("cd", val);
19 | val = row.get("cd_warn");
20 | storedDocument.addNonBlankField("cd_warn", val);
21 | }
22 | }
23 |
--------------------------------------------------------------------------------
/ingest/src/main/java/org/tallison/ingest/mappers/ClamAVMapper.java:
--------------------------------------------------------------------------------
1 | package org.tallison.ingest.mappers;
2 |
3 |
4 | import org.apache.tika.pipes.fetcher.Fetcher;
5 | import org.tallison.ingest.FeatureMapper;
6 | import org.tallison.quaerite.core.StoredDocument;
7 |
8 | import java.nio.file.Path;
9 | import java.sql.ResultSet;
10 | import java.sql.SQLException;
11 | import java.util.Map;
12 |
13 | public class ClamAVMapper implements FeatureMapper {
14 |
15 | @Override
16 | public void addFeatures(Map row, Fetcher fetcher,
17 | StoredDocument storedDocument) throws SQLException {
18 | String val = row.get("clamav");
19 | storedDocument.addNonBlankField("clamav", val);
20 | }
21 | }
22 |
--------------------------------------------------------------------------------
/ingest/src/main/java/org/tallison/ingest/mappers/ESUtil.java:
--------------------------------------------------------------------------------
1 | package org.tallison.ingest.mappers;
2 |
/**
 * String clean-up helpers.
 */
public class ESUtil {

    /**
     * Replaces the control characters U+0000, U+001F and U+001E with the
     * visible text {@code u0000}, {@code u001f} and {@code u001e}. Despite the
     * method name, the characters are substituted rather than removed.
     *
     * @param s the text to clean; may be {@code null}
     * @return the cleaned text, or the empty string when {@code s} is {@code null}
     */
    public static String stripIllegalUnicode(String s) {
        if (s == null) {
            return "";
        }
        // literal replacement: String.replace avoids compiling a regex per call
        return s.replace("\u0000", "u0000")
                .replace("\u001f", "u001f")
                .replace("\u001e", "u001e");
    }
}
14 |
--------------------------------------------------------------------------------
/ingest/src/main/java/org/tallison/ingest/mappers/MutoolMapper.java:
--------------------------------------------------------------------------------
1 | package org.tallison.ingest.mappers;
2 |
3 | import org.apache.tika.pipes.fetcher.Fetcher;
4 | import org.tallison.ingest.FeatureMapper;
5 | import org.tallison.quaerite.core.StoredDocument;
6 |
7 | import java.nio.file.Path;
8 | import java.sql.ResultSet;
9 | import java.sql.SQLException;
10 | import java.util.Map;
11 |
12 | /**
13 | * this should cover both mutool clean -s and mutool text
14 | * we aren't currently indexing text as extrated by mutool text
15 | */
16 | public class MutoolMapper implements FeatureMapper {
17 |
18 | @Override
19 | public void addFeatures(Map row, Fetcher fetcher, StoredDocument storedDocument) throws SQLException {
20 | String val = row.get("mc_warn");
21 | storedDocument.addNonBlankField("mc_warn", val);
22 | val = row.get("mt_warn");
23 | storedDocument.addNonBlankField("mt_warn", val);
24 |
25 | }
26 | }
27 |
--------------------------------------------------------------------------------
/ingest/src/main/java/org/tallison/ingest/mappers/PDFMinerMapper.java:
--------------------------------------------------------------------------------
1 | package org.tallison.ingest.mappers;
2 |
3 | import org.apache.tika.pipes.fetcher.Fetcher;
4 | import org.tallison.ingest.FeatureMapper;
5 | import org.tallison.quaerite.core.StoredDocument;
6 |
7 | import java.nio.file.Path;
8 | import java.sql.ResultSet;
9 | import java.sql.SQLException;
10 | import java.util.Map;
11 |
12 | /**
13 | * this should cover both pdfminer dump and pdfminer text
14 | * we aren't currently indexing anything but the warning msgs
15 | */
16 | public class PDFMinerMapper implements FeatureMapper {
17 |
18 | @Override
19 | public void addFeatures(Map row, Fetcher fetcher, StoredDocument storedDocument) throws SQLException {
20 | String val = row.get("pmd_warn");
21 | storedDocument.addNonBlankField("pmd_warn", val);
22 | val = row.get("pmt_warn");
23 | storedDocument.addNonBlankField("pmt_warn", val);
24 |
25 | }
26 | }
27 |
--------------------------------------------------------------------------------
/ingest/src/main/java/org/tallison/ingest/mappers/PDFResurrectMapper.java:
--------------------------------------------------------------------------------
1 | package org.tallison.ingest.mappers;
2 |
3 | import java.sql.SQLException;
4 | import java.util.Map;
5 | import java.util.regex.Matcher;
6 | import java.util.regex.Pattern;
7 |
8 | import org.apache.tika.pipes.fetcher.Fetcher;
9 | import org.tallison.ingest.FeatureMapper;
10 | import org.tallison.quaerite.core.StoredDocument;
11 |
12 | public class PDFResurrectMapper implements FeatureMapper {
13 | @Override
14 | public void addFeatures(Map row, Fetcher fetcher, StoredDocument storedDocument)
15 | throws SQLException {
16 | String stdout = row.get("pr");
17 | if (stdout == null) {
18 | return;
19 | }
20 | Matcher m = Pattern.compile(": (\\d+)").matcher(stdout);
21 | if (m.find()) {
22 | storedDocument.addNonBlankField("pr_updates", m.group(1));
23 | }
24 | }
25 | }
26 |
--------------------------------------------------------------------------------
/ingest/src/main/java/org/tallison/ingest/mappers/ProfileFeatureMapper.java:
--------------------------------------------------------------------------------
1 | package org.tallison.ingest.mappers;
2 |
3 | import org.apache.tika.pipes.fetcher.Fetcher;
4 | import org.tallison.ingest.FeatureMapper;
5 | import org.tallison.quaerite.core.StoredDocument;
6 |
7 | import java.nio.file.Path;
8 | import java.sql.ResultSet;
9 | import java.sql.SQLException;
10 | import java.util.ArrayList;
11 | import java.util.Collections;
12 | import java.util.HashMap;
13 | import java.util.List;
14 | import java.util.Map;
15 | import java.util.regex.Matcher;
16 | import java.util.regex.Pattern;
17 |
18 | import static org.tallison.ingest.mappers.QPDFFeatureMapper.joinWith;
19 |
20 | public class ProfileFeatureMapper implements FeatureMapper {
21 | @Override
22 | public void addFeatures(Map row, Fetcher fetcher, StoredDocument storedDocument) throws SQLException {
23 |
24 | storedDocument.addNonBlankField("fname", row.get("fname"));
25 | storedDocument.addNonBlankField("original_fname", row.get("fname"));
26 | storedDocument.addNonBlankField("shasum_256", row.get("shasum_256"));
27 | storedDocument.addNonBlankField("size", row.get("size"));
28 | storedDocument.addNonBlankField("collection", row.get("collection"));
29 | //these are all commoncrawl/web crawl specific... factor into another mapper?
30 | storedDocument.addNonBlankField("host_location", row.get("host_location"));
31 | storedDocument.addNonBlankField("country", row.get("country"));
32 | storedDocument.addNonBlankField("tld", row.get("tld"));
33 | storedDocument.addNonBlankField("detected_mime", row.get("detected_mime"));
34 | storedDocument.addNonBlankField("url", row.get("url"));
35 | }
36 | }
37 |
--------------------------------------------------------------------------------
/ingest/src/main/java/org/tallison/ingest/mappers/UniverseMapper.java:
--------------------------------------------------------------------------------
1 | package org.tallison.ingest.mappers;
2 |
3 | import java.sql.SQLException;
4 | import java.util.Map;
5 |
6 | import org.tallison.ingest.FeatureMapper;
7 | import org.tallison.quaerite.core.StoredDocument;
8 |
9 | import org.apache.tika.pipes.fetcher.Fetcher;
10 |
11 | public class UniverseMapper implements FeatureMapper {
12 | @Override
13 | public void addFeatures(Map row, Fetcher fetcher, StoredDocument storedDocument)
14 | throws SQLException {
15 | storedDocument.addNonBlankField("universe", row.get("universe"));
16 | storedDocument.addNonBlankField("universe_validity",
17 | row.get("universe_validity"));
18 | }
19 | }
20 |
--------------------------------------------------------------------------------
/ingest/src/main/java/org/tallison/ingest/qpdf/QPDFResults.java:
--------------------------------------------------------------------------------
1 | package org.tallison.ingest.qpdf;
2 |
3 | import java.util.HashSet;
4 | import java.util.Set;
5 |
/**
 * Plain mutable holder with five public sets ({@code keys},
 * {@code parentAndKeys}, {@code typeKeys}, {@code keyValues},
 * {@code filters}) and a {@code maxFilterCount}. All sets start empty and
 * the count starts at zero.
 */
public class QPDFResults {

    public Set keys = new HashSet<>();
    public Set parentAndKeys = new HashSet<>();
    public Set typeKeys = new HashSet<>();
    public Set keyValues = new HashSet<>();
    public Set filters = new HashSet<>();
    public int maxFilterCount = 0;

    /** @return every field and its current value on a single line */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("QPDFResults{");
        text.append("keys=").append(keys);
        text.append(", parentAndKeys=").append(parentAndKeys);
        text.append(", typeKeys=").append(typeKeys);
        text.append(", keyValues=").append(keyValues);
        text.append(", filters=").append(filters);
        text.append(", maxFilterCount=").append(maxFilterCount);
        text.append('}');
        return text.toString();
    }
}
22 |
--------------------------------------------------------------------------------
/ingest/src/main/java/org/tallison/ingest/qpdf10/qpdf/QPDFResults.java:
--------------------------------------------------------------------------------
1 | package org.tallison.ingest.qpdf10.qpdf;
2 |
3 | import java.util.HashSet;
4 | import java.util.Set;
5 |
/**
 * Plain mutable holder with five public sets ({@code keys},
 * {@code parentAndKeys}, {@code typeKeys}, {@code keyValues},
 * {@code filters}) and a {@code maxFilterCount}. All sets start empty and
 * the count starts at zero.
 */
public class QPDFResults {

    public Set keys = new HashSet<>();
    public Set parentAndKeys = new HashSet<>();
    public Set typeKeys = new HashSet<>();
    public Set keyValues = new HashSet<>();
    public Set filters = new HashSet<>();
    public int maxFilterCount = 0;

    /** @return every field and its current value on a single line */
    @Override
    public String toString() {
        StringBuilder out = new StringBuilder();
        out.append("QPDFResults{");
        out.append("keys=").append(keys);
        out.append(", parentAndKeys=").append(parentAndKeys);
        out.append(", typeKeys=").append(typeKeys);
        out.append(", keyValues=").append(keyValues);
        out.append(", filters=").append(filters);
        out.append(", maxFilterCount=").append(maxFilterCount);
        return out.append('}').toString();
    }
}
22 |
--------------------------------------------------------------------------------
/ingest/src/main/resources/META-INF/services/org.tallison.ingest.FeatureMapper:
--------------------------------------------------------------------------------
1 | #org.tallison.ingest.mappers.ArlingtonMapper
2 | #org.tallison.ingest.mappers.CaradocMapper
3 | #org.tallison.ingest.mappers.ClamAVMapper
4 | #org.tallison.ingest.mappers.CPUMapper
5 | #org.tallison.ingest.mappers.MutoolMapper
6 | #org.tallison.ingest.mappers.PDFBytesMapper
7 | #org.tallison.ingest.mappers.PDFCheckerMapper
8 | org.tallison.ingest.mappers.PDFInfoFeatureMapper
9 | #org.tallison.ingest.mappers.PDFMinerMapper
10 | org.tallison.ingest.mappers.ProfileFeatureMapper
11 | org.tallison.ingest.mappers.QPDFFeatureMapper
12 | org.tallison.ingest.mappers.StatusFeatureMapper
13 | #org.tallison.ingest.mappers.TikaFeatureMapper
14 | #org.tallison.ingest.mappers.MultiCompareMapper
15 | #org.tallison.ingest.mappers.PDFResurrectMapper
16 | #org.tallison.ingest.mappers.PDFFontsMapper
17 | #org.tallison.ingest.mappers.XPDFFontsMapper
18 | #org.tallison.ingest.mappers.UniverseMapper
--------------------------------------------------------------------------------
/ingest/src/main/resources/important-int-keys.txt:
--------------------------------------------------------------------------------
1 | /BitsPerComponent
2 | /BitsPerCoordinate
3 | /BitsPerSample
4 | /ca
5 | /CA
6 | /Colors
7 | /ColorTransform
8 | /Count
9 | /Descent
10 | /EarlyChange
11 | /F
12 | /Ff
13 | /FL
14 | /FontWeight
15 | /FormType
16 | /FunctionType
17 | /Gamma
18 | /HalftoneType
19 | /I
20 | /LC
21 | /Length
22 | /LJ
23 | /LW
24 | /M
25 | /ML
26 | /N
27 | /O
28 | /OPM
29 | /Order
30 | /P
31 | /PaintType
32 | /PatternType
33 | /Penalty
34 | /Position
35 | /Predictor
36 | /Q
37 | /R
38 | /Rotate
39 | /RT
40 | /S
41 | /ShadingType
42 | /SM
43 | /SMaskInData
44 | /St
45 | /TilingType
46 | /TP
47 | /UserUnit
48 | /V
49 | /Version
50 | /VerticesPerRow
51 | /Volume
52 | /W
53 | /WMode
--------------------------------------------------------------------------------
/ingest/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Root logger option
2 | log4j.rootLogger=INFO, stdout
3 |
4 | # Direct log messages to stdout
5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
6 | log4j.appender.stdout.Target=System.out
7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %t %-5p %c{1}:%L - %m%n
9 |
--------------------------------------------------------------------------------
/ingest/src/main/resources/selectStar-minimal.sql:
--------------------------------------------------------------------------------
1 | select u.id, -- joins each profiles row to its fetch, url, host, pdfinfo and qpdf rows (see the joins below)
2 | u.url as url,
3 | 's3://safedocs-cc-202109/'||p.path as fname, -- bucket prefix is hard-coded
4 | p.path as relpath,
5 | fetched_digest as shasum_256,
6 | 'CC-MAIN-2021-31' as collection, -- constant collection label
7 | fetched_length as size,
8 | case -- host_location is 'latitude,longitude', or '' when latitude is null
9 | when latitude is null
10 | then ''
11 | else latitude||','||longitude
12 | end as host_location,
13 | h.tld, h.country,
14 | pinfo.stderr pinfo_stderr,
15 | pinfo.stdout pinfo_stdout,
16 | pinfo.exit_value pinfo_exit,
17 | case -- pdfinfo outcome; branches are tested in order and the first match wins
18 | when pinfo.stderr like 'Command Line Error: Incorrect password%' then 'encrypted'
19 | when pinfo.path is null then 'missing' -- NOTE(review): pdfinfo is inner-joined on path below, so this branch looks unreachable unless that join becomes a left join; confirm intent
20 | when pinfo.timeout=true then 'timeout'
21 | when pinfo.exit_value <> 0 then 'crash'
22 | when length(pinfo.stderr) > 5 then 'warn' -- more than 5 characters of stderr is treated as a warning
23 | else 'success'
24 | end as pinfo_status,
25 | q.stderr q_stderr,
26 | q.exit_value q_exit,
27 | case -- same classification for qpdf, without the 'encrypted' check
28 | when q.path is null then 'missing' -- NOTE(review): qpdf is inner-joined on path below, so this branch looks unreachable as well; confirm intent
29 | when q.timeout=true then 'timeout'
30 | when q.exit_value <> 0 then 'crash'
31 | when length(q.stderr) > 5 then 'warn'
32 | else 'success'
33 | end as q_status
34 | from profiles p
35 | join cc_fetch f on p.path = f.path
36 | join cc_fetch_status s on f.status_id=s.id -- no column of s is selected; NOTE(review): presumably kept only to require a matching status row; confirm
37 | join cc_urls u on f.id=u.id
38 | join cc_hosts h on u.host=h.id
39 | join pdfinfo pinfo on pinfo.path=p.path
40 | join qpdf q on q.path = p.path
41 | order by u.id
--------------------------------------------------------------------------------
/ingest/src/main/resources/selectStar-sample.sql:
--------------------------------------------------------------------------------
1 | select
2 |     u.id as id,
3 |     'CC-MAIN-2021-31-sample' as collection,
4 |     -- 'UNKNOWN' when the detected mime name is null or empty
5 |     case
6 |         when length(m.name) > 0 then m.name
7 |         else 'UNKNOWN'
8 |     end as detected_mime,
9 |     -- 'latitude,longitude', or '' when latitude is null
10 |     case
11 |         when latitude is null then ''
12 |         else latitude||','||longitude
13 |     end as host_location,
14 |     h.tld,
15 |     -- 'UNKNOWN' when the host has no country
16 |     coalesce(h.country, 'UNKNOWN') as country
17 | from sample.cc_urls u
18 |     inner join sample.cc_hosts h on h.id = u.host
19 |     inner join sample.cc_detected_mimes m on m.id = u.detected_mime
20 |
--------------------------------------------------------------------------------
/ingest/src/test/java/org/tallison/ingest/mappers/ArlingtonMapperTest.java:
--------------------------------------------------------------------------------
1 | package org.tallison.ingest.mappers;
2 |
3 | import org.junit.Test;
4 | import org.tallison.quaerite.core.StoredDocument;
5 |
6 | import java.util.List;
7 |
8 | import static org.junit.Assert.assertEquals;
9 | import static org.junit.Assert.assertTrue;
10 |
11 | /**
12 |  * Tests for {@link ArlingtonMapper}, run against captured Arlington TestGrammar reports
13 |  * stored under {@code test-documents/arlington}.
14 |  */
15 | public class ArlingtonMapperTest extends MapperTest {
16 |
17 |     /** Expects the report's single error line to be recorded as the {@code a_warn} value. */
18 |     @Test
19 |     public void testBasic() throws Exception {
20 |         ArlingtonMapper mapper = new ArlingtonMapper();
21 |         StoredDocument sd = new StoredDocument("");
22 |         mapper._processFile(getPath("arlington/GHOSTSCRIPT-687647-0.pdf.txt"), sd);
23 |         assertEquals("Can't select any link", sd.getFields().get("a_warn"));
24 |     }
25 |
26 |     /** Expects a report containing a "Failed to open" error to yield {@code a_status} "fail". */
27 |     @Test
28 |     public void testFailedToOpen() throws Exception {
29 |         ArlingtonMapper mapper = new ArlingtonMapper();
30 |         StoredDocument sd = new StoredDocument("");
31 |         mapper._processFile(getPath("arlington/GHOSTSCRIPT-688076-1.pdf.txt"), sd);
32 |         assertEquals("fail", sd.getFields().get("a_status"));
33 |     }
34 |
35 |     /** Expects the {@code a_warn} list to include the "two different contexts" warning. */
36 |     @Test
37 |     public void testDiffContexts() throws Exception {
38 |         ArlingtonMapper mapper = new ArlingtonMapper();
39 |         StoredDocument sd = new StoredDocument("");
40 |         mapper._processFile(getPath("arlington/GHOSTSCRIPT-687499-0.pdf.txt"), sd);
41 |         // the field map is untyped; this test treats a_warn as a list of strings
42 |         @SuppressWarnings("unchecked")
43 |         List<String> warnings = (List<String>) sd.getFields().get("a_warn");
44 |         assertTrue(warnings.contains("object validated in two different contexts"));
45 |     }
46 | }
47 |
--------------------------------------------------------------------------------
/ingest/src/test/java/org/tallison/ingest/mappers/MapperTest.java:
--------------------------------------------------------------------------------
1 | package org.tallison.ingest.mappers;
2 |
3 | import static org.junit.Assert.assertEquals;
4 |
5 | import java.io.IOException;
6 | import java.io.InputStream;
7 | import java.net.URISyntaxException;
8 | import java.nio.file.Files;
9 | import java.nio.file.Paths;
10 | import java.time.Instant;
11 | import java.time.LocalDateTime;
12 | import java.time.ZoneId;
13 | import java.time.format.DateTimeFormatter;
14 | import java.util.Locale;
15 | import org.junit.Test;
16 |
17 | /**
18 |  * Shared base for the mapper tests: resolves fixtures below {@code /test-documents/} on the
19 |  * classpath. It also holds a date-parsing sanity test.
20 |  */
21 | public class MapperTest {
22 |
23 |     /**
24 |      * Opens a fixture as a stream. Despite its name, this returns the opened stream, not a path.
25 |      *
26 |      * @param relPath fixture location relative to {@code /test-documents/}
27 |      * @return an open stream; the caller is responsible for closing it
28 |      * @throws IOException if the fixture is not on the classpath or cannot be opened
29 |      */
30 |     InputStream getPath(String relPath) throws IOException {
31 |         String path = "/test-documents/" + relPath;
32 |         // getResource returns null for a missing fixture; report the path instead of an NPE
33 |         java.net.URL resource = this.getClass().getResource(path);
34 |         if (resource == null) {
35 |             throw new IOException("test resource not found on classpath: " + path);
36 |         }
37 |         try {
38 |             return Files.newInputStream(Paths.get(resource.toURI()));
39 |         } catch (URISyntaxException e) {
40 |             throw new IOException(e);
41 |         }
42 |     }
43 |
44 |     /** Parses a whitespace-normalized date string and checks the resulting UTC instant. */
45 |     @Test
46 |     public void testDateParsing() throws Exception {
47 |         String v = "Mon Apr 1 22:12:30 2013 UTC";
48 |         v = v.replaceAll("\\s+", " ").trim();
49 |         DateTimeFormatter formatter =
50 |                 DateTimeFormatter.ofPattern("EEE MMM d HH:mm:ss yyyy z", Locale.US);
51 |         Instant instant = LocalDateTime.parse(v, formatter)
52 |                 .atZone(ZoneId.of("UTC"))
53 |                 .toInstant();
54 |         // assert the value so that a wrong parse actually fails the test
55 |         assertEquals(Instant.parse("2013-04-01T22:12:30Z"), instant);
56 |     }
57 | }
58 |
--------------------------------------------------------------------------------
/ingest/src/test/java/org/tallison/ingest/mappers/PDFCheckerMapperTest.java:
--------------------------------------------------------------------------------
1 | package org.tallison.ingest.mappers;
2 |
3 | import org.apache.tika.io.TikaInputStream;
4 | import org.junit.Test;
5 | import org.tallison.ingest.mappers.PDFCheckerMapper;
6 | import org.tallison.quaerite.core.StoredDocument;
7 |
8 | import java.io.InputStream;
9 | import java.nio.file.Path;
10 | import java.nio.file.Paths;
11 |
12 | import static org.junit.Assert.assertTrue;
13 |
14 | /**
15 |  * Checks the {@code pc_summary_info} field that {@link PDFCheckerMapper} derives from
16 |  * stored PDF Checker JSON reports.
17 |  */
18 | public class PDFCheckerMapperTest {
19 |
20 |     @Test
21 |     public void testBasic() throws Exception {
22 |         assertSummaryFlags("/test-documents/pdfchecker/GHOSTSCRIPT-696838-0.zip-0.pdf.json");
23 |     }
24 |
25 |     @Test
26 |     public void testFonts() throws Exception {
27 |         assertSummaryFlags("/test-documents/pdfchecker/fonts-PDFBOX-1002-2.pdf.json");
28 |     }
29 |
30 |     /**
31 |      * Maps one JSON report and asserts that its summary mentions both
32 |      * "can-be-optimized" and "born-digital".
33 |      */
34 |     private static void assertSummaryFlags(String resource) throws Exception {
35 |         PDFCheckerMapper mapper = new PDFCheckerMapper();
36 |         Path json = Paths.get(PDFCheckerMapperTest.class.getResource(resource).toURI());
37 |         StoredDocument doc = new StoredDocument("id");
38 |         try (InputStream in = TikaInputStream.get(json)) {
39 |             mapper.processJson(in, doc);
40 |         }
41 |         String summary = doc.getFields().get("pc_summary_info").toString();
42 |         assertTrue(summary.contains("can-be-optimized"));
43 |         assertTrue(summary.contains("born-digital"));
44 |     }
45 | }
46 |
--------------------------------------------------------------------------------
/ingest/src/test/java/org/tallison/ingest/mappers/PDFFontsMapperTest.java:
--------------------------------------------------------------------------------
1 | package org.tallison.ingest.mappers;
2 |
3 |
4 | import java.nio.charset.StandardCharsets;
5 | import java.util.HashMap;
6 | import java.util.Map;
7 |
8 | import org.apache.commons.io.IOUtils;
9 | import org.junit.Test;
10 | import org.tallison.quaerite.core.StoredDocument;
11 |
12 | /**
13 |  * Smoke test for {@link PDFFontsMapper}: feeds it captured pdffonts stdout and prints the
14 |  * resulting document.
15 |  */
16 | public class PDFFontsMapperTest extends MapperTest {
17 |
18 |     @Test
19 |     public void testBasic() throws Exception {
20 |         // the mapper reads the tool output from the "pdffonts_stdout" column of the row
21 |         Map row = new HashMap<>();
22 |         row.put("pdffonts_stdout",
23 |                 IOUtils.toString(getPath("pdffonts/test-basic.txt"), StandardCharsets.UTF_8));
24 |
25 |         StoredDocument doc = new StoredDocument("id");
26 |         new PDFFontsMapper().addFeatures(row, null, doc);
27 |         System.out.println(doc);
28 |     }
29 | }
30 |
--------------------------------------------------------------------------------
/ingest/src/test/java/org/tallison/ingest/mappers/QPDFJsonExtractorTest.java:
--------------------------------------------------------------------------------
1 | package org.tallison.ingest.mappers;
2 |
3 | import static org.junit.Assert.assertTrue;
4 |
5 | import java.io.IOException;
6 | import java.io.Reader;
7 | import java.net.URISyntaxException;
8 | import java.nio.charset.StandardCharsets;
9 | import java.nio.file.Files;
10 | import java.nio.file.Path;
11 | import java.nio.file.Paths;
12 |
13 | import org.junit.Test;
14 | import org.tallison.ingest.qpdf.QPDFJsonExtractor;
15 | import org.tallison.ingest.qpdf.QPDFResults;
16 |
17 | /**
18 |  * Tests for {@link QPDFJsonExtractor} against qpdf 11.x JSON (v2) output.
19 |  */
20 | public class QPDFJsonExtractorTest {
21 |
22 |     @Test
23 |     public void testBasic() throws Exception {
24 |         QPDFJsonExtractor extractor = new QPDFJsonExtractor();
25 |         try (Reader json = openFixture("/qpdfv11/qpdf.json")) {
26 |             QPDFResults extracted = extractor.extract("id", json);
27 |             System.out.println(extracted);
28 |             assertTrue(extracted.keyValues.contains("/Creator->Microsoft® Office Word 2007"));
29 |             assertTrue(extracted.keyValues.contains("/CreationDate->DATE"));
30 |         }
31 |     }
32 |
33 |     /** Opens a UTF-8 reader on a fixture located below {@code /test-documents/}. */
34 |     private Reader openFixture(String file) throws IOException {
35 |         final Path fixture;
36 |         try {
37 |             fixture = Paths.get(this.getClass().getResource("/test-documents/" + file).toURI());
38 |         } catch (URISyntaxException e) {
39 |             throw new IOException(e);
40 |         }
41 |         return Files.newBufferedReader(fixture, StandardCharsets.UTF_8);
42 |     }
43 | }
44 |
--------------------------------------------------------------------------------
/ingest/src/test/java/org/tallison/ingest/mappers/XPDFFontsMapperTest.java:
--------------------------------------------------------------------------------
1 | package org.tallison.ingest.mappers;
2 |
3 |
4 | import java.nio.charset.StandardCharsets;
5 | import java.util.HashMap;
6 | import java.util.Map;
7 |
8 | import org.apache.commons.io.IOUtils;
9 | import org.junit.Test;
10 | import org.tallison.quaerite.core.StoredDocument;
11 |
12 | /**
13 |  * Smoke test for {@link XPDFFontsMapper}: feeds it captured xpdf pdffonts stdout and prints
14 |  * the resulting document.
15 |  */
16 | public class XPDFFontsMapperTest extends MapperTest {
17 |
18 |     @Test
19 |     public void testBasic() throws Exception {
20 |         // the mapper reads the tool output from the "xpdffonts_stdout" column of the row
21 |         Map row = new HashMap<>();
22 |         row.put("xpdffonts_stdout",
23 |                 IOUtils.toString(getPath("xpdffonts/test-basic.txt"), StandardCharsets.UTF_8));
24 |
25 |         StoredDocument doc = new StoredDocument("id");
26 |         new XPDFFontsMapper().addFeatures(row, null, doc);
27 |         System.out.println(doc);
28 |     }
29 | }
30 |
--------------------------------------------------------------------------------
/ingest/src/test/resources/test-documents/arlington/GHOSTSCRIPT-687647-0.pdf.txt:
--------------------------------------------------------------------------------
1 | BEGIN - TestGrammar v0.4 built Dec 17 2020 22:39:13 - "/input/bugtrackers/GHOSTSCRIPT/GHOSTSCRIPT-687647-0.pdf" - PDFix v6.1.0
2 | Trailer
3 | Trailer->Root
4 | Trailer->Info
5 | Trailer->Root->Pages
6 | Trailer->Root->Outlines
7 | Trailer->Root->Pages->Kids
8 | Error: Can't select any link from [fn:SinceVersion(1.0,PageTreeNode),fn:SinceVersion(1.0,PageObject)] to validate provided object: [0] for object 4
9 | END
10 |
--------------------------------------------------------------------------------
/ingest/src/test/resources/test-documents/arlington/GHOSTSCRIPT-688076-1.pdf.txt:
--------------------------------------------------------------------------------
1 | BEGIN - TestGrammar v0.4 built Dec 17 2020 22:39:13 - "/input/bugtrackers/GHOSTSCRIPT/GHOSTSCRIPT-688076-1.pdf" - PDFix v6.1.0
2 | Error: Failed to open: "/input/bugtrackers/GHOSTSCRIPT/GHOSTSCRIPT-688076-1.pdf" - PDFix GetError(): Failed to open document.
3 | END
4 |
--------------------------------------------------------------------------------
/simple-ingester/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
5 |
6 | file-observatory
7 | org.tallison
8 | 1.0.0-SNAPSHOT
9 |
10 | 4.0.0
11 |
12 | simple-ingester
13 |
14 |
15 | 11
16 | 11
17 |
18 |
19 |
20 |
21 | org.apache.tika
22 | tika-core
23 |
24 |
25 | org.apache.tika
26 | tika-serialization
27 | ${tika.version}
28 |
29 |
30 | org.apache.httpcomponents
31 | httpclient
32 | 4.5.13
33 |
34 |
35 |
36 |
--------------------------------------------------------------------------------
/simple-ingester/src/main/java/org/tallison/ingester/IngesterCLI.java:
--------------------------------------------------------------------------------
1 | package org.tallison.ingester;
2 |
3 | /**
4 |  * Command-line entry point for the simple ingester. Currently a placeholder that does nothing.
5 |  */
6 | public class IngesterCLI {
7 |
8 |     /**
9 |      * Program entry point; no work is performed yet.
10 |      *
11 |      * @param args command-line arguments (currently ignored)
12 |      */
13 |     public static void main(String[] args) {
14 |         // intentionally empty
15 |     }
16 | }
17 |
--------------------------------------------------------------------------------
/simple-ingester/src/main/java/org/tallison/tika/parser/ConcatenatingParser.java:
--------------------------------------------------------------------------------
1 | package org.tallison.tika.parser;
2 |
3 | import java.util.ArrayList;
4 | import java.util.List;
5 |
6 | import org.apache.tika.metadata.Metadata;
7 | import org.apache.tika.pipes.FetchEmitTuple;
8 |
9 | /**
10 |  * Skeleton of a parser that holds a list of parsers; {@link #parse} does not use them yet
11 |  * and always returns an empty list.
12 |  */
13 | public class ConcatenatingParser {
14 |
15 |     // not populated or read anywhere in this class yet
16 |     private List parsers = new ArrayList<>();
17 |
18 |     /**
19 |      * Placeholder implementation.
20 |      *
21 |      * @param tuple the fetch/emit tuple to parse (currently ignored)
22 |      * @return a new, empty, mutable list
23 |      */
24 |     public List parse(FetchEmitTuple tuple) {
25 |         return new ArrayList<>();
26 |     }
27 | }
28 |
--------------------------------------------------------------------------------
/tika-addons/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
5 |
6 | file-observatory
7 | org.tallison
8 | 1.0.0-SNAPSHOT
9 |
10 | 4.0.0
11 |
12 | tika-addons
13 | pom
14 |
15 | tika-pipes-reporter
16 | tika-eval-multicomparer
17 | tika-server-fuzzer
18 |
19 |
20 |
21 | 11
22 | 11
23 |
24 |
25 |
--------------------------------------------------------------------------------
/tika-addons/tika-eval-multicomparer/src/main/java/org/tallison/tika/eval/multi/ListGenerator.java:
--------------------------------------------------------------------------------
1 | package org.tallison.tika.eval.multi;
2 |
3 | import java.io.File;
4 | import java.util.HashSet;
5 | import java.util.Set;
6 |
7 | public class ListGenerator {
8 |
9 | public static void main(String[] args) throws Exception {
10 | Set seen = new HashSet<>();
11 | File tools = new File(".../data/extracts");
12 | for (File tool : tools.listFiles()) {
13 | for (File c : tool.listFiles()) {
14 | for (File e : c.listFiles()) {
15 | String n = e.getName().replaceAll(".json", "").replaceAll(".txt", "");
16 | if (! n.startsWith("._")) {
17 | seen.add(n);
18 | }
19 | }
20 | }
21 | }
22 | for (String n : seen) {
23 | System.out.println(n);
24 | }
25 | }
26 | }
27 |
--------------------------------------------------------------------------------
/tika-addons/tika-pipes-reporter/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
5 |
6 | tika-addons
7 | org.tallison
8 | 1.0.0-SNAPSHOT
9 |
10 | 4.0.0
11 |
12 | tika-pipes-reporter
13 |
14 |
15 | 11
16 | 11
17 |
18 |
19 |
20 |
21 | org.postgresql
22 | postgresql
23 |
24 |
25 | org.apache.tika
26 | tika-core
27 | provided
28 |
29 |
30 |
31 |
32 |
33 |
34 | maven-shade-plugin
35 | ${maven.shade.version}
36 |
37 |
38 | package
39 |
40 | shade
41 |
42 |
43 |
44 | false
45 |
46 |
47 |
48 | *:*
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
--------------------------------------------------------------------------------
/tika-addons/tika-server-fuzzer/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
5 |
6 | tika-addons
7 | org.tallison
8 | 1.0.0-SNAPSHOT
9 |
10 | 4.0.0
11 |
12 | tika-server-fuzzer
13 |
14 |
15 | 14
16 | 14
17 |
18 |
19 |
20 |
21 | org.apache.tika
22 | tika-core
23 | ${tika.version}
24 |
25 |
26 | org.apache.tika
27 | tika-fuzzing
28 | ${tika.version}
29 |
30 |
31 | org.apache.tika
32 | tika-serialization
33 | ${tika.version}
34 |
35 |
36 | org.apache.cxf
37 | cxf-rt-rs-client
38 | ${cxf.version}
39 |
40 |
41 |
42 |
--------------------------------------------------------------------------------
/tika-containers/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
5 |
6 | file-observatory
7 | org.tallison
8 | 1.0.0-SNAPSHOT
9 |
10 | 4.0.0
11 |
12 | tika-containers
13 | pom
14 |
15 | tika-pdftotext
16 | tika-pdfchecker
17 | tika-pdfspelunker
18 | tika-pdfjs
19 | tika-arlington
20 | tika-pipes-pdfinfo
21 | tika-pipes-siegfried
22 |
23 |
24 |
25 |
--------------------------------------------------------------------------------
/tika-containers/tika-arlington/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM amd64/openjdk:11.0.8-slim-buster as GRAMMAR_CHECKER_BUILDER
2 |
3 | RUN apt-get update && apt-get install curl g++-8 gcc-8 cmake git -y
4 |
5 | RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-8 800 --slave /usr/bin/g++ g++ /usr/bin/g++-8
6 |
7 | RUN git clone https://github.com/pdf-association/arlington-pdf-model /arlington-pdf-model && \
8 | cd /arlington-pdf-model && git checkout fab5b58
9 |
10 | RUN cd /arlington-pdf-model/TestGrammar && \
11 | cmake -B cmake-linux/debug -DPDFSDK_PDFIUM=ON -DCMAKE_BUILD_TYPE=Debug . && \
12 | cmake --build cmake-linux/debug --config Debug
13 |
14 | #-f makes curl fail the build on an HTTP error instead of saving the error page as the jar; -L follows redirects
15 | RUN mkdir /tika-bin && cd /tika-bin && \
16 |     curl -fSL https://repo1.maven.org/maven2/org/apache/tika/tika-server-core/2.4.1/tika-server-core-2.4.1.jar --output tika-server-core.jar
17 |
18 | FROM amd64/openjdk:11.0.8-slim-buster
19 |
20 | COPY --from=GRAMMAR_CHECKER_BUILDER /arlington-pdf-model/TestGrammar/bin/linux /arlington-pdf-model/bin
21 |
22 | COPY --from=GRAMMAR_CHECKER_BUILDER /arlington-pdf-model/tsv/latest /arlington-pdf-model/tsv/latest
23 |
24 | RUN mkdir /tika-bin
25 | COPY --from=GRAMMAR_CHECKER_BUILDER /tika-bin/tika-server-core.jar /tika-bin/tika-server-core.jar
26 | COPY my-tika-config.xml /tika-bin/my-tika-config.xml
27 |
28 | #once we upgrade to > tika 2.4.1, we can get rid of this custom regex parser
29 | COPY target/tika-arlington-1.0.0-SNAPSHOT.jar /tika-bin/tika-arlington.jar
30 |
31 | ENTRYPOINT ["java","-cp","/tika-bin/*", "org.apache.tika.server.core.TikaServerCli", "-h", "0.0.0.0", "-c", "/tika-bin/my-tika-config.xml"]
32 |
--------------------------------------------------------------------------------
/tika-containers/tika-arlington/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
5 |
6 | tika-containers
7 | org.tallison
8 | 1.0.0-SNAPSHOT
9 |
10 | 4.0.0
11 |
12 | tika-arlington
13 |
14 |
15 |
16 | org.apache.tika
17 | tika-core
18 | provided
19 |
20 |
21 |
22 |
23 |
24 | org.apache.maven.plugins
25 | maven-shade-plugin
26 | ${maven.shade.version}
27 |
28 |
29 | package
30 |
31 | shade
32 |
33 |
34 |
35 | false
36 |
37 |
38 |
39 | *:*
40 |
41 |
42 |
43 |
44 | org.tallison.observatory.RegexCaptureParser
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
--------------------------------------------------------------------------------
/tika-containers/tika-exiftool/Dockerfile:
--------------------------------------------------------------------------------
1 | #slight modification from: https://github.com/Miljar/exiftool-docker/blob/master/Dockerfile
2 | FROM amd64/openjdk:11.0.8-slim-buster
3 | ENV EXIFTOOL_VERSION=12.38
4 | ENV TIKA_VERSION=2.2.1
5 |
6 | #the base image is Debian buster, so packages come from apt-get (apk only exists on Alpine);
7 | #wget is installed explicitly rather than assumed to be present in the slim image
8 | RUN apt-get update \
9 |     && apt-get install -y --no-install-recommends ca-certificates make perl wget \
10 |     && rm -rf /var/lib/apt/lists/*
11 | #NOTE(review): confirm this host still serves the pinned ExifTool release
12 | RUN cd /tmp \
13 |     && wget http://www.sno.phy.queensu.ca/~phil/exiftool/Image-ExifTool-${EXIFTOOL_VERSION}.tar.gz \
14 |     && tar -zxvf Image-ExifTool-${EXIFTOOL_VERSION}.tar.gz \
15 |     && cd Image-ExifTool-${EXIFTOOL_VERSION} \
16 |     && perl Makefile.PL \
17 |     && make test \
18 |     && make install \
19 |     && cd .. \
20 |     && rm -rf Image-ExifTool-${EXIFTOOL_VERSION}
21 |
22 | #variable references must be written ${NAME}; "{$NAME}" expands to "{2.2.1}" and yields a wrong URL
23 | RUN mkdir /tika-bin \
24 |     && cd /tika-bin \
25 |     && wget https://repo1.maven.org/maven2/org/apache/tika/tika-server-core/${TIKA_VERSION}/tika-server-core-${TIKA_VERSION}.jar
26 |
27 | COPY my-tika-config.xml /tika-bin/my-tika-config.xml
28 | ENTRYPOINT ["java","-cp","/tika-bin/*", "org.apache.tika.server.core.TikaServerCli", "-h", "0.0.0.0", "-c", "/tika-bin/my-tika-config.xml"]
29 |
30 | #e.g.
31 | #docker run -d -p 9998:9998
--------------------------------------------------------------------------------
/tika-containers/tika-pdfchecker/Dockerfile:
--------------------------------------------------------------------------------
1 | #slight modification from:
2 | #https://stackoverflow.com/questions/61272431/installing-poppler-utils-of-version-0-82-in-docker
3 | #See also: https://github.com/quarkness/docker-poppler/blob/master/Dockerfile
4 | FROM amd64/openjdk:11.0.8-slim-buster as POPPLER_BUILDER
5 | RUN mkdir /pdfchecker-bin
6 |
7 | COPY pdf-checker.tgz /pdfchecker-bin/pdf-checker.tgz
8 | RUN cd /pdfchecker-bin && tar -xzvf pdf-checker.tgz
9 |
10 | RUN mkdir /tika-bin
11 | COPY target/tika-pdfchecker-1.0.0-SNAPSHOT.jar /tika-bin/tika-pdfchecker-1.0.0-SNAPSHOT.jar
12 |
13 | #find a more elegant way of grabbing this after we release it
14 | COPY tika-server-core-2.0.0-SNAPSHOT.jar /tika-bin/tika-server-core-2.0.0-SNAPSHOT.jar
15 | COPY my-tika-config.xml /tika-bin/my-tika-config.xml
16 | ENTRYPOINT ["java","-cp","/tika-bin/*", "org.apache.tika.server.core.TikaServerCli", "-h", "0.0.0.0", "-c", "/tika-bin/my-tika-config.xml"]
17 |
18 | #e.g.
19 | #docker run -d -p 9998:9998
--------------------------------------------------------------------------------
/tika-containers/tika-pdfchecker/my-tika-config.xml:
--------------------------------------------------------------------------------
1 |
2 |
18 |
19 |
20 |
21 |
22 | 120000
23 |
24 |
25 |
26 |
27 | 9998
28 | 180000
29 | false
30 | 10000000
31 |
32 | -Xmx2g
33 |
34 |
35 | rmeta
36 |
37 |
38 |
39 |
--------------------------------------------------------------------------------
/tika-containers/tika-pdfchecker/src/main/resources/META-INF/services/org.apache.tika.parser.Parser:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one or more
2 | # contributor license agreements. See the NOTICE file distributed with
3 | # this work for additional information regarding copyright ownership.
4 | # The ASF licenses this file to You under the Apache License, Version 2.0
5 | # (the "License"); you may not use this file except in compliance with
6 | # the License. You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | org.tallison.tika.parsers.pdfchecker.PDFChecker
--------------------------------------------------------------------------------
/tika-containers/tika-pdfchecker/src/test/java/TikaPDFToTextTest.java:
--------------------------------------------------------------------------------
1 | import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
2 | import org.apache.cxf.jaxrs.client.WebClient;
3 | import org.apache.tika.TikaTest;
4 | import org.apache.tika.metadata.Metadata;
5 | import org.apache.tika.metadata.serialization.JsonMetadataList;
6 | import org.junit.Ignore;
7 | import org.junit.Test;
8 |
9 | import javax.ws.rs.core.Response;
10 | import java.io.InputStream;
11 | import java.io.InputStreamReader;
12 | import java.io.Reader;
13 | import java.util.List;
14 |
15 | import static java.nio.charset.StandardCharsets.UTF_8;
16 | import static org.junit.Assert.assertEquals;
17 |
18 | /**
19 |  * Integration test that PUTs a PDF to the {@code /rmeta} endpoint at {@code localhost:9998}
20 |  * and checks the returned metadata. Ignored by default.
21 |  */
22 | public class TikaPDFToTextTest extends TikaTest {
23 |     private static final String END_POINT = "http://localhost:9998";
24 |     private static final String META_PATH = "/rmeta";
25 |
26 |     /** Expects exactly one metadata object whose {@code pc_summary_info} is "born-digital". */
27 |     @Test
28 |     @Ignore("once container is running")
29 |     public void testBasic() throws Exception {
30 |         Response response;
31 |         // close the request body once it has been sent
32 |         try (InputStream pdf =
33 |                      ClassLoader.getSystemResourceAsStream("test-documents/testPDF.pdf")) {
34 |             response = WebClient
35 |                     .create(END_POINT + META_PATH)
36 |                     .accept("application/json")
37 |                     .acceptEncoding("gzip")
38 |                     .put(pdf);
39 |         }
40 |
41 |         // the server may or may not compress the reply; unwrap gzip only when it says so
42 |         InputStream entity = (InputStream) response.getEntity();
43 |         if ("gzip".equals(response.getHeaderString("content-encoding"))) {
44 |             entity = new GzipCompressorInputStream(entity);
45 |         }
46 |
47 |         List<Metadata> metadataList;
48 |         // closing the reader also closes the underlying response stream
49 |         try (Reader reader = new InputStreamReader(entity, UTF_8)) {
50 |             metadataList = JsonMetadataList.fromJson(reader);
51 |         }
52 |         assertEquals(1, metadataList.size());
53 |         assertEquals("born-digital", metadataList.get(0).get("pc_summary_info"));
54 |     }
55 | }
56 |
--------------------------------------------------------------------------------
/tika-containers/tika-pdfchecker/src/test/resources/test-documents/testPDF.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tballison/file-observatory/5677da227b1a70b1220ae05005b6064210858504/tika-containers/tika-pdfchecker/src/test/resources/test-documents/testPDF.pdf
--------------------------------------------------------------------------------
/tika-containers/tika-pdfchecker/tika-server-core-2.0.0-SNAPSHOT.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tballison/file-observatory/5677da227b1a70b1220ae05005b6064210858504/tika-containers/tika-pdfchecker/tika-server-core-2.0.0-SNAPSHOT.jar
--------------------------------------------------------------------------------
/tika-containers/tika-pdfium/my-args.gn:
--------------------------------------------------------------------------------
1 | # Set build arguments here. See `gn help buildargs`.
2 |
3 | # need this to build pdfium_test
4 | pdf_is_standalone = true
5 |
6 | #other options are commented out below
7 | #use_goma = true # Googlers only. Make sure goma is installed and running first.
8 | #is_debug = true # Enable debugging features.
9 |
10 | # Set true to enable experimental Skia backend.
11 | #pdf_use_skia = false
12 | # Set true to enable experimental Skia backend (paths only).
13 | #pdf_use_skia_paths = false
14 |
15 | #pdf_enable_xfa = true # Set false to remove XFA support (implies JS support).
16 | #pdf_enable_v8 = true # Set false to remove Javascript support.
17 | #is_component_build = false # Disable component build (Though it should work)
--------------------------------------------------------------------------------
/tika-containers/tika-pdfjs-selenium/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
5 |
6 | tika-containers
7 | org.tallison
8 | 1.0.0-SNAPSHOT
9 |
10 | 4.0.0
11 |
12 | tika-pdfjs-selenium
13 |
14 |
15 | 11
16 | 11
17 | 3.141.59
18 |
19 |
20 |
23 |
24 |
25 | org.seleniumhq.selenium
26 | selenium-api
27 | ${selenium.version}
28 |
29 |
30 | org.seleniumhq.selenium
31 | selenium-remote-driver
32 | ${selenium.version}
33 |
34 |
35 | org.seleniumhq.selenium
36 | selenium-server
37 | ${selenium.version}
38 |
39 |
40 |
41 |
--------------------------------------------------------------------------------
/tika-containers/tika-pdfjs-selenium/src/main/java/FirefoxSeleniumExample.java:
--------------------------------------------------------------------------------
1 | import org.openqa.selenium.firefox.FirefoxBinary;
2 | import org.openqa.selenium.firefox.FirefoxDriver;
3 | import org.openqa.selenium.firefox.FirefoxOptions;
4 |
5 | /**
6 |  * Minimal example that starts a headless Firefox through Selenium and shuts it down again.
7 |  */
8 | public class FirefoxSeleniumExample {
9 |
10 |     /**
11 |      * Launches headless Firefox. The geckodriver location is read from the
12 |      * {@code webdriver.gecko.driver} system property when it is already set; otherwise the
13 |      * developer-machine default below is used.
14 |      *
15 |      * @param args ignored
16 |      */
17 |     public static void main(String[] args) {
18 |         FirefoxBinary firefoxBinary = new FirefoxBinary();
19 |         firefoxBinary.addCommandLineOptions("--headless");
20 |         // do not clobber a path supplied with -Dwebdriver.gecko.driver=...
21 |         if (System.getProperty("webdriver.gecko.driver") == null) {
22 |             System.setProperty("webdriver.gecko.driver", "/Users/allison/tools/firefox/geckodriver");
23 |         }
24 |         FirefoxOptions firefoxOptions = new FirefoxOptions();
25 |         firefoxOptions.setBinary(firefoxBinary);
26 |         FirefoxDriver driver = new FirefoxDriver(firefoxOptions);
27 |         try {
28 |             // drive the browser here
29 |         } finally {
30 |             // without quit() the browser and driver processes stay alive after main returns
31 |             driver.quit();
32 |         }
33 |     }
34 | }
35 |
--------------------------------------------------------------------------------
/tika-containers/tika-pdfjs/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM node:16.13.0
2 |
3 | #make sure you have enough memory to build this --memory=900
4 | RUN npm install -g gulp-cli
5 |
6 | #Option A: grab and build a specific release
7 | #RUN apt-get update && apt-get -y install wget openjdk-11-jre
8 | #RUN mkdir /builddir && cd /builddir && \
9 | # wget https://github.com/mozilla/pdf.js/archive/refs/tags/v2.11.338.tar.gz && \
10 | # tar -xzvf v2.11.338.tar.gz && mv pdf.js-2.11.338 pdf.js && \
11 | # cd pdf.js && npm install && gulp dist-install && \
12 | # rm /builddir/v2.11.338.tar.gz
13 |
14 | #Option B: build from main
15 | RUN apt-get update && apt-get -y install git openjdk-11-jre
16 | RUN mkdir /builddir && cd /builddir && \
17 | git clone https://github.com/mozilla/pdf.js && cd pdf.js && \
18 | npm install && gulp dist-install
19 |
20 | COPY js/my-getinfo.js /builddir/pdf.js/examples/node/my-getinfo.js
21 |
22 | # TODO: figure out a two-stage build and what we can jettison for a smaller container
23 |
24 | RUN mkdir /tika-bin/
25 | COPY target/tika-pdfjs-1.0.0-SNAPSHOT.jar /tika-bin/
26 | #find a more elegant way of grabbing this after we release it
27 | COPY tika-server-standard-2.1.1-SNAPSHOT.jar /tika-bin/
28 | COPY my-tika-config.xml /tika-bin/my-tika-config.xml
29 | ENTRYPOINT ["java","-cp","/tika-bin/*", "org.apache.tika.server.core.TikaServerCli", "-h", "0.0.0.0", "-c", "/tika-bin/my-tika-config.xml"]
30 |
--------------------------------------------------------------------------------
/tika-containers/tika-pdfjs/src/test/resources/test-documents/test-basic.txt:
--------------------------------------------------------------------------------
1 | # Document Loaded
2 | Number of Pages: 4
3 |
4 | # Metadata Is Loaded
5 | ## Info
6 | {
7 | "PDFFormatVersion": "1.5",
8 | "Language": "en-US",
9 | "EncryptFilterName": null,
10 | "IsLinearized": false,
11 | "IsAcroFormPresent": false,
12 | "IsXFAPresent": false,
13 | "IsCollectionPresent": false,
14 | "IsSignaturesPresent": false,
15 | "Producer": "Microsoft® Word 2016",
16 | "Creator": "Microsoft® Word 2016",
17 | "CreationDate": "D:20210421211209+00'00'",
18 | "ModDate": "D:20210421211209+00'00'"
19 | }
20 |
21 | # Page 1
22 | Size: 612x792
23 |
24 | Warning: TT: undefined function: 32
25 | Warning: fetchStandardFontData: failed to fetch file "FoxitSans.pfb" with "UnknownErrorException: The standard font "baseUrl" parameter must be specified, ensure that the "standardFontDataUrl" API parameter is provided.".
26 | Warning: fetchStandardFontData: failed to fetch file "FoxitSerif.pfb" with "UnknownErrorException: The standard font "baseUrl" parameter must be specified, ensure that the "standardFontDataUrl" API parameter is provided.".
27 | ## Text Content
28 | here is some page 1 content
29 |
30 | # Page 2
31 | Size: 612x792
32 |
33 | ## Text Content
34 | some page 2 content
35 |
36 | # Page 3
37 | Size: 612x792
38 |
39 | ## Text Content
40 | Some page 3 content
41 |
42 | # Page 4
43 | Size: 612x792
44 |
45 | Warning: fetchStandardFontData: failed to fetch file "FoxitSerifBold.pfb" with "UnknownErrorException: The standard font "baseUrl" parameter must be specified, ensure that the "standardFontDataUrl" API parameter is provided.".
46 | ## Text Content
47 | Some more text
48 |
49 | # End of Document
50 |
--------------------------------------------------------------------------------
/tika-containers/tika-pdfjs/src/test/resources/test-documents/test-xmp.txt:
--------------------------------------------------------------------------------
1 | # Random Key: 765668851
2 | # Document Loaded key=765668851
3 | # Number of Pages: 2 key=765668851
4 |
5 | # Metadata Is Loaded key=765668851
6 | ## Info key=765668851
7 | {
8 | "PDFFormatVersion": "1.6",
9 | "Language": null,
10 | "EncryptFilterName": null,
11 | "IsLinearized": true,
12 | "IsAcroFormPresent": false,
13 | "IsXFAPresent": false,
14 | "IsCollectionPresent": false,
15 | "IsSignaturesPresent": false,
16 | "CreationDate": "D:20210402144320-04'00'",
17 | "Creator": "PScript5.dll Version 5.2.2",
18 | "ModDate": "D:20210402154701-04'00'",
19 | "Producer": "Acrobat Distiller 20.0 (Windows)",
20 | "Title": "18-956 Google LLC v. Oracle America, Inc. (04/05/2021)"
21 | }
22 |
23 | ## Metadata key=765668851
24 | {
25 | "xmp:modifydate": "2021-04-02T15:47:01-04:00",
26 | "xmp:createdate": "2021-04-02T14:43:20-04:00",
27 | "xmp:metadatadate": "2021-04-02T15:47:01-04:00",
28 | "xmp:creatortool": "PScript5.dll Version 5.2.2",
29 | "dc:format": "application/pdf",
30 | "dc:creator": [],
31 | "dc:title": "18-956 Google LLC v. Oracle America, Inc. (04/05/2021)",
32 | "xmpmm:documentid": "uuid:1cd7d060-dd8f-463c-bfa8-18072b031ff2",
33 | "xmpmm:instanceid": "uuid:327587b5-f503-4f7a-b4b2-444c4ead47ad",
34 | "pdf:producer": "Acrobat Distiller 20.0 (Windows)"
35 | }
36 |
37 | # Page 1 key=765668851
38 | # Size: 612x792 key=765668851
39 |
40 | Info: TT: CALL empty stack (or invalid entry).
41 | Info: TT: CALL empty stack (or invalid entry).
42 | Info: TT: CALL empty stack (or invalid entry).
43 | Info: TT: CALL empty stack (or invalid entry).
44 | Warning: fetchStandardFontData: failed to fetch file "FoxitSerifBold.pfb" with "UnknownErrorException: The standard font "baseUrl" parameter must be specified, ensure that the "standardFontDataUrl" API parameter is provided.".
45 | Info: page=1 - getTextContent: time=141ms
46 | ## Text Content key=765668851
47 | page 1 content
48 |
49 | # Page 2 key=765668851
50 | # Size: 612x792 key=765668851
51 |
52 | Info: page=2 - getTextContent: time=33ms
53 | ## Text Content key=765668851
54 | page 2 content
55 |
56 | # End of Document key=765668851
--------------------------------------------------------------------------------
/tika-containers/tika-pdfjs/src/test/resources/test-documents/test-xmp2.txt:
--------------------------------------------------------------------------------
1 | # Random Key: 367480315
2 | # Document Loaded key=367480315
3 | # Number of Pages: 1 key=367480315
4 |
5 | # Metadata Is Loaded key=367480315
6 |
7 | ## Info key=367480315
8 | {
9 | "PDFFormatVersion": "1.5",
10 | "IsLinearized": false,
11 | "IsAcroFormPresent": true,
12 | "IsXFAPresent": false,
13 | "Trapped": {
14 | "name": "False"
15 | },
16 | "Custom": {
17 | "PTEX.Fullbanner": "This is pdfTeX, Version 3.14159265-2.6-1.40.18 (TeX Live 2017/Debian) kpathsea version 6.2.3"
18 | }
19 | }
20 |
21 | # Page 1 key=367480315
22 | # Size: 595.276x841.89 key=367480315
23 | Info: page=1 - getTextContent: time=40ms
24 | ## Text Content key=367480315
25 | Name Copy Reset
26 | # End of Document key=367480315
27 |
--------------------------------------------------------------------------------
/tika-containers/tika-pdfspelunker/src/main/resources/META-INF/services/org.apache.tika.parser.Parser:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one or more
2 | # contributor license agreements. See the NOTICE file distributed with
3 | # this work for additional information regarding copyright ownership.
4 | # The ASF licenses this file to You under the Apache License, Version 2.0
5 | # (the "License"); you may not use this file except in compliance with
6 | # the License. You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | org.tallison.tika.parsers.pdf.PDFSpelunker
--------------------------------------------------------------------------------
/tika-containers/tika-pdfspelunker/src/main/resources/org/apache/tika/mime/custom-mimetypes.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
--------------------------------------------------------------------------------
/tika-containers/tika-pdfspelunker/src/test/java/org/tallison/tika/parsers/image/ICCImageParserTest.java:
--------------------------------------------------------------------------------
1 | package org.tallison.tika.parsers.image;
2 |
3 | import java.io.InputStream;
4 |
5 | import org.junit.Test;
6 |
7 | import org.apache.tika.TikaTest;
8 | import org.apache.tika.config.TikaConfig;
9 | import org.apache.tika.parser.AutoDetectParser;
10 | import org.apache.tika.parser.Parser;
11 |
12 | public class ICCImageParserTest extends TikaTest {
13 | // Smoke test only: parses baseball.jpg with the classpath config /config/my-tika-config.xml and passes the metadata to debug(); nothing is asserted.
14 | @Test
15 | public void testBasic() throws Exception {
16 | try (InputStream is = this.getClass().getResourceAsStream("/config/my-tika-config.xml")) {
17 | Parser p = new AutoDetectParser(new TikaConfig(is));
18 | debug(getRecursiveMetadata("baseball.jpg", p));
19 | }
20 | }
21 | }
22 |
--------------------------------------------------------------------------------
/tika-containers/tika-pdfspelunker/src/test/resources/test-documents/baseball.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tballison/file-observatory/5677da227b1a70b1220ae05005b6064210858504/tika-containers/tika-pdfspelunker/src/test/resources/test-documents/baseball.jpg
--------------------------------------------------------------------------------
/tika-containers/tika-pdfspelunker/src/test/resources/test-documents/icc-reports/non-compliant1.txt:
--------------------------------------------------------------------------------
1 | Profile: 'data/iccs/7c/68/7c68fd34c873bf7db8faa3a1133d176a7c92a88f8a05d482d406857ee212ce98'
2 | Profile ID: e798cc1d9f659a6155ac35ad9ac383bb
3 | Size: 1829077(0x1be8d5) bytes
4 |
5 | Header
6 | ------
7 | Attributes: Reflective | Glossy
8 | Cmm: Heidelberg
9 | Creation Date: 2/28/2007 08:00:00
10 | Creator: 'HDM ' = 48444D20
11 | Data Color Space: CmykData
12 | Flags EmbeddedProfileFalse | UseAnywhere
13 | PCS Color Space: LabData
14 | Platform: Unknown
15 | Rendering Intent: Relative Colorimetric
16 | Profile Class: OutputClass
17 | Profile SubClass: Not Defined
18 | Version: 2.40
19 | Illuminant: X=0.9642, Y=1.0000, Z=0.8249
20 | Spectral PCS: NoSpectralData
21 | Spectral PCS Range: Not Defined
22 | BiSpectral Range: Not Defined
23 | MCS Color Space: Not Defined
24 |
25 | Profile Tags
26 | ------------
27 | Tag ID Offset Size Pad
28 | ---- ------ ------ ---- ---
29 | copyrightTag 'cprt' 288 103 1
30 | mediaWhitePointTag 'wtpt' 392 20 0
31 | AToB0Tag 'A2B0' 412 396852 0
32 | BToA0Tag 'B2A0' 397264 291132 0
33 | gamutTag 'gamt' 688396 33840 0
34 | AToB1Tag 'A2B1' 722236 396852 0
35 | BToA1Tag 'B2A1' 1119088 291132 0
36 | AToB2Tag 'A2B2' 412 396852 0
37 | BToA2Tag 'B2A2' 1410220 291132 0
38 | grayTRCTag 'kTRC' 1701352 524 0
39 | Unknown 'hd10' = 68643130 'hd10' 1701876 364 0
40 | profileDescriptionTag 'desc' 1702240 152 0
41 | charTargetTag 'targ' 1702392 126685 0
42 |
43 |
44 | Validation Report
45 | -----------------
46 | Profile violates ICC specification
47 |
48 | Warning! - OutputClassTag exclusion test failed.
49 | Warning! - Unknown 'hd10' = 68643130: - Unknown Tag.
50 | NonCompliant! - File size is not a multiple of 4 bytes (last tag needs padding?).
--------------------------------------------------------------------------------
/tika-containers/tika-pdfspelunker/src/test/resources/test-documents/icc-reports/not-icc1.txt:
--------------------------------------------------------------------------------
1 | Unable to parse 'data/blah.tgz' as ICC profile!
2 |
3 | Validation Report
4 | -----------------
5 | Profile has Critical Error(s) that violate ICC specification.
6 |
7 | Error! - - Unable to read profile!**
8 | Profile has invalid structure!
--------------------------------------------------------------------------------
/tika-containers/tika-pdfspelunker/src/test/resources/test-documents/icc-reports/not-icc2.txt:
--------------------------------------------------------------------------------
1 | Unable to parse 'data/iccs/86/20/862090af4442059ff416679acb001ae23acc18852f2dc430d0845c061b937e9c' as ICC profile!
2 |
3 | Validation Report
4 | -----------------
5 | Profile has Critical Error(s) that violate ICC specification.
6 |
7 | NonCompliant! - Bad Header File Size
8 | Error! - - AToB0Tag - Tag has invalid structure!
9 | Error! - - AToB1Tag - Tag has invalid structure!
10 | Error! - - AToB2Tag - Tag has invalid structure!
11 | Error! - - BToA0Tag - Tag has invalid structure!
12 | Error! - - BToA1Tag - Tag has invalid structure!
13 | Error! - - BToA2Tag - Tag has invalid structure!
14 | Error! - - gamutTag - Tag has invalid structure!
15 | Error! - - Unknown 'AS00' = 41533030 - Tag has invalid structure!
--------------------------------------------------------------------------------
/tika-containers/tika-pdfspelunker/src/test/resources/test-documents/testPDF.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tballison/file-observatory/5677da227b1a70b1220ae05005b6064210858504/tika-containers/tika-pdfspelunker/src/test/resources/test-documents/testPDF.pdf
--------------------------------------------------------------------------------
/tika-containers/tika-pdftotext/Dockerfile:
--------------------------------------------------------------------------------
1 | #slight modification from:
2 | #https://stackoverflow.com/questions/61272431/installing-poppler-utils-of-version-0-82-in-docker
3 | #See also: https://github.com/quarkness/docker-poppler/blob/master/Dockerfile
4 | FROM amd64/openjdk:11.0.8-slim-buster as POPPLER_BUILDER
5 | #poppler/data pairs
6 | #21.02.0/0.4.10
7 | #20.09.0/0.4.9
8 | #0.86.1/0.4.9
9 |
10 | RUN apt-get update && apt-get install bash wget build-essential cmake libfreetype6-dev pkg-config libfontconfig-dev libjpeg-dev libopenjp2-7-dev -y
11 | RUN wget https://poppler.freedesktop.org/poppler-data-0.4.11.tar.gz \
12 | && tar -xf poppler-data-0.4.11.tar.gz \
13 | && cd poppler-data-0.4.11 \
14 | && make install \
15 | && cd .. \
16 | && wget https://poppler.freedesktop.org/poppler-21.11.0.tar.xz \
17 | && tar -xf poppler-21.11.0.tar.xz \
18 | && cd poppler-21.11.0 \
19 | && mkdir build \
20 | && cd build \
21 | && cmake -DENABLE_BOOST=OFF .. \
22 | && make \
23 | && make install \
24 | && ldconfig
25 | #CMD tail -f /dev/null
26 |
27 | FROM amd64/openjdk:11.0.8-slim-buster
28 | COPY --from=POPPLER_BUILDER /usr/lib /usr/lib
29 | COPY --from=POPPLER_BUILDER /usr/local /usr/local
30 |
31 | RUN apt-get update && apt-get install bash ca-certificates \
32 | libjpeg62-turbo libcairo2 libxml2 \
33 | fontconfig liblcms2-2 \
34 | libtiff5 -y
35 | # &&\
36 | #libopenjpeg5
37 | #libstdc++6 && \
38 | #addgroup -S appgroup && \
39 | #adduser -S appuser -G appgroup -h /work && \
40 | #echo "/usr/local/lib64:/lib:/usr/local/lib:/usr/lib:/usr/lib/x86_64-linux-gnu:/lib64:/lib/x86_64-linux-gnu" > /etc/ld-musl-x86_64.path
41 |
42 | RUN mkdir /tika-bin
43 |
44 | #find a more elegant way of grabbing this after we release it
45 | COPY tika-server-standard-2.1.1-SNAPSHOT.jar /tika-bin/tika-server-standard-2.1.1-SNAPSHOT.jar
46 | COPY my-tika-config.xml /tika-bin/my-tika-config.xml
47 | ENTRYPOINT ["java","-cp","/tika-bin/*", "org.apache.tika.server.core.TikaServerCli", "-h", "0.0.0.0", "-c", "/tika-bin/my-tika-config.xml"]
48 |
49 | #e.g.
50 | #docker run -d -p 9998:9998
--------------------------------------------------------------------------------
/tika-containers/tika-pipes-pdfinfo/Dockerfile:
--------------------------------------------------------------------------------
1 | #slight modification from:
2 | #https://stackoverflow.com/questions/61272431/installing-poppler-utils-of-version-0-82-in-docker
3 | #See also: https://github.com/quarkness/docker-poppler/blob/master/Dockerfile
4 | FROM debian:bullseye-20230227-slim as POPPLER_BUILDER
5 | ## can't build w 22.01.0 with the current dependencies...need to figure out how to
6 | # migrate to 22.x
7 | ENV POPPLER_VERSION=23.03.0
8 | ENV POPPLER_DATA_VERSION=0.4.12
9 |
10 | RUN apt-get update && apt-get install locales bash wget build-essential cmake libfreetype6-dev pkg-config \
11 | libfontconfig-dev libjpeg-dev libopenjp2-7-dev \
12 | #these are for temurin
13 | apt-transport-https gnupg -y
14 | RUN wget https://poppler.freedesktop.org/poppler-data-${POPPLER_DATA_VERSION}.tar.gz \
15 | && tar -xf poppler-data-${POPPLER_DATA_VERSION}.tar.gz \
16 | && cd poppler-data-${POPPLER_DATA_VERSION} \
17 | && make install \
18 | && cd .. \
19 | && wget https://poppler.freedesktop.org/poppler-${POPPLER_VERSION}.tar.xz \
20 | && tar -xf poppler-${POPPLER_VERSION}.tar.xz \
21 | && cd poppler-${POPPLER_VERSION} \
22 | && mkdir build \
23 | && cd build \
24 | && cmake -DENABLE_BOOST=OFF ..\
25 | && make \
26 | && make install \
27 | && ldconfig
28 |
29 | RUN wget -O - https://packages.adoptium.net/artifactory/api/gpg/key/public | apt-key add - \
30 | && echo "deb https://packages.adoptium.net/artifactory/deb $(awk -F= '/^VERSION_CODENAME/{print$2}' /etc/os-release) main" | tee /etc/apt/sources.list.d/adoptium.list \
31 | && apt-get update && apt-get install temurin-11-jre -y
32 |
33 | RUN mkdir /tika-bin
34 | COPY target/tika-pipes-pdfinfo-1.0.0-SNAPSHOT.jar /tika-bin
35 | COPY log4j2.xml /tika-bin
36 | COPY pipes-log4j2.xml /tika-bin
37 |
38 |
39 | ENV LANG en_US.UTF-8
40 | ENV LANGUAGE en_US:en
41 | ENV LC_ALL en_US.UTF-8
42 |
43 | ENTRYPOINT ["java","-Dlog4j.configurationFile=/tika-bin/log4j2.xml", "-jar","/tika-bin/tika-pipes-pdfinfo-1.0.0-SNAPSHOT.jar"]
44 | #need to specify tika-config.xml on commandline, e.g.:
45 | #docker run -v /Users/blah/Desktop:/data -v /Users/blah/Desktop/config:/tika-config -p 2345:2345
46 | #--name tika-pipes-container tika-pipes-pdfinfo /tika-config/my-tika-config.xml
47 |
48 | #WORKDIR /work
49 |
50 |
--------------------------------------------------------------------------------
/tika-containers/tika-pipes-pdfinfo/log4j2.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
--------------------------------------------------------------------------------
/tika-containers/tika-pipes-pdfinfo/pipes-log4j2.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
--------------------------------------------------------------------------------
/tika-containers/tika-pipes-siegfried/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM golang:1.20.2-bullseye
2 |
3 |
4 | RUN apt-get update && apt-get install file \
5 | #these are for temurin
6 | apt-transport-https gnupg -y
7 | RUN go install github.com/richardlehane/siegfried/cmd/sf@latest && sf -update
8 |
9 | RUN wget -O - https://packages.adoptium.net/artifactory/api/gpg/key/public | apt-key add - \
10 | && echo "deb https://packages.adoptium.net/artifactory/deb $(awk -F= '/^VERSION_CODENAME/{print$2}' /etc/os-release) main" | tee /etc/apt/sources.list.d/adoptium.list \
11 | && apt-get update && apt-get install temurin-11-jre -y
12 |
13 | RUN mkdir /tika-bin
14 | COPY target/tika-pipes-siegfried-1.0.0-SNAPSHOT.jar /tika-bin
15 | COPY log4j2.xml /tika-bin
16 | COPY pipes-log4j2.xml /tika-bin
17 |
18 |
19 | ENV LANG en_US.UTF-8
20 | ENV LANGUAGE en_US:en
21 | ENV LC_ALL en_US.UTF-8
22 |
23 | ENTRYPOINT ["java","-Dlog4j.configurationFile=/tika-bin/log4j2.xml", "-jar","/tika-bin/tika-pipes-siegfried-1.0.0-SNAPSHOT.jar"]
24 | #need to specify tika-config.xml on commandline, e.g.:
25 | #docker run -v /Users/blah/Desktop:/data -v /Users/blah/Desktop/config:/tika-config -p 2345:2345
26 | #--name tika-pipes-container tika-pipes-siegfried /tika-config/my-tika-config.xml
27 |
28 | #WORKDIR /work
29 |
30 |
--------------------------------------------------------------------------------
/tika-containers/tika-pipes-siegfried/log4j2.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
--------------------------------------------------------------------------------
/tika-containers/tika-pipes-siegfried/pipes-log4j2.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
--------------------------------------------------------------------------------
/tika-containers/tika-pypdf2/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.10.4-slim-buster
2 |
3 | RUN apt-get update && \
4 | apt-get install -y --no-install-recommends \
5 | openjdk-11-jre
6 |
7 | #TODO
8 | RUN python -m pip install --upgrade pip && pip install pypdf2==2.1.0
9 |
10 | RUN mkdir /pypdf2cli
11 | COPY scripts/PyPDF2Cli.py /pypdf2cli
12 | RUN chmod a+x /pypdf2cli/PyPDF2Cli.py
13 |
14 | RUN mkdir /tika-bin
15 |
16 | #find a more elegant way of grabbing this after we release it
17 | COPY tika-server-standard-2.1.1-SNAPSHOT.jar /tika-bin/tika-server-standard-2.1.1-SNAPSHOT.jar
18 | COPY my-tika-config.xml /tika-bin/my-tika-config.xml
19 | ENTRYPOINT ["java","-cp","/tika-bin/*", "org.apache.tika.server.core.TikaServerCli", "-h", "0.0.0.0", "-c", "/tika-bin/my-tika-config.xml"]
20 |
--------------------------------------------------------------------------------
/tika-containers/tika-pypdf2/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
5 |
6 | file-observatory
7 | org.tallison
8 | 1.0.0-SNAPSHOT
9 |
10 | 4.0.0
11 |
12 | tika-pypdf2
13 |
14 |
15 | 11
16 | 11
17 |
18 |
19 |
--------------------------------------------------------------------------------
/tika-containers/tika-pypdf2/scripts/PyPDF2Cli.py:
--------------------------------------------------------------------------------
1 | import sys
2 | # Usage: PyPDF2Cli.py <input.pdf> <output.txt>  (argv[1] is read, argv[2] is written)
3 | from PyPDF2 import PdfReader
4 |
5 | reader = PdfReader(sys.argv[1])
6 |
7 | # Extract the text of each page in order and write it to the UTF-8 output file
8 | with open(sys.argv[2], "w", encoding="utf-8") as output:
9 | for page in reader.pages:
10 | output.write(page.extract_text())
11 | output.write("\n")
12 |
--------------------------------------------------------------------------------
/tool-runners/arlington/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM amd64/openjdk:11.0.8-slim-buster as GRAMMAR_CHECKER_BUILDER
2 |
3 | RUN apt-get update && apt-get install g++-8 gcc-8 cmake git -y
4 |
5 | RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-8 800 --slave /usr/bin/g++ g++ /usr/bin/g++-8
6 |
7 | RUN git clone https://github.com/pdf-association/arlington-pdf-model /arlington-pdf-model && \
8 | cd /arlington-pdf-model && git checkout 908a7be
9 |
10 | RUN cd /arlington-pdf-model/TestGrammar && \
11 | cmake -B cmake-linux/debug -DPDFSDK_PDFIUM=ON -DCMAKE_BUILD_TYPE=Debug . && \
12 | cmake --build cmake-linux/debug --config Debug
13 |
14 |
15 | FROM amd64/openjdk:11.0.8-slim-buster
16 |
17 | COPY --from=GRAMMAR_CHECKER_BUILDER /arlington-pdf-model/TestGrammar/bin/linux /arlington-pdf-model/bin
18 |
19 | COPY --from=GRAMMAR_CHECKER_BUILDER /arlington-pdf-model/tsv/latest /arlington-pdf-model/tsv/latest
20 |
21 | COPY target/arlington-1.0.0-SNAPSHOT.jar /arlington-1.0.0-SNAPSHOT.jar
22 |
23 |
24 | ENTRYPOINT ["java","-jar","/arlington-1.0.0-SNAPSHOT.jar"]
25 | #WORKDIR /work
26 | # for debugging
27 | # docker run -it --entrypoint /bin/bash --name a2 -v /Users/.../Desktop/tool-runner-work:/data 806db3cdfa81
28 |
29 |
--------------------------------------------------------------------------------
/tool-runners/arlington/env.properties:
--------------------------------------------------------------------------------
1 | TIKA_CONFIG=/config/file-obs-tika.xml
2 | #if on windows or mac, use host.docker.internal instead of localhost
3 | #make sure to include the table name after the final :
4 | METADATA_WRITER_STRING=jdbc:postgresql://host.docker.internal:5432/exploratory?user=qwertyuiop&password=password1234
5 | NUM_THREADS=20
6 | IS_DELTA=true
--------------------------------------------------------------------------------
/tool-runners/arlington/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Root logger option
2 | log4j.rootLogger=info, stdout
3 |
4 | # Direct log messages to stdout
5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
6 | log4j.appender.stdout.Target=System.out
7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
9 |
--------------------------------------------------------------------------------
/tool-runners/caradoc/Dockerfile:
--------------------------------------------------------------------------------
1 | #slight modification from:
2 | FROM debian:stretch as CARADOC_BUILDER
3 | RUN apt-get update &&\
4 | apt-get install -y\
5 | ocaml\
6 | opam\
7 | zlib1g-dev\
8 | libgmp-dev\
9 | pkg-config\
10 | m4\
11 | zlib1g-dev\
12 | ocaml-findlib\
13 | libcryptokit-ocaml-dev\
14 | libounit-ocaml-dev\
15 | libcurses-ocaml-dev\
16 | menhir &&\
17 | git clone --depth=1 --single-branch https://github.com/caradoc-org/caradoc.git
18 | WORKDIR /caradoc
19 | RUN make
20 |
21 |
22 | FROM amd64/openjdk:11.0.8-slim-buster
23 | COPY --from=CARADOC_BUILDER /caradoc/_build/src/main.native /usr/local/bin/caradoc
24 | # Install dependencies for caradoc binary
25 | RUN apt-get update &&\
26 | apt-get install -y\
27 | libtinfo5\
28 | libncursesw5
29 |
30 |
31 | COPY target/caradoc-1.0.0-SNAPSHOT.jar /caradoc-1.0.0-SNAPSHOT.jar
32 | ENTRYPOINT ["java","-jar","/caradoc-1.0.0-SNAPSHOT.jar"]
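33 | #e.g. (the examples below were copied from the mutool runners; substitute the caradoc image and jar names)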
34 | #debug: docker run -it --entrypoint /bin/bash mutooltotext-container
35 | # docker build -t mutool-clean-image .
36 | # docker run -i -t --name mutool-clean-container -v ~/data/input:/input:ro -v ~/data/output:/output mutool-clean-image /opt/java/openjdk/bin/java -jar /mutoolclean-1.0.0-SNAPSHOT.jar /input /output/table.csv 10
--------------------------------------------------------------------------------
/tool-runners/caradoc/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Root logger option
2 | log4j.rootLogger=INFO, stdout
3 |
4 | # Direct log messages to stdout
5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
6 | log4j.appender.stdout.Target=System.out
7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
9 |
--------------------------------------------------------------------------------
/tool-runners/clamav/Dockerfile:
--------------------------------------------------------------------------------
1 | #slight modification from:
2 | #https://github.com/mko-x/docker-clamav/blob/master/alpine/main/Dockerfile
3 | FROM alpine:3.12
4 | LABEL maintainer="Markus Kosmal "
5 |
6 | RUN apk add --no-cache openjdk11 bash clamav clamav-daemon rsyslog wget clamav-libunrar
7 |
8 | COPY conf /etc/clamav
9 |
10 | RUN mkdir /var/run/clamav && \
11 | chown clamav:clamav /var/run/clamav && \
12 | chmod 750 /var/run/clamav
13 | #&& \
14 | #chown -R clamav:clamav bootstrap.sh check.sh /etc/clamav && \
15 | #chmod u+x bootstrap.sh check.sh
16 |
17 | RUN /usr/bin/freshclam
18 | #EXPOSE 3310/tcp
19 |
20 | COPY target/clamav-1.0.0-SNAPSHOT.jar /clamav-1.0.0-SNAPSHOT.jar
21 | COPY exec.sh /exec.sh
22 | RUN ["chmod", "+x", "/exec.sh"]
23 | CMD ["/exec.sh"]
24 |
--------------------------------------------------------------------------------
/tool-runners/clamav/conf/clam.conf:
--------------------------------------------------------------------------------
1 | ###############
2 | # General
3 | ###############
4 |
5 | DatabaseDirectory /var/lib/clamav
6 | TemporaryDirectory /tmp
7 | LogTime yes
8 | PidFile /run/clamav/clamd.pid
9 | LocalSocket /run/clamav/clamd.sock
10 | TCPSocket 3310
11 | Foreground no
12 |
13 | ###############
14 | # Results
15 | ###############
16 |
17 | DetectPUA yes
18 | ExcludePUA NetTool
19 | ExcludePUA PWTool
20 | AlgorithmicDetection yes
21 | Bytecode yes
22 |
23 | ###############
24 | # Scan
25 | ###############
26 |
27 | ScanPE yes
28 | DisableCertCheck yes
29 | ScanELF yes
30 | AlertBrokenExecutables yes
31 | ScanOLE2 yes
32 | ScanPDF yes
33 | ScanSWF yes
34 | ScanMail yes
35 | PhishingSignatures yes
36 | PhishingScanURLs yes
37 | ScanHTML yes
38 | ScanArchive yes
39 |
40 | ###############
41 | # Limits
42 | ###############
43 |
44 | MaxScanSize 300M
45 | MaxFileSize 100M
46 | MaxRecursion 30
47 | MaxFiles 50000
48 | MaxEmbeddedPE 40M
49 | MaxHTMLNormalize 40M
50 | MaxHTMLNoTags 2M
51 | MaxScriptNormalize 5M
52 | MaxZipTypeRcg 1M
53 | MaxPartitions 128
54 | MaxIconsPE 200
55 | PCREMatchLimit 10000
56 | PCRERecMatchLimit 10000
--------------------------------------------------------------------------------
/tool-runners/clamav/conf/freshclam.conf:
--------------------------------------------------------------------------------
1 | ###############
2 | # General
3 | ###############
4 |
5 | DatabaseDirectory /var/lib/clamav
6 | LogSyslog yes
7 | LogTime yes
8 | PidFile /run/clamav/freshclam.pid
9 |
10 | ###############
11 | # Updates
12 | ###############
13 |
14 | DatabaseMirror database.clamav.net
15 | ScriptedUpdates yes
16 | NotifyClamd /etc/clamav/clamd.conf
17 | SafeBrowsing yes
18 | Bytecode yes
--------------------------------------------------------------------------------
/tool-runners/clamav/exec.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Entrypoint: apply optional config overrides, start clamd, then run the file-processing jar.
3 | # copied from: https://github.com/mko-x/docker-clamav/blob/master/alpine/main/bootstrap.sh
4 | set -e
5 | # Optional override: if FRESHCLAM_CONF_FILE is set, back up the bundled freshclam.conf and replace it.
6 | if [[ -n "${FRESHCLAM_CONF_FILE}" ]]; then
7 | echo "[bootstrap] FRESHCLAM_CONF_FILE set, copy to /etc/clamav/freshclam.conf"
8 | mv /etc/clamav/freshclam.conf /etc/clamav/freshclam.conf.bak
9 | cp -f "${FRESHCLAM_CONF_FILE}" /etc/clamav/freshclam.conf
10 | fi
11 | # Optional override: must target clam.conf, because that is the file clamd is started with below.
12 | if [[ -n "${CLAMD_CONF_FILE}" ]]; then
13 | echo "[bootstrap] CLAMD_CONF_FILE set, copy to /etc/clamav/clam.conf"
14 | mv /etc/clamav/clam.conf /etc/clamav/clam.conf.bak
15 | cp -f "${CLAMD_CONF_FILE}" /etc/clamav/clam.conf
16 | fi
17 |
18 | MAIN_FILE="/var/lib/clamav/main.cvd"
19 |
20 | #if [ ! -f ${MAIN_FILE} ]; then
21 | # echo "[bootstrap] Initial clam DB download."
22 | # /usr/bin/freshclam
23 | #fi
24 |
25 | #echo "[bootstrap] Schedule freshclam DB updater."
26 | #/usr/bin/freshclam -d -c 6
27 | # Start clamd with the (possibly overridden) config, then process the files with the runner jar.
28 | echo "[bootstrap] Run clamav daemon..."
29 | /usr/sbin/clamd -c /etc/clamav/clam.conf
30 | echo "[bootstrap] process the files!"
31 | java -jar /clamav-1.0.0-SNAPSHOT.jar
--------------------------------------------------------------------------------
/tool-runners/clamav/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Root logger option
2 | log4j.rootLogger=INFO, stdout
3 |
4 | # Direct log messages to stdout
5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
6 | log4j.appender.stdout.Target=System.out
7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
9 |
--------------------------------------------------------------------------------
/tool-runners/env.properties:
--------------------------------------------------------------------------------
1 | TIKA_CONFIG=/config/file-obs-tika.xml
2 | #if on windows or mac, use host.docker.internal instead of localhost
3 | #make sure to include the table name after the final :
4 | METADATA_WRITER_STRING=jdbc:postgresql://host.docker.internal:5432/exploratory?user=qwertyuiop&password=password1234
5 | NUM_THREADS=20
6 | IS_DELTA=true
--------------------------------------------------------------------------------
/tool-runners/fileprofiler/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM amd64/openjdk:11.0.8-slim-buster
2 |
3 | COPY target/fileprofiler-1.0.0-SNAPSHOT.jar /fileprofiler-1.0.0-SNAPSHOT.jar
4 |
5 | ENTRYPOINT ["java","-jar","/fileprofiler-1.0.0-SNAPSHOT.jar"]
6 |
--------------------------------------------------------------------------------
/tool-runners/fileprofiler/README.txt:
--------------------------------------------------------------------------------
1 | Load basic provenance information -- file size, shasum, collection
--------------------------------------------------------------------------------
/tool-runners/fileprofiler/src/main/resources/log4j2.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
--------------------------------------------------------------------------------
/tool-runners/gstotext/Dockerfile:
--------------------------------------------------------------------------------
1 |
2 | FROM amd64/openjdk:11.0.8-slim-buster
3 | RUN apt-get update && apt-get install wget -y
4 | # &&\
5 | #libopenjpeg5
6 | #libstdc++6 && \
7 | #addgroup -S appgroup && \
8 | #adduser -S appuser -G appgroup -h /work && \
9 | #echo "/usr/local/lib64:/lib:/usr/local/lib:/usr/lib:/usr/lib/x86_64-linux-gnu:/lib64:/lib/x86_64-linux-gnu" > /etc/ld-musl-x86_64.path
10 | RUN wget https://github.com/ArtifexSoftware/ghostpdl-downloads/releases/download/gs9550/ghostscript-9.55.0-linux-x86_64.tgz \
11 | && tar -xf ghostscript-9.55.0-linux-x86_64.tgz
12 |
13 | COPY target/gstotext-1.0.0-SNAPSHOT.jar /gstotext-1.0.0-SNAPSHOT.jar
14 |
15 | ENTRYPOINT ["java","-jar","/gstotext-1.0.0-SNAPSHOT.jar"]
16 | #WORKDIR /work
17 |
18 |
--------------------------------------------------------------------------------
/tool-runners/gstotext/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Root logger option
2 | log4j.rootLogger=INFO, stdout
3 |
4 | # Direct log messages to stdout
5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
6 | log4j.appender.stdout.Target=System.out
7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
9 |
--------------------------------------------------------------------------------
/tool-runners/itext/README.md:
--------------------------------------------------------------------------------
1 | This wrapper of iText's parser requires a commercial license key.
2 |
3 | This code was neither written nor used under the AGPL license.
4 |
5 | Many thanks to iText for granting a custom evaluation license for this project.
--------------------------------------------------------------------------------
/tool-runners/itext/src/main/resources/META-INF/services/org.apache.tika.parser.Parser:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one or more
2 | # contributor license agreements. See the NOTICE file distributed with
3 | # this work for additional information regarding copyright ownership.
4 | # The ASF licenses this file to You under the Apache License, Version 2.0
5 | # (the "License"); you may not use this file except in compliance with
6 | # the License. You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | org.tallison.tika.parser.itext.ITextParser
--------------------------------------------------------------------------------
/tool-runners/itext/src/test/resources/test-documents/testPDF.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tballison/file-observatory/5677da227b1a70b1220ae05005b6064210858504/tool-runners/itext/src/test/resources/test-documents/testPDF.pdf
--------------------------------------------------------------------------------
/tool-runners/mutoolclean/Dockerfile:
--------------------------------------------------------------------------------
1 | #slight modification from:
2 | #https://github.com/jay-eff/mutool/blob/master/Dockerfile
3 | FROM alpine:3 as MUTOOL_BUILDER
4 | MAINTAINER Jens Fischer
5 |
6 | # install necessary packages and compile MuPDF, clean up afterwards
7 | # include bash for debugging the build only
8 |
9 | #get tags from here: http://git.ghostscript.com/?p=mupdf.git;a=summary
10 | #versions 1.19.0 1.18.0 1.17.0, 1.16.1, 1.16.0, 1.15.0, 1.14.0, 1.13.0, 1.12.0, 1.11.1
11 | ENV MUTOOL_VERSION 1.19.0
12 | RUN apk add --no-cache \
13 | git \
14 | make \
15 | pkgconfig \
16 | build-base \
17 | bash \
18 | && git clone -b ${MUTOOL_VERSION} https://github.com/ArtifexSoftware/mupdf \
19 | && cd mupdf \
20 | && git submodule update --init \
21 | && make HAVE_X11=no HAVE_GLUT=no prefix=/usr/local install \
22 | && cd / \
23 | && rm -r mupdf \
24 | && apk del \
25 | git \
26 | make \
27 | pkgconfig \
28 | build-base
29 |
30 | FROM adoptopenjdk/openjdk11:alpine-slim
31 | COPY --from=MUTOOL_BUILDER /usr/local/bin /usr/local/bin
32 | COPY --from=MUTOOL_BUILDER /lib /lib
33 |
34 | COPY target/mutoolclean-1.0.0-SNAPSHOT.jar /mutoolclean-1.0.0-SNAPSHOT.jar
35 | ENTRYPOINT ["java","-jar","/mutoolclean-1.0.0-SNAPSHOT.jar"]
36 |
37 | #e.g.
38 | #debug: docker run -it --entrypoint /bin/bash mutool-clean-image
39 | # docker build -t mutool-clean-image .
40 | # docker run -i -t --name mutool-clean-container -v ~/data/input:/input:ro -v ~/data/output:/output mutool-clean-image /opt/java/openjdk/bin/java -jar /mutoolclean-1.0.0-SNAPSHOT.jar /input /output/table.csv 10
--------------------------------------------------------------------------------
/tool-runners/mutoolclean/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Root logger option
2 | log4j.rootLogger=INFO, stdout
3 |
4 | # Direct log messages to stdout
5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
6 | log4j.appender.stdout.Target=System.out
7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
9 |
--------------------------------------------------------------------------------
/tool-runners/mutooltext/Dockerfile:
--------------------------------------------------------------------------------
1 | #slight modification from:
2 | #https://github.com/jay-eff/mutool/blob/master/Dockerfile
3 | FROM alpine:3 as MUPDF_BUILDER
4 | MAINTAINER Jens Fischer
5 |
6 | # install necessary packages and compile MuPDF, clean up afterwards
7 | # include bash for debugging the build only
8 |
9 | #get tags from here: http://git.ghostscript.com/?p=mupdf.git;a=summary
10 | #versions 1.19.0 1.18.0 1.17.0, 1.16.1, 1.16.0, 1.15.0, 1.14.0, 1.13.0, 1.12.0, 1.11.1
11 | ENV MUTOOL_VERSION 1.19.0
12 | RUN apk add --no-cache \
13 | git \
14 | make \
15 | pkgconfig \
16 | build-base \
17 | bash \
18 | && git clone -b ${MUTOOL_VERSION} https://github.com/ArtifexSoftware/mupdf \
19 | && cd mupdf \
20 | && git submodule update --init \
21 | && make HAVE_X11=no HAVE_GLUT=no prefix=/usr/local install \
22 | && cd / \
23 | && rm -r mupdf \
24 | && apk del \
25 | git \
26 | make \
27 | pkgconfig \
28 | build-base
29 |
30 | FROM adoptopenjdk/openjdk11:alpine-slim
31 | COPY --from=MUPDF_BUILDER /usr/local/bin /usr/local/bin
32 | COPY --from=MUPDF_BUILDER /lib /lib
33 |
34 | COPY target/mutooltext-1.0.0-SNAPSHOT.jar /mutooltext-1.0.0-SNAPSHOT.jar
35 | ENTRYPOINT ["java","-jar","/mutooltext-1.0.0-SNAPSHOT.jar"]
36 | #RUN apk update && apk add bash
37 | # e.g.
38 | # docker build -t mutool-text-image .
39 | # docker run -i -t --name mutool-text-container -v ~/data/input:/input:ro -v ~/data/output:/output mutool-text-image /opt/java/openjdk/bin/java -jar /mutooltext-1.0.0-SNAPSHOT.jar /input /output/txt /output/table.csv 10
40 |
--------------------------------------------------------------------------------
/tool-runners/mutooltext/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Root logger option
2 | log4j.rootLogger=INFO, stdout
3 |
4 | # Direct log messages to stdout
5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
6 | log4j.appender.stdout.Target=System.out
7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
9 |
--------------------------------------------------------------------------------
/tool-runners/pdfbytes/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM amd64/openjdk:11.0.8-slim-buster
2 | # Add the runner jar from the build context's target/ directory; the build fails if it is not there.
3 | COPY target/pdfbytes-1.0.0-SNAPSHOT.jar /pdfbytes-1.0.0-SNAPSHOT.jar
4 | # Exec-form entrypoint: arguments given to `docker run` after the image name are appended to this command.
5 | ENTRYPOINT ["java","-jar","/pdfbytes-1.0.0-SNAPSHOT.jar"]
6 |
--------------------------------------------------------------------------------
/tool-runners/pdfbytes/src/test/java/org/tallison/pdfutils/TestVersionUnpacker.java:
--------------------------------------------------------------------------------
1 | package org.tallison.pdfutils;
2 |
3 |
4 | import org.apache.tika.io.TikaInputStream;
5 | import org.junit.Test;
6 |
7 | import java.io.ByteArrayInputStream;
8 | import java.io.InputStream;
9 | import java.nio.charset.StandardCharsets;
10 | import java.nio.file.Path;
11 | import java.nio.file.Paths;
12 | import java.util.ArrayList;
13 | import java.util.List;
14 |
15 | public class TestVersionUnpacker {
16 | /** Smoke test: resolves the /pdf-puzzle.pdf test resource and prints PDFByteSniffer's JSON for it; nothing is asserted. */
17 | @Test
18 | public void testVersions() throws Exception {
19 | Path p = Paths.get(TestVersionUnpacker.class.getResource("/pdf-puzzle.pdf").toURI());
20 | System.out.println(PDFByteSniffer.getJson(p));
21 | }
22 | /** Searches one stream twice for "%%EOF"; the input is a partial "%%%EO" prefix followed by two full "%%EOF" markers. Results are only printed, not asserted. */
23 | @Test
24 | public void testBackTracking() throws Exception {
25 | byte[] string = "%%%EO%%EOF%%EOF".getBytes(StandardCharsets.UTF_8);
26 | byte[] pattern = "%%EOF".getBytes(StandardCharsets.UTF_8);
27 | StreamSearcher streamSearcher = new StreamSearcher(pattern);
28 | InputStream is = new ByteArrayInputStream(string);
29 | System.out.println(streamSearcher.search(is));
30 | System.out.println(streamSearcher.search(is)); // same stream instance, not reset between the two calls
31 | }
32 |
33 | }
34 |
--------------------------------------------------------------------------------
/tool-runners/pdfbytes/src/test/resources/pdf-puzzle.pdf:
--------------------------------------------------------------------------------
1 | %PDF-1.1
2 |
3 | 1 0 obj
4 | <<
5 | /Type /Catalog
6 | /Outlines 2 0 R
7 | /Pages 3 0 R
8 | >>
9 | endobj
10 |
11 | 2 0 obj
12 | <<
13 | /Type /Outlines
14 | /Count 0
15 | >>
16 | endobj
17 |
18 | 3 0 obj
19 | <<
20 | /Type /Pages
21 | /Kids [4 0 R]
22 | /Count 1
23 | >>
24 | endobj
25 |
26 | 4 0 obj
27 | <<
28 | /Type /Page
29 | /Parent 3 0 R
30 | /MediaBox [0 0 612 792]
31 | /Contents 5 0 R
32 | /Resources <<
33 | /ProcSet [/PDF /Text]
34 | /Font << /F1 6 0 R >>
35 | >>
36 | >>
37 | endobj
38 |
39 | 5 0 obj
40 | <<
41 | /Length 89
42 | /Filter /ASCII85Decode
43 | >>
44 | stream
45 | 6<#'\7PQ#@1a#b0+>GQ(+?(u.+B2ko-rakk+E1b1F)Yf5@<6!&BlbCgDI[]uD.RU,@;I&dE+EC!ATK:C<,*OE;u~>
46 | endstream
47 | endobj
48 |
49 | 6 0 obj
50 | <<
51 | /Type /Font
52 | /Subtype /Type1
53 | /Name /F1
54 | /BaseFont /Helvetica
55 | /Encoding /MacRomanEncoding
56 | >>
57 | endobj
58 |
59 | xref
60 | 0 7
61 | 0000000000 65535 f
62 | 0000000012 00000 n
63 | 0000000089 00000 n
64 | 0000000145 00000 n
65 | 0000000214 00000 n
66 | 0000000419 00000 n
67 | 0000000594 00000 n
68 | trailer
69 | <<
70 | /Size 7
71 | /Root 1 0 R
72 | >>
73 | startxref
74 | 718
75 | %%EOF
76 |
77 | 5 0 obj
78 | <<
79 | /Length 89
80 | /Filter /ASCII85Decode
81 | >>
82 | stream
83 | 6<#'\7PQ#@1a#b0+>GQ(+?(u.+B2ko-rakk+E1b1F)Yf5@<6!&BlbD!=BJ[-=BJ[-=BJ[-=BJ[-=BI!p<,*OE;u~>
84 | endstream
85 | endobj
86 |
87 | xref
88 | 0 1
89 | 0000000000 65535 f
90 | 5 1
91 | 0000000935 00000 n
92 | trailer
93 | <<
94 | /Size 7
95 | /Root 1 0 R
96 | /Prev 718
97 | >>
98 | startxref
99 | 1110
100 | %%EOF
101 |
--------------------------------------------------------------------------------
/tool-runners/pdfchecker/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM amd64/openjdk:11.0.8-slim-buster
2 |
3 | #wrapper around: https://www.datalogics.com/products/pdf-tools/pdf-checker/
4 | #you need to accept the license, install the tool on linux, and then tgz the
5 | #installed binary directory
6 |
7 | #I'm not including pdf-checker.tgz in my repo because of license
8 | #requirements
9 |
10 | RUN mkdir /pdfchecker-bin
11 |
12 | COPY pdf-checker.tgz /pdfchecker-bin/pdf-checker.tgz
13 | RUN cd /pdfchecker-bin && tar -xzvf pdf-checker.tgz
14 |
15 | COPY target/pdfchecker-1.0.0-SNAPSHOT.jar /pdfchecker-1.0.0-SNAPSHOT.jar
16 | # to run against a single file:
17 | #/pdfchecker-bin/PDF_Checker/pdfchecker -j /pdfchecker-bin/PDF_Checker/CheckerProfiles/everything.json -i -s
18 | ENTRYPOINT ["java","-jar","/pdfchecker-1.0.0-SNAPSHOT.jar"]
--------------------------------------------------------------------------------
/tool-runners/pdfchecker/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Root logger option
2 | log4j.rootLogger=INFO, stdout
3 |
4 | # Direct log messages to stdout
5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
6 | log4j.appender.stdout.Target=System.out
7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
9 |
--------------------------------------------------------------------------------
/tool-runners/pdfcpu/Dockerfile:
--------------------------------------------------------------------------------
1 | # Dockerfile References: https://docs.docker.com/engine/reference/builder/
2 |
3 | # Start from a golang base image
4 | FROM golang:1.16.6 as builder
5 |
6 | # install
7 |
8 | #RUN go get github.com/pdfcpu/pdfcpu/cmd/...
9 | RUN git clone -b v0.3.12 --depth 1 https://github.com/pdfcpu/pdfcpu /pdfcpu
10 | RUN cd /pdfcpu && git checkout tags/v0.3.12 -b v0.3.12-tag
11 | #WORKDIR $GOPATH/src/github.com/pdfcpu/pdfcpu/cmd/pdfcpu
12 | RUN cd /pdfcpu/cmd/pdfcpu && CGO_ENABLED=0 GOOS=linux go build -a -o pdfcpu .
13 |
14 | ######## Start a new stage from scratch #######
15 |
16 | FROM alpine:latest
17 |
18 | RUN apk --no-cache add ca-certificates openjdk11
19 |
20 | WORKDIR /root/
21 |
22 | # Copy the Pre-built binary file from the previous stage
23 | COPY --from=builder /pdfcpu/cmd/pdfcpu .
24 |
25 | # Command to run the executable
26 | #CMD ["./pdfcpu"]
27 |
28 | COPY target/pdfcpu-1.0.0-SNAPSHOT.jar /pdfcpu-1.0.0-SNAPSHOT.jar
29 |
30 | ENTRYPOINT ["java","-jar","/pdfcpu-1.0.0-SNAPSHOT.jar"]
31 |
--------------------------------------------------------------------------------
/tool-runners/pdfcpu/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Root logger option
2 | log4j.rootLogger=INFO, stdout
3 |
4 | # Direct log messages to stdout
5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
6 | log4j.appender.stdout.Target=System.out
7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
9 |
--------------------------------------------------------------------------------
/tool-runners/pdffonts/Dockerfile:
--------------------------------------------------------------------------------
1 | #slight modification from:
2 | #https://stackoverflow.com/questions/61272431/installing-poppler-utils-of-version-0-82-in-docker
3 | #See also: https://github.com/quarkness/docker-poppler/blob/master/Dockerfile
4 | FROM debian:buster-20211220-slim as POPPLER_BUILDER
5 | ## can't build w 22.01.0 with the current dependencies...need to figure out how to
6 | # migrate to 22.x
7 | ENV POPPLER_VERSION=21.12.0
8 | ENV POPPLER_DATA_VERSION=0.4.11
9 | RUN apt-get update && apt-get install bash wget build-essential cmake libfreetype6-dev pkg-config libfontconfig-dev libjpeg-dev libopenjp2-7-dev -y
10 | RUN wget https://poppler.freedesktop.org/poppler-data-${POPPLER_DATA_VERSION}.tar.gz \
11 | && tar -xf poppler-data-${POPPLER_DATA_VERSION}.tar.gz \
12 | && cd poppler-data-${POPPLER_DATA_VERSION} \
13 | && make install \
14 | && cd .. \
15 | && wget https://poppler.freedesktop.org/poppler-${POPPLER_VERSION}.tar.xz \
16 | && tar -xf poppler-${POPPLER_VERSION}.tar.xz \
17 | && cd poppler-${POPPLER_VERSION} \
18 | && mkdir build \
19 | && cd build \
20 | && cmake -DENABLE_BOOST=OFF ..\
21 | && make \
22 | && make install \
23 | && ldconfig
24 |
25 | FROM amd64/openjdk:11.0.8-slim-buster
26 | COPY --from=POPPLER_BUILDER /usr/lib /usr/lib
27 | COPY --from=POPPLER_BUILDER /usr/local /usr/local
28 |
29 | RUN apt-get update && apt-get install bash ca-certificates \
30 | libjpeg62-turbo libcairo2 libxml2 \
31 | fontconfig liblcms2-2 \
32 | libtiff5 -y
33 | # &&\
34 | #libopenjpeg5
35 | #libstdc++6 && \
36 | #addgroup -S appgroup && \
37 | #adduser -S appuser -G appgroup -h /work && \
38 | #echo "/usr/local/lib64:/lib:/usr/local/lib:/usr/lib:/usr/lib/x86_64-linux-gnu:/lib64:/lib/x86_64-linux-gnu" > /etc/ld-musl-x86_64.path
39 |
40 | COPY target/pdffonts-1.0.0-SNAPSHOT.jar /pdffonts-1.0.0-SNAPSHOT.jar
41 |
42 |
43 | ENTRYPOINT ["java","-jar","/pdffonts-1.0.0-SNAPSHOT.jar"]
44 | #WORKDIR /work
45 |
46 |
--------------------------------------------------------------------------------
/tool-runners/pdffonts/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Root logger option
2 | log4j.rootLogger=INFO, stdout
3 |
4 | # Direct log messages to stdout
5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
6 | log4j.appender.stdout.Target=System.out
7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
9 |
--------------------------------------------------------------------------------
/tool-runners/pdfid/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.9.1-slim-buster
2 | # Python base image; a Java 11 runtime and git are installed on top of it below.
3 | #TODO make more efficient by factoring out a build w git, etc
4 | RUN mkdir -p /usr/share/man/man1 /usr/share/man/man2 /pdfid
5 | # NOTE(review): the man directories above presumably exist so the openjdk-11-jre install succeeds on the slim image -- confirm
6 | RUN apt-get update && \
7 | apt-get install -y --no-install-recommends \
8 | openjdk-11-jre git
9 | # Clone DidierStevensSuite into /pdfid/didierstevens and pin it to a fixed commit.
10 | RUN cd /pdfid && \
11 | git clone https://github.com/DidierStevens/DidierStevensSuite.git didierstevens && \
12 | cd /pdfid/didierstevens && \
13 | git checkout 5f81a8f7a8aac15b580413f6f3a2ec3d72c5d10c
14 | # Add the runner jar from the build context's target/ directory; the build fails if it is not there.
15 | COPY target/pdfid-1.0.0-SNAPSHOT.jar /pdfid-1.0.0-SNAPSHOT.jar
16 | # Exec-form entrypoint: arguments given to `docker run` after the image name are appended to this command.
17 | ENTRYPOINT ["java","-jar","/pdfid-1.0.0-SNAPSHOT.jar"]
18 |
19 | #for debugging
20 | #docker run -it --entrypoint /bin/bash <image-name>
21 |
22 |
--------------------------------------------------------------------------------
/tool-runners/pdfid/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Root logger option
2 | log4j.rootLogger=INFO, stdout
3 |
4 | # Direct log messages to stdout
5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
6 | log4j.appender.stdout.Target=System.out
7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
9 |
--------------------------------------------------------------------------------
/tool-runners/pdfimages/Dockerfile:
--------------------------------------------------------------------------------
1 | #slight modification from:
2 | #https://stackoverflow.com/questions/61272431/installing-poppler-utils-of-version-0-82-in-docker
3 | #See also: https://github.com/quarkness/docker-poppler/blob/master/Dockerfile
4 | FROM debian:buster-20211220-slim as POPPLER_BUILDER
5 | ## can't build w 22.01.0 with the current dependencies...need to figure out how to
6 | # migrate to 22.x
7 | ENV POPPLER_VERSION=21.12.0
8 | ENV POPPLER_DATA_VERSION=0.4.11
9 | RUN apt-get update && apt-get install bash wget build-essential cmake libfreetype6-dev pkg-config libfontconfig-dev libjpeg-dev libopenjp2-7-dev -y
10 | RUN wget https://poppler.freedesktop.org/poppler-data-${POPPLER_DATA_VERSION}.tar.gz \
11 | && tar -xf poppler-data-${POPPLER_DATA_VERSION}.tar.gz \
12 | && cd poppler-data-${POPPLER_DATA_VERSION} \
13 | && make install \
14 | && cd .. \
15 | && wget https://poppler.freedesktop.org/poppler-${POPPLER_VERSION}.tar.xz \
16 | && tar -xf poppler-${POPPLER_VERSION}.tar.xz \
17 | && cd poppler-${POPPLER_VERSION} \
18 | && mkdir build \
19 | && cd build \
20 | && cmake -DENABLE_BOOST=OFF ..\
21 | && make \
22 | && make install \
23 | && ldconfig
24 |
25 | FROM amd64/openjdk:11.0.8-slim-buster
26 | COPY --from=POPPLER_BUILDER /usr/lib /usr/lib
27 | COPY --from=POPPLER_BUILDER /usr/local /usr/local
28 |
29 | RUN apt-get update && apt-get install bash ca-certificates \
30 | libjpeg62-turbo libcairo2 libxml2 \
31 | fontconfig liblcms2-2 \
32 | libtiff5 -y
33 | # &&\
34 | #libopenjpeg5
35 | #libstdc++6 && \
36 | #addgroup -S appgroup && \
37 | #adduser -S appuser -G appgroup -h /work && \
38 | #echo "/usr/local/lib64:/lib:/usr/local/lib:/usr/lib:/usr/lib/x86_64-linux-gnu:/lib64:/lib/x86_64-linux-gnu" > /etc/ld-musl-x86_64.path
39 |
40 | COPY target/pdfimages-1.0.0-SNAPSHOT.jar /pdfimages-1.0.0-SNAPSHOT.jar
41 |
42 |
43 | ENTRYPOINT ["java","-jar","/pdfimages-1.0.0-SNAPSHOT.jar"]
44 | #WORKDIR /work
45 |
46 |
--------------------------------------------------------------------------------
/tool-runners/pdfimages/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Root logger option
2 | log4j.rootLogger=DEBUG, stdout
3 |
4 | # Direct log messages to stdout
5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
6 | log4j.appender.stdout.Target=System.out
7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
9 |
--------------------------------------------------------------------------------
/tool-runners/pdfinfo/Dockerfile:
--------------------------------------------------------------------------------
1 | #slight modification from:
2 | #https://stackoverflow.com/questions/61272431/installing-poppler-utils-of-version-0-82-in-docker
3 | #See also: https://github.com/quarkness/docker-poppler/blob/master/Dockerfile
4 | FROM debian:bullseye-20230227-slim as POPPLER_BUILDER
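5 | ## NOTE(review): an earlier note here said 22.01.0 can't be built with the then-current dependencies and that a migration to 22.x was needed;
6 | # this file now pins 23.03.0 on bullseye (see below), so that note looks stale -- confirm before removing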
7 | ENV POPPLER_VERSION=23.03.0
8 | ENV POPPLER_DATA_VERSION=0.4.12
9 | RUN apt-get update && apt-get install bash wget build-essential cmake libfreetype6-dev pkg-config \
10 | libfontconfig-dev libjpeg-dev libopenjp2-7-dev \
11 | #these are for temurin
12 | apt-transport-https gnupg -y
13 | RUN wget https://poppler.freedesktop.org/poppler-data-${POPPLER_DATA_VERSION}.tar.gz \
14 | && tar -xf poppler-data-${POPPLER_DATA_VERSION}.tar.gz \
15 | && cd poppler-data-${POPPLER_DATA_VERSION} \
16 | && make install \
17 | && cd .. \
18 | && wget https://poppler.freedesktop.org/poppler-${POPPLER_VERSION}.tar.xz \
19 | && tar -xf poppler-${POPPLER_VERSION}.tar.xz \
20 | && cd poppler-${POPPLER_VERSION} \
21 | && mkdir build \
22 | && cd build \
23 | && cmake -DENABLE_BOOST=OFF ..\
24 | && make \
25 | && make install \
26 | && ldconfig
27 |
28 | RUN wget -O - https://packages.adoptium.net/artifactory/api/gpg/key/public | apt-key add - \
29 | && echo "deb https://packages.adoptium.net/artifactory/deb $(awk -F= '/^VERSION_CODENAME/{print$2}' /etc/os-release) main" | tee /etc/apt/sources.list.d/adoptium.list \
30 | && apt-get update && apt-get install temurin-11-jre -y
31 |
32 | COPY target/pdfinfo-1.0.0-SNAPSHOT.jar /pdfinfo-1.0.0-SNAPSHOT.jar
33 |
34 |
35 | ENTRYPOINT ["java","-jar","/pdfinfo-1.0.0-SNAPSHOT.jar"]
36 | #WORKDIR /work
37 |
38 |
--------------------------------------------------------------------------------
/tool-runners/pdfinfo/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Root logger option
2 | log4j.rootLogger=INFO, stdout
3 |
4 | # Direct log messages to stdout
5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
6 | log4j.appender.stdout.Target=System.out
7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
9 |
--------------------------------------------------------------------------------
/tool-runners/pdfminerdump/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.9.1-slim-buster
2 | # pdfminer.six is pinned to a fixed release.
3 | RUN pip install pdfminer.six==20201018
4 | # NOTE(review): presumably pre-creates the man directories so the openjdk-11-jre install below succeeds on the slim image -- confirm
5 | RUN mkdir -p /usr/share/man/man1 /usr/share/man/man2
6 | # Install a Java 11 runtime, which the entrypoint below needs to launch the jar.
7 | RUN apt-get update && \
8 | apt-get install -y --no-install-recommends \
9 | openjdk-11-jre
10 | # Add the runner jar from the build context's target/ directory; the build fails if it is not there.
11 | COPY target/pdfminerdump-1.0.0-SNAPSHOT.jar /pdfminerdump-1.0.0-SNAPSHOT.jar
12 | # Exec-form entrypoint: arguments given to `docker run` after the image name are appended to this command.
13 | ENTRYPOINT ["java","-jar","/pdfminerdump-1.0.0-SNAPSHOT.jar"]
14 |
15 |
--------------------------------------------------------------------------------
/tool-runners/pdfminerdump/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Root logger option
2 | log4j.rootLogger=INFO, stdout
3 |
4 | # Direct log messages to stdout
5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
6 | log4j.appender.stdout.Target=System.out
7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
9 |
--------------------------------------------------------------------------------
/tool-runners/pdfminertext/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.9.1-slim-buster
2 | # pdfminer.six is pinned to a fixed release.
3 | RUN pip install pdfminer.six==20201018
4 | # NOTE(review): presumably pre-creates the man directories so the openjdk-11-jre install below succeeds on the slim image -- confirm
5 | RUN mkdir -p /usr/share/man/man1 /usr/share/man/man2
6 | # Install a Java 11 runtime, which the entrypoint below needs to launch the jar.
7 | RUN apt-get update && \
8 | apt-get install -y --no-install-recommends \
9 | openjdk-11-jre
10 | # Add the runner jar from the build context's target/ directory; the build fails if it is not there.
11 | COPY target/pdfminertext-1.0.0-SNAPSHOT.jar /pdfminertext-1.0.0-SNAPSHOT.jar
12 | # Exec-form entrypoint: arguments given to `docker run` after the image name are appended to this command.
13 | ENTRYPOINT ["java","-jar","/pdfminertext-1.0.0-SNAPSHOT.jar"]
14 |
15 |
--------------------------------------------------------------------------------
/tool-runners/pdfminertext/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Root logger option
2 | log4j.rootLogger=INFO, stdout
3 |
4 | # Direct log messages to stdout
5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
6 | log4j.appender.stdout.Target=System.out
7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
9 |
--------------------------------------------------------------------------------
/tool-runners/pdfresurrect/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM amd64/openjdk:11.0.8-slim-buster
2 | # Install the pdfresurrect package from the distribution repositories, without recommended extras.
3 | RUN apt-get update && \
4 | apt-get install -y --no-install-recommends \
5 | pdfresurrect
6 | # Add the runner jar from the build context's target/ directory; the build fails if it is not there.
7 | COPY target/pdfresurrect-1.0.0-SNAPSHOT.jar /pdfresurrect-1.0.0-SNAPSHOT.jar
8 | # Exec-form entrypoint: arguments given to `docker run` after the image name are appended to this command.
9 | ENTRYPOINT ["java","-jar","/pdfresurrect-1.0.0-SNAPSHOT.jar"]
10 |
11 |
--------------------------------------------------------------------------------
/tool-runners/pdfresurrect/env.properties:
--------------------------------------------------------------------------------
1 | TIKA_CONFIG=/config/tika-tika-config.xml
2 | #if on windows or mac, use host.docker.internal instead of localhost
3 | #make sure to include the table name after the final :
4 | METADATA_WRITER_STRING=jdbc:postgresql://host.docker.internal:2345/somedb?user=qwertyuiop&password=qwertyuiop
5 | NUM_THREADS=20
6 | IS_DELTA=false
--------------------------------------------------------------------------------
/tool-runners/pdfresurrect/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Root logger option
2 | log4j.rootLogger=INFO, stdout
3 |
4 | # Direct log messages to stdout
5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
6 | log4j.appender.stdout.Target=System.out
7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
9 |
--------------------------------------------------------------------------------
/tool-runners/pdftoppm/Dockerfile:
--------------------------------------------------------------------------------
1 | #slight modification from:
2 | #https://stackoverflow.com/questions/61272431/installing-poppler-utils-of-version-0-82-in-docker
3 | #See also: https://github.com/quarkness/docker-poppler/blob/master/Dockerfile
4 | FROM debian:buster-20211220-slim as POPPLER_BUILDER
5 | ## can't build w 22.01.0 with the current dependencies...need to figure out how to
6 | # migrate to 22.x
7 | ENV POPPLER_VERSION=21.12.0
8 | ENV POPPLER_DATA_VERSION=0.4.11
9 | RUN apt-get update && apt-get install bash wget build-essential cmake libfreetype6-dev pkg-config libfontconfig-dev libjpeg-dev libopenjp2-7-dev -y
10 | RUN wget https://poppler.freedesktop.org/poppler-data-${POPPLER_DATA_VERSION}.tar.gz \
11 | && tar -xf poppler-data-${POPPLER_DATA_VERSION}.tar.gz \
12 | && cd poppler-data-${POPPLER_DATA_VERSION} \
13 | && make install \
14 | && cd .. \
15 | && wget https://poppler.freedesktop.org/poppler-${POPPLER_VERSION}.tar.xz \
16 | && tar -xf poppler-${POPPLER_VERSION}.tar.xz \
17 | && cd poppler-${POPPLER_VERSION} \
18 | && mkdir build \
19 | && cd build \
20 | && cmake -DENABLE_BOOST=OFF ..\
21 | && make \
22 | && make install \
23 | && ldconfig
24 |
25 | FROM amd64/openjdk:11.0.8-slim-buster
26 | COPY --from=POPPLER_BUILDER /usr/lib /usr/lib
27 | COPY --from=POPPLER_BUILDER /usr/local /usr/local
28 |
29 | RUN apt-get update && apt-get install bash ca-certificates \
30 | libjpeg62-turbo libcairo2 libxml2 \
31 | fontconfig liblcms2-2 \
32 | libtiff5 -y
33 | # &&\
34 | #libopenjpeg5
35 | #libstdc++6 && \
36 | #addgroup -S appgroup && \
37 | #adduser -S appuser -G appgroup -h /work && \
38 | #echo "/usr/local/lib64:/lib:/usr/local/lib:/usr/lib:/usr/lib/x86_64-linux-gnu:/lib64:/lib/x86_64-linux-gnu" > /etc/ld-musl-x86_64.path
39 |
40 | COPY target/pdftoppm-1.0.0-SNAPSHOT.jar /pdftoppm-1.0.0-SNAPSHOT.jar
41 |
42 |
43 | ENTRYPOINT ["java","-jar","/pdftoppm-1.0.0-SNAPSHOT.jar"]
44 | #WORKDIR /work
45 |
--------------------------------------------------------------------------------
/tool-runners/pdftoppm/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Root logger option
2 | log4j.rootLogger=DEBUG, stdout
3 |
4 | # Direct log messages to stdout
5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
6 | log4j.appender.stdout.Target=System.out
7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
9 |
--------------------------------------------------------------------------------
/tool-runners/pdftops/Dockerfile:
--------------------------------------------------------------------------------
1 | #slight modification from:
2 | #https://stackoverflow.com/questions/61272431/installing-poppler-utils-of-version-0-82-in-docker
3 | #See also: https://github.com/quarkness/docker-poppler/blob/master/Dockerfile
4 | FROM debian:buster-20211220-slim as POPPLER_BUILDER
5 | ## can't build w 22.01.0 with the current dependencies...need to figure out how to
6 | # migrate to 22.x
7 | ENV POPPLER_VERSION=21.12.0
8 | ENV POPPLER_DATA_VERSION=0.4.11
9 | RUN apt-get update && apt-get install bash wget build-essential cmake libfreetype6-dev pkg-config libfontconfig-dev libjpeg-dev libopenjp2-7-dev -y
10 | RUN wget https://poppler.freedesktop.org/poppler-data-${POPPLER_DATA_VERSION}.tar.gz \
11 | && tar -xf poppler-data-${POPPLER_DATA_VERSION}.tar.gz \
12 | && cd poppler-data-${POPPLER_DATA_VERSION} \
13 | && make install \
14 | && cd .. \
15 | && wget https://poppler.freedesktop.org/poppler-${POPPLER_VERSION}.tar.xz \
16 | && tar -xf poppler-${POPPLER_VERSION}.tar.xz \
17 | && cd poppler-${POPPLER_VERSION} \
18 | && mkdir build \
19 | && cd build \
20 | && cmake -DENABLE_BOOST=OFF ..\
21 | && make \
22 | && make install \
23 | && ldconfig
24 | #CMD tail -f /dev/null
25 |
26 | FROM amd64/openjdk:11.0.8-slim-buster
27 | COPY --from=POPPLER_BUILDER /usr/lib /usr/lib
28 | COPY --from=POPPLER_BUILDER /usr/local /usr/local
29 |
30 | RUN apt-get update && apt-get install bash ca-certificates \
31 | libjpeg62-turbo libcairo2 libxml2 \
32 | fontconfig liblcms2-2 \
33 | libtiff5 -y
34 | # &&\
35 | #libopenjpeg5
36 | #libstdc++6 && \
37 | #addgroup -S appgroup && \
38 | #adduser -S appuser -G appgroup -h /work && \
39 | #echo "/usr/local/lib64:/lib:/usr/local/lib:/usr/lib:/usr/lib/x86_64-linux-gnu:/lib64:/lib/x86_64-linux-gnu" > /etc/ld-musl-x86_64.path
40 |
41 | COPY target/pdftops-1.0.0-SNAPSHOT.jar /pdftops-1.0.0-SNAPSHOT.jar
42 |
43 |
44 | ENTRYPOINT ["java","-jar","/pdftops-1.0.0-SNAPSHOT.jar"]
45 | #WORKDIR /work
46 |
47 |
--------------------------------------------------------------------------------
/tool-runners/pdftops/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Root logger option
2 | log4j.rootLogger=DEBUG, stdout
3 |
4 | # Direct log messages to stdout
5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
6 | log4j.appender.stdout.Target=System.out
7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
9 |
--------------------------------------------------------------------------------
/tool-runners/pdftotext/Dockerfile:
--------------------------------------------------------------------------------
1 | # Two-stage build: compile poppler from source in a Debian builder stage, then
2 | # copy the result into a JRE image that runs the pdftotext tool-runner jar.
3 | # Slight modification from:
4 | # https://stackoverflow.com/questions/61272431/installing-poppler-utils-of-version-0-82-in-docker
5 | # See also: https://github.com/quarkness/docker-poppler/blob/master/Dockerfile
6 | FROM debian:buster-20211220-slim AS POPPLER_BUILDER
7 | # Can't build 22.01.0 with the current dependencies; need to figure out how to
8 | # migrate to 22.x.
9 | ENV POPPLER_VERSION=21.12.0
10 | ENV POPPLER_DATA_VERSION=0.4.11
11 | RUN apt-get update && apt-get install -y bash wget build-essential cmake libfreetype6-dev pkg-config libfontconfig-dev libjpeg-dev libopenjp2-7-dev
12 | # Install poppler-data first, then configure, build and install poppler itself.
13 | RUN wget https://poppler.freedesktop.org/poppler-data-${POPPLER_DATA_VERSION}.tar.gz \
14 |     && tar -xf poppler-data-${POPPLER_DATA_VERSION}.tar.gz \
15 |     && cd poppler-data-${POPPLER_DATA_VERSION} \
16 |     && make install \
17 |     && cd .. \
18 |     && wget https://poppler.freedesktop.org/poppler-${POPPLER_VERSION}.tar.xz \
19 |     && tar -xf poppler-${POPPLER_VERSION}.tar.xz \
20 |     && cd poppler-${POPPLER_VERSION} \
21 |     && mkdir build \
22 |     && cd build \
23 |     && cmake -DENABLE_BOOST=OFF .. \
24 |     && make \
25 |     && make install \
26 |     && ldconfig
27 | #CMD tail -f /dev/null
28 |
29 | FROM amd64/openjdk:11.0.8-slim-buster
30 | # Copy /usr/lib and /usr/local from the builder stage, where poppler was built and installed.
31 | COPY --from=POPPLER_BUILDER /usr/lib /usr/lib
32 | COPY --from=POPPLER_BUILDER /usr/local /usr/local
33 |
34 | # Runtime packages; the apt package lists are removed in the same layer so they
35 | # do not end up in the final image.
36 | RUN apt-get update && apt-get install -y bash ca-certificates \
37 |     libjpeg62-turbo libcairo2 libxml2 \
38 |     fontconfig liblcms2-2 \
39 |     libtiff5 \
40 |     && rm -rf /var/lib/apt/lists/*
41 | # NOTE(review): the commented-out lines below look like leftovers from an Alpine/musl
42 | # variant of this image (addgroup -S, ld-musl path) -- confirm before removing.
43 | # &&\
44 | #libopenjpeg5
45 | #libstdc++6 && \
46 | #addgroup -S appgroup && \
47 | #adduser -S appuser -G appgroup -h /work && \
48 | #echo "/usr/local/lib64:/lib:/usr/local/lib:/usr/lib:/usr/lib/x86_64-linux-gnu:/lib64:/lib/x86_64-linux-gnu" > /etc/ld-musl-x86_64.path
49 |
50 | COPY target/pdftotext-1.0.0-SNAPSHOT.jar /pdftotext-1.0.0-SNAPSHOT.jar
51 |
52 |
53 | ENTRYPOINT ["java","-jar","/pdftotext-1.0.0-SNAPSHOT.jar"]
54 | #WORKDIR /work
55 |
56 |
--------------------------------------------------------------------------------
/tool-runners/pdftotext/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Root logger option
2 | log4j.rootLogger=INFO, stdout
3 |
4 | # Direct log messages to stdout
5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
6 | log4j.appender.stdout.Target=System.out
7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
9 |
--------------------------------------------------------------------------------
/tool-runners/polyfile/Dockerfile:
--------------------------------------------------------------------------------
1 | # this should be cleaned up dramatically
2 | # I tried to build polyfile in a base container and then
3 | # copy the right bits into the final image, but I couldn't figure
4 | # out how to get all the dependencies...so this is backwards
5 | # from the other docker files: build the java first, then
6 | # copy that jar into the build container for polyfile.
7 |
8 | FROM python:3.10.4-alpine3.15
9 | # Commented-out alternative: install polyfile from a git checkout instead of PyPI.
10 | #   apk add git && git clone -b v0.1.6 https://github.com/trailofbits/polyfile.git
11 | #   cd polyfile && pip3 install -e .
12 | RUN apk add --no-cache \
13 |     bash \
14 |     libffi-dev \
15 |     zlib \
16 |     build-base py-pip jpeg-dev zlib-dev \
17 |     openjdk11-jre
18 |
19 |
20 | # NOTE(review): presumably lets native builds triggered by pip find the
21 | # zlib/jpeg libraries installed above -- confirm before changing.
22 | ENV LIBRARY_PATH=/lib:/usr/lib
23 |
24 | # --no-cache-dir keeps pip's download/wheel cache out of the image layer.
25 | RUN pip3 install --no-cache-dir polyfile==0.4.2
26 |
27 | COPY target/polyfile-1.0.0-SNAPSHOT.jar /polyfile-1.0.0-SNAPSHOT.jar
28 |
29 | ENTRYPOINT ["java","-jar","/polyfile-1.0.0-SNAPSHOT.jar"]
30 |
--------------------------------------------------------------------------------
/tool-runners/polyfile/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Root logger option
2 | log4j.rootLogger=INFO, stdout
3 |
4 | # Direct log messages to stdout
5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
6 | log4j.appender.stdout.Target=System.out
7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
9 |
--------------------------------------------------------------------------------
/tool-runners/qpdf/Dockerfile:
--------------------------------------------------------------------------------
1 | # Fortunately the latest qpdf is available prebuilt (for now).
2 | # For future reference, a from-source build would start with something like this:
3 | #curl g++ \
4 | ## && curl -o qpdf-10.0.1.tgz https://gigenet.dl.sourceforge.net/project/qpdf/qpdf/10.0.1/qpdf-10.0.1.tar.gz \
5 | # # && tar -xzvf qpdf-10.0.1.tgz
6 | #
7 | ##RUN cd qpdf-10.0.1 && \
8 | # # ./configure
9 | #
10 | ##RUN make install
11 |
12 | # The alpine version dictates which qpdf version is available;
13 | # see e.g. https://pkgs.alpinelinux.org/packages?name=qpdf&branch=v3.13 to search for a match.
14 | # NOTE(review): alpine:edge is a rolling tag, so the pinned qpdf=11.1.1-r0 below may stop resolving -- consider pinning a release tag.
15 | FROM alpine:edge
16 | RUN apk add --no-cache \
17 | qpdf=11.1.1-r0 \
18 | openjdk11-jre
19 |
20 |
21 | COPY target/qpdf-1.0.0-SNAPSHOT.jar /qpdf-1.0.0-SNAPSHOT.jar
22 |
23 | ENTRYPOINT ["java","-jar","/qpdf-1.0.0-SNAPSHOT.jar"]
24 |
25 |
26 | # Example usage:
27 | # docker build -t qpdf-image .
28 |
29 | # docker run --name qpdf-container --network host --env-file env.properties -v /data/docs:/input -v /data/meta/qpdf/json:/output qpdf-image
--------------------------------------------------------------------------------
/tool-runners/qpdf/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Root logger option
2 | log4j.rootLogger=INFO, stdout
3 |
4 | # Direct log messages to stdout
5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
6 | log4j.appender.stdout.Target=System.out
7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
9 |
--------------------------------------------------------------------------------
/tool-runners/tika-client/src/main/resources/log4j2.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
--------------------------------------------------------------------------------
/tool-runners/tika/src/main/resources/log4j2.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
--------------------------------------------------------------------------------
/tool-runners/xpdffonts/Dockerfile:
--------------------------------------------------------------------------------
1 | # JRE image with the prebuilt xpdf 4.03 command-line tools and ghostscript;
2 | # runs the xpdffonts tool-runner jar.
3 | FROM amd64/openjdk:11.0.8-slim-buster
4 |
5 |
6 | # wget is used below to fetch the xpdf tools; the apt package lists are removed
7 | # in the same layer so they do not end up in the image.
8 | RUN apt-get update && apt-get install -y wget ghostscript \
9 |     && rm -rf /var/lib/apt/lists/*
10 |
11 | # Download and unpack the xpdf tools into /opt, then delete the tarball so it
12 | # does not stay in the image layer.
13 | RUN mkdir /pkg && cd /pkg && \
14 |     wget https://dl.xpdfreader.com/xpdf-tools-linux-4.03.tar.gz && \
15 |     tar -xzvf xpdf-tools-linux-4.03.tar.gz && \
16 |     mv xpdf-tools-linux-4.03 /opt/xpdf-tools-linux-4.03 && \
17 |     rm xpdf-tools-linux-4.03.tar.gz
18 |
19 | # -p creates the parent directory too and does not fail if it already exists.
20 | RUN mkdir -p /usr/local/share/ghostscript/fonts
21 |
22 | COPY tgzs/xpdf-t1fonts/*.pfb /usr/local/share/ghostscript/fonts/
23 |
24 | COPY xpdfrc /usr/local/etc/xpdfrc
25 |
26 | COPY xpdf /usr/local/share/xpdf
27 |
28 |
29 | # Put the 64-bit xpdf binaries on the PATH (key=value form; the whitespace form is legacy).
30 | ENV PATH="${PATH}:/opt/xpdf-tools-linux-4.03/bin64"
31 |
32 |
33 | COPY target/xpdffonts-1.0.0-SNAPSHOT.jar /xpdffonts-1.0.0-SNAPSHOT.jar
34 |
35 |
36 | ENTRYPOINT ["java","-jar","/xpdffonts-1.0.0-SNAPSHOT.jar"]
37 | #WORKDIR /work
38 |
39 |
--------------------------------------------------------------------------------
/tool-runners/xpdffonts/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Root logger option
2 | log4j.rootLogger=INFO, stdout
3 |
4 | # Direct log messages to stdout
5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
6 | log4j.appender.stdout.Target=System.out
7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
9 |
--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-arabic.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tballison/file-observatory/5677da227b1a70b1220ae05005b6064210858504/tool-runners/xpdffonts/tgzs/xpdf-arabic.tar.gz
--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-arabic/README:
--------------------------------------------------------------------------------
1 | Xpdf: Arabic support package
2 | ============================
3 |
4 | Xpdf project: http://www.foolabs.com/xpdf/
5 | 2011-aug-15
6 |
7 | If this package includes CMap files, they contain their own copyright
8 | notices and distribution conditions. All other files in the package
9 | are Copyright 2002 Glyph & Cog, LLC, and are licensed under the GNU
10 | General Public License (GPL), version 2 or 3.
11 |
12 | This package provides support files needed to use the Xpdf tools with
13 | Arabic PDF files.
14 |
15 | Contents:
16 | - ISO-8859-6 encoding
17 |
18 | Place all of these files in a directory, typically:
19 |
20 | Unix - /usr/local/share/xpdf/arabic
21 | Win32 - C:\Program Files\xpdf\arabic
22 |
23 | Add the contents of the "add-to-xpdfrc" file to your system-wide
24 | xpdfrc config file, which is typically:
25 |
26 | Unix - /usr/local/etc/xpdfrc
27 | Win32 - C:\Program Files\xpdf\xpdfrc
28 |
29 | Alternatively, on Unix systems you can add these lines to your
30 | personal xpdfrc file in $HOME/.xpdfrc.
31 |
32 | Make sure to edit the added lines to use the actual directory where
33 | the files were installed.
34 |
--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-arabic/add-to-xpdfrc:
--------------------------------------------------------------------------------
1 | #----- begin Arabic support package (2011-aug-15)
2 | unicodeMap ISO-8859-6 /usr/local/share/xpdf/arabic/ISO-8859-6.unicodeMap
3 | #----- end Arabic support package
4 |
--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-chinese-simplified.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tballison/file-observatory/5677da227b1a70b1220ae05005b6064210858504/tool-runners/xpdffonts/tgzs/xpdf-chinese-simplified.tar.gz
--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-chinese-simplified/README:
--------------------------------------------------------------------------------
1 | Xpdf: Chinese Simplified support package
2 | ========================================
3 |
4 | Xpdf project: http://www.foolabs.com/xpdf/
5 | 2020-dec-22
6 |
7 | If this package includes CMap files, they contain their own copyright
8 | notices and distribution conditions. All other files in the package
9 | are Copyright 2002-2004 Glyph & Cog, LLC, and are licensed under the
10 | GNU General Public License (GPL), version 2 or 3.
11 |
12 | This package provides support files needed to use the Xpdf tools with
13 | Chinese (Simplified) PDF files.
14 |
15 | Contents:
16 | - Adobe-GB1 character collection support
17 | - ISO-2022-CN encoding
18 | - EUC-CN encoding
19 | - GBK encoding
20 |
21 | Place all of these files in a directory, typically:
22 |
23 | Unix - /usr/local/share/xpdf/chinese-simplified
24 | Win32 - C:\Program Files\xpdf\chinese-simplified
25 |
26 | Add the contents of the "add-to-xpdfrc" file to your system-wide
27 | xpdfrc config file, which is typically:
28 |
29 | Unix - /usr/local/etc/xpdfrc
30 | Win32 - C:\Program Files\xpdf\xpdfrc
31 |
32 | Alternatively, on Unix systems you can add these lines to your
33 | personal xpdfrc file in $HOME/.xpdfrc.
34 |
35 | Make sure to edit the added lines to use the actual directory where
36 | the files were installed.
37 |
38 | To display PDF files that refer to non-embedded Chinese fonts, you
39 | will need to install a Chinese font. Free TrueType/OpenType fonts are
40 | available:
41 |
42 | http://ftp.gnu.org/gnu/non-gnu/chinese-fonts-truetype/gkai00mp.ttf.gz
43 | http://ftp.gnu.org/gnu/non-gnu/chinese-fonts-truetype/gbsn00lp.ttf.gz
44 | https://www.google.com/get/noto/
45 |
46 | After installing a Chinese font, add an appropriate "fontFileCC" line
47 | to your xpdfrc file (see the sample in "add-to-xpdfrc").
48 |
--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-chinese-simplified/add-to-xpdfrc:
--------------------------------------------------------------------------------
1 | #----- begin Chinese Simplified support package (2011-sep-02)
2 | cidToUnicode Adobe-GB1 /usr/local/share/xpdf/chinese-simplified/Adobe-GB1.cidToUnicode
3 | unicodeMap ISO-2022-CN /usr/local/share/xpdf/chinese-simplified/ISO-2022-CN.unicodeMap
4 | unicodeMap EUC-CN /usr/local/share/xpdf/chinese-simplified/EUC-CN.unicodeMap
5 | unicodeMap GBK /usr/local/share/xpdf/chinese-simplified/GBK.unicodeMap
6 | cMapDir Adobe-GB1 /usr/local/share/xpdf/chinese-simplified/CMap
7 | toUnicodeDir /usr/local/share/xpdf/chinese-simplified/CMap
8 | #fontFileCC Adobe-GB1 /usr/..../NotoSansCJKsc-Regular.otf
9 | #----- end Chinese Simplified support package
10 |
--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-chinese-traditional.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tballison/file-observatory/5677da227b1a70b1220ae05005b6064210858504/tool-runners/xpdffonts/tgzs/xpdf-chinese-traditional.tar.gz
--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-chinese-traditional/README:
--------------------------------------------------------------------------------
1 | Xpdf: Chinese Traditional support package
2 | =========================================
3 |
4 | Xpdf project: http://www.foolabs.com/xpdf/
5 | 2020-dec-22
6 |
7 | If this package includes CMap files, they contain their own copyright
8 | notices and distribution conditions. All other files in the package
9 | are Copyright 2002-2004 Glyph & Cog, LLC, and are licensed under the
10 | GNU General Public License (GPL), version 2 or 3.
11 |
12 | This package provides support files needed to use the Xpdf tools with
13 | Chinese (Traditional) PDF files.
14 |
15 | Contents:
16 | - Adobe-CNS1 character collection support
17 | - Big5 encoding
18 | - Big5ascii encoding (same as Big5, but includes 7-bit ASCII)
19 |
20 | Place all of these files in a directory, typically:
21 |
22 | Unix - /usr/local/share/xpdf/chinese-traditional
23 | Win32 - C:\Program Files\xpdf\chinese-traditional
24 |
25 | Add the contents of the "add-to-xpdfrc" file to your system-wide
26 | xpdfrc config file, which is typically:
27 |
28 | Unix - /usr/local/etc/xpdfrc
29 | Win32 - C:\Program Files\xpdf\xpdfrc
30 |
31 | Alternatively, on Unix systems you can add these lines to your
32 | personal xpdfrc file in $HOME/.xpdfrc.
33 |
34 | Make sure to edit the added lines to use the actual directory where
35 | the files were installed.
36 |
37 | To display PDF files that refer to non-embedded Chinese fonts, you
38 | will need to install a Chinese font. Free TrueType/OpenType fonts are
39 | available:
40 |
41 | http://ftp.gnu.org/gnu/non-gnu/chinese-fonts-truetype/bkai00mp.ttf.gz
42 | http://ftp.gnu.org/gnu/non-gnu/chinese-fonts-truetype/bsmi00lp.ttf.gz
43 | https://www.google.com/get/noto/
44 |
45 | After installing a Chinese font, add an appropriate "fontFileCC" line
46 | to your xpdfrc file (see the sample in "add-to-xpdfrc").
47 |
--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-chinese-traditional/add-to-xpdfrc:
--------------------------------------------------------------------------------
1 | #----- begin Chinese Traditional support package (2011-sep-02)
2 | cidToUnicode Adobe-CNS1 /usr/local/share/xpdf/chinese-traditional/Adobe-CNS1.cidToUnicode
3 | unicodeMap Big5 /usr/local/share/xpdf/chinese-traditional/Big5.unicodeMap
4 | unicodeMap Big5ascii /usr/local/share/xpdf/chinese-traditional/Big5ascii.unicodeMap
5 | cMapDir Adobe-CNS1 /usr/local/share/xpdf/chinese-traditional/CMap
6 | toUnicodeDir /usr/local/share/xpdf/chinese-traditional/CMap
7 | #fontFileCC Adobe-CNS1 /usr/..../NotoSansCJKtc-Regular.otf
8 | #----- end Chinese Traditional support package
9 |
--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-cyrillic.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tballison/file-observatory/5677da227b1a70b1220ae05005b6064210858504/tool-runners/xpdffonts/tgzs/xpdf-cyrillic.tar.gz
--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-cyrillic/README:
--------------------------------------------------------------------------------
1 | Xpdf: Cyrillic support package
2 | ==============================
3 |
4 | Xpdf project: http://www.foolabs.com/xpdf/
5 | 2011-aug-15
6 |
7 | If this package includes CMap files, they contain their own copyright
8 | notices and distribution conditions. All other files in the package
9 | are Copyright 2002 Glyph & Cog, LLC, and are licensed under the GNU
10 | General Public License (GPL), version 2 or 3.
11 |
12 | This package provides support files needed to use the Xpdf tools with
13 | Cyrillic PDF files.
14 |
15 | Contents:
16 | - Bulgarian character names
17 | - KOI8-R encoding
18 |
19 | Place all of these files in a directory, typically:
20 |
21 | Unix - /usr/local/share/xpdf/cyrillic
22 | Win32 - C:\Program Files\xpdf\cyrillic
23 |
24 | Add the contents of the "add-to-xpdfrc" file to your system-wide
25 | xpdfrc config file, which is typically:
26 |
27 | Unix - /usr/local/etc/xpdfrc
28 | Win32 - C:\Program Files\xpdf\xpdfrc
29 |
30 | Alternatively, on Unix systems you can add these lines to your
31 | personal xpdfrc file in $HOME/.xpdfrc.
32 |
33 | Make sure to edit the added lines to use the actual directory where
34 | the files were installed.
35 |
--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-cyrillic/add-to-xpdfrc:
--------------------------------------------------------------------------------
1 | #----- begin Cyrillic support package (2011-aug-15)
2 | nameToUnicode /usr/local/share/xpdf/cyrillic/Bulgarian.nameToUnicode
3 | unicodeMap KOI8-R /usr/local/share/xpdf/cyrillic/KOI8-R.unicodeMap
4 | #----- end Cyrillic support package
5 |
--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-greek.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tballison/file-observatory/5677da227b1a70b1220ae05005b6064210858504/tool-runners/xpdffonts/tgzs/xpdf-greek.tar.gz
--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-greek/README:
--------------------------------------------------------------------------------
1 | Xpdf: Greek support package
2 | ===========================
3 |
4 | Xpdf project: http://www.foolabs.com/xpdf/
5 | 2011-aug-15
6 |
7 | If this package includes CMap files, they contain their own copyright
8 | notices and distribution conditions. All other files in the package
9 | are Copyright 2002 Glyph & Cog, LLC, and are licensed under the GNU
10 | General Public License (GPL), version 2 or 3.
11 |
12 | This package provides support files needed to use the Xpdf tools with
13 | Greek PDF files.
14 |
15 | Contents:
16 | - Greek character names (alternates)
17 | - ISO-8859-7 encoding
18 |
19 | Place all of these files in a directory, typically:
20 |
21 | Unix - /usr/local/share/xpdf/greek
22 | Win32 - C:\Program Files\xpdf\greek
23 |
24 | Add the contents of the "add-to-xpdfrc" file to your system-wide
25 | xpdfrc config file, which is typically:
26 |
27 | Unix - /usr/local/etc/xpdfrc
28 | Win32 - C:\Program Files\xpdf\xpdfrc
29 |
30 | Alternatively, on Unix systems you can add these lines to your
31 | personal xpdfrc file in $HOME/.xpdfrc.
32 |
33 | Make sure to edit the added lines to use the actual directory where
34 | the files were installed.
35 |
--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-greek/add-to-xpdfrc:
--------------------------------------------------------------------------------
1 | #----- begin Greek support package (2011-aug-15)
2 | nameToUnicode /usr/local/share/xpdf/greek/Greek.nameToUnicode
3 | unicodeMap ISO-8859-7 /usr/local/share/xpdf/greek/ISO-8859-7.unicodeMap
4 | #----- end Greek support package
5 |
--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-hebrew.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tballison/file-observatory/5677da227b1a70b1220ae05005b6064210858504/tool-runners/xpdffonts/tgzs/xpdf-hebrew.tar.gz
--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-hebrew/README:
--------------------------------------------------------------------------------
1 | Xpdf: Hebrew support package
2 | ============================
3 |
4 | Xpdf project: http://www.foolabs.com/xpdf/
5 | 2011-aug-15
6 |
7 | If this package includes CMap files, they contain their own copyright
8 | notices and distribution conditions. All other files in the package
9 | are Copyright 2002 Glyph & Cog, LLC, and are licensed under the GNU
10 | General Public License (GPL), version 2 or 3.
11 |
12 | This package provides support files needed to use the Xpdf tools with
13 | Hebrew PDF files.
14 |
15 | Contents:
16 | - ISO-8859-8 encoding
17 | - Windows-1255 encoding
18 |
19 | Place all of these files in a directory, typically:
20 |
21 | Unix - /usr/local/share/xpdf/hebrew
22 | Win32 - C:\Program Files\xpdf\hebrew
23 |
24 | Add the contents of the "add-to-xpdfrc" file to your system-wide
25 | xpdfrc config file, which is typically:
26 |
27 | Unix - /usr/local/etc/xpdfrc
28 | Win32 - C:\Program Files\xpdf\xpdfrc
29 |
30 | Alternatively, on Unix systems you can add these lines to your
31 | personal xpdfrc file in $HOME/.xpdfrc.
32 |
33 | Make sure to edit the added lines to use the actual directory where
34 | the files were installed.
35 |
--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-hebrew/add-to-xpdfrc:
--------------------------------------------------------------------------------
1 | #----- begin Hebrew support package (2011-aug-15)
2 | unicodeMap ISO-8859-8 /usr/local/share/xpdf/hebrew/ISO-8859-8.unicodeMap
3 | unicodeMap Windows-1255 /usr/local/share/xpdf/hebrew/Windows-1255.unicodeMap
4 | #----- end Hebrew support package
5 |
--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-japanese.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tballison/file-observatory/5677da227b1a70b1220ae05005b6064210858504/tool-runners/xpdffonts/tgzs/xpdf-japanese.tar.gz
--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-japanese/README:
--------------------------------------------------------------------------------
1 | Xpdf: Japanese support package
2 | ==============================
3 |
4 | Xpdf project: http://www.foolabs.com/xpdf/
5 | 2020-dec-22
6 |
7 | If this package includes CMap files, they contain their own copyright
8 | notices and distribution conditions. All other files in the package
9 | are Copyright 2002-2004 Glyph & Cog, LLC, and are licensed under the
10 | GNU General Public License (GPL), version 2 or 3.
11 |
12 | This package provides support files needed to use the Xpdf tools with
13 | Japanese PDF files.
14 |
15 | Contents:
16 | - Adobe-Japan1 character collection support
17 | - ISO-2022-JP encoding
18 | - EUC-JP encoding
19 | - Shift-JIS encoding
20 |
21 | Place all of these files in a directory, typically:
22 |
23 | Unix - /usr/local/share/xpdf/japanese
24 | Win32 - C:\Program Files\xpdf\japanese
25 |
26 | Add the contents of the "add-to-xpdfrc" file to your system-wide
27 | xpdfrc config file, which is typically:
28 |
29 | Unix - /usr/local/etc/xpdfrc
30 | Win32 - C:\Program Files\xpdf\xpdfrc
31 |
32 | Alternatively, on Unix systems you can add these lines to your
33 | personal xpdfrc file in $HOME/.xpdfrc.
34 |
35 | Make sure to edit the added lines to use the actual directory where
36 | the files were installed.
37 |
38 | To display PDF files that refer to non-embedded Japanese fonts, you
39 | will need to install a Japanese font. Free TrueType/OpenType fonts
40 | are available:
41 |
42 | http://packages.debian.org/stable/x11/ttf-kochi-mincho
43 | http://packages.debian.org/stable/x11/ttf-kochi-gothic
44 | https://www.google.com/get/noto/
45 |
46 | After installing a Japanese font, add an appropriate "fontFileCC" line
47 | to your xpdfrc file (see the sample in "add-to-xpdfrc").
48 |
--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-japanese/add-to-xpdfrc:
--------------------------------------------------------------------------------
1 | #----- begin Japanese support package (2011-sep-02)
2 | cidToUnicode Adobe-Japan1 /usr/local/share/xpdf/japanese/Adobe-Japan1.cidToUnicode
3 | unicodeMap ISO-2022-JP /usr/local/share/xpdf/japanese/ISO-2022-JP.unicodeMap
4 | unicodeMap EUC-JP /usr/local/share/xpdf/japanese/EUC-JP.unicodeMap
5 | unicodeMap Shift-JIS /usr/local/share/xpdf/japanese/Shift-JIS.unicodeMap
6 | cMapDir Adobe-Japan1 /usr/local/share/xpdf/japanese/CMap
7 | toUnicodeDir /usr/local/share/xpdf/japanese/CMap
8 | #fontFileCC Adobe-Japan1 /usr/..../NotoSansCJKjp-Regular.otf
9 | #----- end Japanese support package
10 |
--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-korean.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tballison/file-observatory/5677da227b1a70b1220ae05005b6064210858504/tool-runners/xpdffonts/tgzs/xpdf-korean.tar.gz
--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-korean/README:
--------------------------------------------------------------------------------
1 | Xpdf: Korean support package
2 | ============================
3 |
4 | Xpdf project: http://www.foolabs.com/xpdf/
5 | 2020-dec-22
6 |
7 | If this package includes CMap files, they contain their own copyright
8 | notices and distribution conditions. All other files in the package
9 | are Copyright 2002-2005 Glyph & Cog, LLC, and are licensed under the
10 | GNU General Public License (GPL), version 2 or 3.
11 |
12 | This package provides support files needed to use the Xpdf tools with
13 | Korean PDF files.
14 |
15 | Contents:
16 | - Adobe-Korea1 character collection support
17 | - Adobe-KR character collection support
18 | - ISO-2022-KR encoding
19 |
20 | Place all of these files in a directory, typically:
21 |
22 | Unix - /usr/local/share/xpdf/korean
23 | Win32 - C:\Program Files\xpdf\korean
24 |
25 | Add the contents of the "add-to-xpdfrc" file to your system-wide
26 | xpdfrc config file, which is typically:
27 |
28 | Unix - /usr/local/etc/xpdfrc
29 | Win32 - C:\Program Files\Xpdf\xpdfrc
30 |
31 | Alternatively, on Unix systems you can add these lines to your
32 | personal xpdfrc file in $HOME/.xpdfrc.
33 |
34 | Make sure to edit the added lines to use the actual directory where
35 | the files were installed.
36 |
37 | To display PDF files that refer to non-embedded Korean fonts, you will
38 | need to install a Korean font. Free TrueType/OpenType fonts are
39 | available:
40 |
41 | ftp://ftp.mizi.com/pub/baekmuk/baekmuk-ttf-2.1.tar.gz
42 | https://www.google.com/get/noto/
43 |
44 | After installing a Korean font, add appropriate "fontFileCC"
45 | lines to your xpdfrc file (see the sample in "add-to-xpdfrc").
46 |
--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-korean/add-to-xpdfrc:
--------------------------------------------------------------------------------
1 | #----- begin Korean support package (2011-sep-02)
2 | cidToUnicode Adobe-Korea1 /usr/local/share/xpdf/korean/Adobe-Korea1.cidToUnicode
3 | cidToUnicode Adobe-KR /usr/local/share/xpdf/korean/Adobe-KR.cidToUnicode
4 | unicodeMap ISO-2022-KR /usr/local/share/xpdf/korean/ISO-2022-KR.unicodeMap
5 | cMapDir Adobe-Korea1 /usr/local/share/xpdf/korean/CMap
6 | cMapDir Adobe-KR /usr/local/share/xpdf/korean/CMap
7 | toUnicodeDir /usr/local/share/xpdf/korean/CMap
8 | #fontFileCC Adobe-Korea1 /usr/..../NotoSansCJKkr-Regular.otf
9 | #fontFileCC Adobe-KR /usr/..../NotoSansCJKkr-Regular.otf
10 | #----- end Korean support package
11 |
--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-latin2.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tballison/file-observatory/5677da227b1a70b1220ae05005b6064210858504/tool-runners/xpdffonts/tgzs/xpdf-latin2.tar.gz
--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-latin2/README:
--------------------------------------------------------------------------------
1 | Xpdf: Latin2 support package
2 | ============================
3 |
4 | Xpdf project: http://www.foolabs.com/xpdf/
5 | 2011-aug-15
6 |
7 | If this package includes CMap files, they contain their own copyright
8 | notices and distribution conditions. All other files in the package
9 | are Copyright 2002 Glyph & Cog, LLC, and are licensed under the GNU
10 | General Public License (GPL), version 2 or 3.
11 |
12 | This package provides support files needed to use the Xpdf tools with
13 | Latin2 PDF files.
14 |
15 | Contents:
16 | - Latin2 encoding
17 |
18 | Place all of these files in a directory, typically:
19 |
20 | Unix - /usr/local/share/xpdf/latin2
21 | Win32 - C:\Program Files\xpdf\latin2
22 |
23 | Add the contents of the "add-to-xpdfrc" file to your system-wide
24 | xpdfrc config file, which is typically:
25 |
26 | Unix - /usr/local/etc/xpdfrc
27 | Win32 - C:\Program Files\xpdf\xpdfrc
28 |
29 | Alternatively, on Unix systems you can add these lines to your
30 | personal xpdfrc file in $HOME/.xpdfrc.
31 |
32 | Make sure to edit the added lines to use the actual directory where
33 | the files were installed.
34 |
--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-latin2/add-to-xpdfrc:
--------------------------------------------------------------------------------
1 | #----- begin Latin2 support package (2011-aug-15)
2 | unicodeMap Latin2 /usr/local/share/xpdf/latin2/Latin2.unicodeMap
3 | #----- end Latin2 support package
4 |
--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-t1fonts.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tballison/file-observatory/5677da227b1a70b1220ae05005b6064210858504/tool-runners/xpdffonts/tgzs/xpdf-t1fonts.tar.gz
--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-t1fonts/README:
--------------------------------------------------------------------------------
1 | This package contains two fonts:
2 |
3 | s050000l.pfb -- Symbol
4 | d050000l.pfb -- Zapf Dingbats
5 |
6 | These fonts are substitutes for the corresponding Base-14 fonts. They
7 | are part of the font set contributed to the ghostscript project by
8 | URW++ Design and Development Incorporated of Hamburg, Germany
9 | (http://www.urwpp.de/). They have been released under the GNU General
10 | Public License (GPL) v2 -- see the "COPYING" file.
11 |
--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-t1fonts/d050000l.pfb:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tballison/file-observatory/5677da227b1a70b1220ae05005b6064210858504/tool-runners/xpdffonts/tgzs/xpdf-t1fonts/d050000l.pfb
--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-t1fonts/s050000l.pfb:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tballison/file-observatory/5677da227b1a70b1220ae05005b6064210858504/tool-runners/xpdffonts/tgzs/xpdf-t1fonts/s050000l.pfb
--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-thai.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tballison/file-observatory/5677da227b1a70b1220ae05005b6064210858504/tool-runners/xpdffonts/tgzs/xpdf-thai.tar.gz
--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-thai/README:
--------------------------------------------------------------------------------
1 | Xpdf: Thai support package
2 | ==========================
3 |
4 | Xpdf project: http://www.foolabs.com/xpdf/
5 | 2011-aug-15
6 |
7 | If this package includes CMap files, they contain their own copyright
8 | notices and distribution conditions. All other files in the package
9 | are Copyright 2002 Glyph & Cog, LLC, and are licensed under the GNU
10 | General Public License (GPL), version 2 or 3.
11 |
12 | This package provides support files needed to use the Xpdf tools with
13 | Thai PDF files.
14 |
15 | Contents:
16 | - Thai character names
17 | - TIS-620 encoding
18 |
19 | Place all of these files in a directory, typically:
20 |
21 | Unix - /usr/local/share/xpdf/thai
22 | Win32 - C:\Program Files\xpdf\thai
23 |
24 | Add the contents of the "add-to-xpdfrc" file to your system-wide
25 | xpdfrc config file, which is typically:
26 |
27 | Unix - /usr/local/etc/xpdfrc
28 | Win32 - C:\Program Files\xpdf\xpdfrc
29 |
30 | Alternatively, on Unix systems you can add these lines to your
31 | personal xpdfrc file in $HOME/.xpdfrc.
32 |
33 | Make sure to edit the added lines to use the actual directory where
34 | the files were installed.
35 |
--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-thai/add-to-xpdfrc:
--------------------------------------------------------------------------------
1 | #----- begin Thai support package (2011-aug-15)
2 | nameToUnicode /usr/local/share/xpdf/thai/Thai.nameToUnicode
3 | unicodeMap TIS-620 /usr/local/share/xpdf/thai/TIS-620.unicodeMap
4 | #----- end Thai support package
5 |
--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-turkish.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tballison/file-observatory/5677da227b1a70b1220ae05005b6064210858504/tool-runners/xpdffonts/tgzs/xpdf-turkish.tar.gz
--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-turkish/README:
--------------------------------------------------------------------------------
1 | Xpdf: Turkish support package
2 | =============================
3 |
4 | Xpdf project: http://www.foolabs.com/xpdf/
5 | 2011-aug-15
6 |
7 | If this package includes CMap files, they contain their own copyright
8 | notices and distribution conditions. All other files in the package
9 | are Copyright 2002 Glyph & Cog, LLC, and are licensed under the GNU
10 | General Public License (GPL), version 2 or 3.
11 |
12 | This package provides support files needed to use the Xpdf tools with
13 | Turkish PDF files.
14 |
15 | Contents:
16 | - ISO-8859-9 encoding
17 |
18 | Place all of these files in a directory, typically:
19 |
20 | Unix - /usr/local/share/xpdf/turkish
21 | Win32 - C:\Program Files\xpdf\turkish
22 |
23 | Add the contents of the "add-to-xpdfrc" file to your system-wide
24 | xpdfrc config file, which is typically:
25 |
26 | Unix - /usr/local/etc/xpdfrc
27 | Win32 - C:\Program Files\xpdf\xpdfrc
28 |
29 | Alternatively, on Unix systems you can add these lines to your
30 | personal xpdfrc file in $HOME/.xpdfrc.
31 |
32 | Make sure to edit the added lines to use the actual directory where
33 | the files were installed.
34 |
--------------------------------------------------------------------------------
/tool-runners/xpdffonts/tgzs/xpdf-turkish/add-to-xpdfrc:
--------------------------------------------------------------------------------
1 | #----- begin Turkish support package (2011-aug-15)
2 | unicodeMap ISO-8859-9 /usr/local/share/xpdf/turkish/ISO-8859-9.unicodeMap
3 | #----- end Turkish support package
4 |
--------------------------------------------------------------------------------
/tool-runners/xpdffonts/xpdf/arabic/ISO-8859-6.unicodeMap:
--------------------------------------------------------------------------------
1 | 000a 000a 0a
2 | 000c 000d 0c
3 | 0020 007e 20
4 | 00a0 00a0 20
5 | 00a4 a4
6 | 00ad ad
7 | 02c6 5e
8 | 02dc 7e
9 | 060c 060c ac
10 | 061b 061b bb
11 | 061f 061f bf
12 | 0621 063a c1
13 | 0640 0652 e0
14 | 2013 2013 ad
15 | 2014 2014 2d2d
16 | 2018 2018 60
17 | 2019 2019 27
18 | 201a 201a 2c
19 | 201c 201c 22
20 | 201d 201d 22
21 | 201e 201e 2c2c
22 | 2026 2026 2e2e2e
23 | 2039 2039 3c
24 | 203a 203a 3e
25 | 2044 2044 2f
26 | 2122 2122 544d
27 | 2212 2212 2d
28 | f6f9 f6f9 4c
29 | f6fe f6fe 7e
30 | f721 f721 21
31 | f724 f724 24
32 | f726 f726 26
33 | f730 f739 30
34 | f73f f73f 3f
35 | f761 f77a 41
36 | fb00 fb00 6666
37 | fb01 fb01 6669
38 | fb02 fb02 666c
39 | fb03 fb03 666669
40 | fb04 fb04 66666c
41 |
--------------------------------------------------------------------------------
/tool-runners/xpdffonts/xpdf/chinese-simplified/CMap/GBpc-EUC-UCS2:
--------------------------------------------------------------------------------
1 | %!PS-Adobe-3.0 Resource-CMap
%%DocumentNeededResources: ProcSet (CIDInit)
%%DocumentNeededResources: CMap (GBpc-EUC-UCS2C)
%%IncludeResource: ProcSet (CIDInit)
%%IncludeResource: CMap (GBpc-EUC-UCS2C)
%%BeginResource: CMap (GBpc-EUC-UCS2)
%%Title: (GBpc-EUC-UCS2)
%%Version: 4.002
%%Copyright: -----------------------------------------------------------
%%Copyright: Copyright 1990-1997 Adobe Systems Incorporated.
%%Copyright: All Rights Reserved.
%%Copyright:
%%Copyright: Patents Pending
%%Copyright:
%%Copyright: NOTICE: All information contained herein is the property
%%Copyright: of Adobe Systems Incorporated.
%%Copyright:
%%Copyright: Permission is granted for redistribution of this file
%%Copyright: provided this copyright notice is maintained intact and
%%Copyright: that the contents of this file are not altered in any
%%Copyright: way from its original form.
%%Copyright:
%%Copyright: PostScript and Display PostScript are trademarks of
%%Copyright: Adobe Systems Incorporated which may be registered in
%%Copyright: certain jurisdictions.
%%Copyright: -----------------------------------------------------------
%%EndComments
/CIDInit /ProcSet findresource begin
12 dict begin
begincmap
/GBpc-EUC-UCS2C usecmap
/CIDSystemInfo 3 dict dup begin
/Registry (Adobe) def
/Ordering (GBpc_EUC_UCS2) def
/Supplement 2 def
end def
/CMapName /GBpc-EUC-UCS2 def
/CMapVersion 4.002 def
/CMapType 1 def
/WMode 0 def
1 beginbfrange
<006e0300>
endbfrange
endcmap
CMapName currentdict /CMap defineresource pop
end
end
%%EndResource
%%EOF
--------------------------------------------------------------------------------
/tool-runners/xpdffonts/xpdf/chinese-simplified/CMap/LICENSE.md:
--------------------------------------------------------------------------------
1 | Copyright 1990-2019 Adobe. All rights reserved.
2 |
3 | Redistribution and use in source and binary forms, with or without
4 | modification, are permitted provided that the following conditions are
5 | met:
6 |
7 | Redistributions of source code must retain the above copyright notice,
8 | this list of conditions and the following disclaimer.
9 |
10 | Redistributions in binary form must reproduce the above copyright
11 | notice, this list of conditions and the following disclaimer in the
12 | documentation and/or other materials provided with the distribution.
13 |
14 | Neither the name of Adobe nor the names of its contributors may be
15 | used to endorse or promote products derived from this software without
16 | specific prior written permission.
17 |
18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 |
--------------------------------------------------------------------------------
/tool-runners/xpdffonts/xpdf/chinese-traditional/CMap/B5pc-UCS2:
--------------------------------------------------------------------------------
1 | %!PS-Adobe-3.0 Resource-CMap
%%DocumentNeededResources: ProcSet (CIDInit)
%%DocumentNeededResources: CMap (B5pc-UCS2)
%%IncludeResource: ProcSet (CIDInit)
%%IncludeResource: CMap (B5pc-UCS2C)
%%BeginResource: CMap (B5pc-UCS2)
%%Title: (B5pc-UCS2)
%%Version: 4.002
%%Copyright: -----------------------------------------------------------
%%Copyright: Copyright 1990-1997 Adobe Systems Incorporated.
%%Copyright: All Rights Reserved.
%%Copyright:
%%Copyright: Patents Pending
%%Copyright:
%%Copyright: NOTICE: All information contained herein is the property
%%Copyright: of Adobe Systems Incorporated.
%%Copyright:
%%Copyright: Permission is granted for redistribution of this file
%%Copyright: provided this copyright notice is maintained intact and
%%Copyright: that the contents of this file are not altered in any
%%Copyright: way from its original form.
%%Copyright:
%%Copyright: PostScript and Display PostScript are trademarks of
%%Copyright: Adobe Systems Incorporated which may be registered in
%%Copyright: certain jurisdictions.
%%Copyright: -----------------------------------------------------------
%%EndComments
/CIDInit /ProcSet findresource begin
12 dict begin
begincmap
/B5pc-UCS2C usecmap
/CIDSystemInfo 3 dict dup begin
/Registry (Adobe) def
/Ordering (B5pc_UCS2) def
/Supplement 0 def
end def
/CMapName /B5pc-UCS2 def
/CMapVersion 4.002 def
/CMapType 1 def
/WMode 0 def
endcmap
CMapName currentdict /CMap defineresource pop
end
end
%%EndResource
%%EOF
--------------------------------------------------------------------------------
/tool-runners/xpdffonts/xpdf/chinese-traditional/CMap/LICENSE.md:
--------------------------------------------------------------------------------
1 | Copyright 1990-2019 Adobe. All rights reserved.
2 |
3 | Redistribution and use in source and binary forms, with or without
4 | modification, are permitted provided that the following conditions are
5 | met:
6 |
7 | Redistributions of source code must retain the above copyright notice,
8 | this list of conditions and the following disclaimer.
9 |
10 | Redistributions in binary form must reproduce the above copyright
11 | notice, this list of conditions and the following disclaimer in the
12 | documentation and/or other materials provided with the distribution.
13 |
14 | Neither the name of Adobe nor the names of its contributors may be
15 | used to endorse or promote products derived from this software without
16 | specific prior written permission.
17 |
18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 |
--------------------------------------------------------------------------------
/tool-runners/xpdffonts/xpdf/cyrillic/Bulgarian.nameToUnicode:
--------------------------------------------------------------------------------
1 | 0410 As
2 | 0411 Buki
3 | 0412 Wjedi
4 | 0413 Glagol
5 | 0414 Dobro
6 | 0415 Jest
7 | 0416 Schiwete
8 | 0417 Selmja
9 | 0418 Ische
10 | 0419 Ischebreve
11 | 041a Kako
12 | 041b Ljudi
13 | 041c Muislete
14 | 041d Nasche
15 | 041e On
16 | 041f Pakoj
17 | 0420 Rzui
18 | 0421 Slovo
19 | 0422 Twerdo
20 | 0423 Uk
21 | 0424 Fert
22 | 0425 Cherr
23 | 0426 Zui
24 | 0427 Tscherw
25 | 0428 Scha
26 | 0429 Schtscha
27 | 042a Jerr
28 | 042e Ju
29 | 042f Ja
30 | 0430 as
31 | 0431 buki
32 | 0432 wjedi
33 | 0433 glagol
34 | 0434 dobro
35 | 0435 jest
36 | 0436 schiwete
37 | 0437 selmja
38 | 0438 ische
39 | 0439 ischebreve
40 | 043a kako
41 | 043b ljudi
42 | 043c muislete
43 | 043d nasche
44 | 043e on
45 | 043f pakoj
46 | 0440 rzui
47 | 0441 slovo
48 | 0442 twerdo
49 | 0443 uk
50 | 0444 fert
51 | 0445 cherr
52 | 0446 zui
53 | 0447 tscherw
54 | 0448 scha
55 | 0449 schtscha
56 | 044a jerr
57 | 044e ju
58 | 044f ja
59 |
--------------------------------------------------------------------------------
/tool-runners/xpdffonts/xpdf/cyrillic/KOI8-R.unicodeMap:
--------------------------------------------------------------------------------
1 | 000a 0a
2 | 000c 000d 0c
3 | 0020 007e 20
4 | 00a0 9a
5 | 00a9 bf
6 | 00b0 9c
7 | 00b2 9d
8 | 00b7 9e
9 | 00f7 9f
10 | 02c6 5e
11 | 02da 9c
12 | 02dc 7e
13 | 0401 b3
14 | 0410 0411 e1
15 | 0412 f7
16 | 0413 e7
17 | 0414 0415 e4
18 | 0416 f6
19 | 0417 fa
20 | 0418 041f e9
21 | 0420 0423 f2
22 | 0424 e6
23 | 0425 e8
24 | 0426 e3
25 | 0427 fe
26 | 0428 fb
27 | 0429 fd
28 | 042a ff
29 | 042b f9
30 | 042c f8
31 | 042d fc
32 | 042e e0
33 | 042f f1
34 | 0430 0431 c1
35 | 0432 d7
36 | 0433 c7
37 | 0434 0435 c4
38 | 0436 d6
39 | 0437 da
40 | 0438 c9
41 | 0439 043f ca
42 | 0440 0443 d2
43 | 0444 c6
44 | 0445 c8
45 | 0446 c3
46 | 0447 de
47 | 0448 db
48 | 0449 dd
49 | 044a df
50 | 044b d9
51 | 044c d8
52 | 044d dc
53 | 044e c0
54 | 044f d1
55 | 0451 a3
56 | 2013 2d
57 | 2014 2d2d
58 | 2018 60
59 | 2019 27
60 | 201a 2c
61 | 201c 22
62 | 201d 22
63 | 201e 2c2c
64 | 2022 9e
65 | 2026 2e2e2e
66 | 2039 3c
67 | 203a 3e
68 | 2044 2f
69 | 2122 544d
70 | 2212 2d
71 | 2219 221a 95
72 | 2248 97
73 | 2264 2265 98
74 | 2320 93
75 | 2321 9b
76 | 2500 80
77 | 2502 81
78 | 250c 82
79 | 2510 83
80 | 2514 84
81 | 2518 85
82 | 251c 86
83 | 2524 87
84 | 252c 88
85 | 2534 89
86 | 253c 8a
87 | 2550 2552 a0
88 | 2553 2561 a4
89 | 2562 256c b4
90 | 2580 8b
91 | 2584 8c
92 | 2588 8d
93 | 258c 8e
94 | 2590 2593 8f
95 | 25a0 94
96 | fb00 6666
97 | fb01 6669
98 | fb02 666c
99 | fb03 666669
100 | fb04 66666c
101 |
--------------------------------------------------------------------------------
/tool-runners/xpdffonts/xpdf/greek/Greek.nameToUnicode:
--------------------------------------------------------------------------------
1 | 0396 Dzeta
2 | 039e Ksi
3 | 039f Omikron
4 | 03a7 Khi
5 | 03b2 betatwo
6 | 03b6 dzeta
7 | 03be ksi
8 | 03bf omikron
9 | 03c3 sigmafinal
10 | 03c6 phitwo
11 | 03c7 khi
12 |
--------------------------------------------------------------------------------
/tool-runners/xpdffonts/xpdf/greek/ISO-8859-7.unicodeMap:
--------------------------------------------------------------------------------
1 | 000a 0a
2 | 000c 000d 0c
3 | 0020 007e 20
4 | 00a0 a0
5 | 00a3 a3
6 | 00a6 00a9 a6
7 | 00ab 00ad ab
8 | 00b0 00b4 b0
9 | 00b5 ec
10 | 00b7 b7
11 | 00bb bb
12 | 00bd bd
13 | 02c6 5e
14 | 02da b0
15 | 02dc 7e
16 | 0374 b4
17 | 037e 3b
18 | 0384 038a b4
19 | 038c bc
20 | 038e 03a1 be
21 | 03a3 03ce d3
22 | 03d0 e2
23 | 03d1 e8
24 | 03d2 d5
25 | 03d3 be
26 | 03d4 db
27 | 03d5 f6
28 | 03d6 f0
29 | 03d7 eae1e9
30 | 03da d3d4
31 | 03db f3f4
32 | 03f0 ea
33 | 03f1 f1
34 | 03f2 63
35 | 03f3 6a
36 | 03f4 c8
37 | 03f5 e5
38 | 2013 ad
39 | 2014 af
40 | 2018 60
41 | 2019 a2
42 | 201a 2c
43 | 201b a1
44 | 201c 22
45 | 201d 22
46 | 201e 2c2c
47 | 2022 b7
48 | 2026 2e2e2e
49 | 2039 3c
50 | 203a 3e
51 | 2044 2f
52 | 20ac c5f5f1fe
53 | 20af c4f1f7
54 | 2122 544d
55 | 2126 d9
56 | 2206 c4
57 | 2212 2d
58 | 2219 b7
59 | fb00 6666
60 | fb01 6669
61 | fb02 666c
62 | fb03 666669
63 | fb04 66666c
64 |
--------------------------------------------------------------------------------
/tool-runners/xpdffonts/xpdf/hebrew/ISO-8859-8.unicodeMap:
--------------------------------------------------------------------------------
1 | 000a 000a 0a
2 | 000c 000d 0c
3 | 0020 007e 20
4 | 00a0 00a0 20
5 | 00a2 00a9 a2
6 | 00ab 00b9 ab
7 | 00bb 00be bb
8 | 010c 43
9 | 010d 63
10 | 0131 69
11 | 0141 4c
12 | 0142 6c
13 | 0152 4f45
14 | 0153 6f65
15 | 0160 53
16 | 0161 73
17 | 0178 59
18 | 017d 5a
19 | 017e 7a
20 | 02c6 5e
21 | 02da b0
22 | 02dc 7e
23 | 05d0 05ea e0
24 | 05f0 e5e5
25 | 05f1 e5e9
26 | 05f2 e9e9
27 | 2013 ad
28 | 2014 2d2d
29 | 2018 60
30 | 2019 27
31 | 201a 2c
32 | 201c 22
33 | 201d 22
34 | 201e 2c2c
35 | 2022 b7
36 | 2026 2e2e2e
37 | 2039 3c
38 | 203a 3e
39 | 2044 2f
40 | 2122 544d
41 | 2212 2d
42 | f6f9 4c
43 | f6fa 4f45
44 | f6fc b0
45 | f6fd 53
46 | f6fe 7e
47 | f6ff 5a
48 | f721 21
49 | f724 24
50 | f726 26
51 | f730 f739 30
52 | f73f 3f
53 | f761 f77a 41
54 | f7a1 f7a2 a1
55 | f7bf bf
56 | f7e0 f7f6 c0
57 | f7f8 f7fe d8
58 | f7ff 59
59 | fb00 6666
60 | fb01 6669
61 | fb02 666c
62 | fb03 666669
63 | fb04 66666c
64 |
--------------------------------------------------------------------------------
/tool-runners/xpdffonts/xpdf/hebrew/Windows-1255.unicodeMap:
--------------------------------------------------------------------------------
1 | 000a 000a 0a
2 | 000c 000d 0c
3 | 0020 007e 20
4 | 00a0 00a3 a0
5 | 00a5 00a9 a5
6 | 00ab 00b9 ab
7 | 00bb 00bf bb
8 | 00d7 aa
9 | 00f7 ba
10 | 010c 43
11 | 010d 63
12 | 0131 69
13 | 0141 4c
14 | 0142 6c
15 | 0152 4f45
16 | 0153 6f65
17 | 0160 53
18 | 0161 73
19 | 0178 59
20 | 017d 5a
21 | 017e 7a
22 | 0192 83
23 | 02c6 88
24 | 02da b0
25 | 02dc 98
26 | 05b0 05b9 c0
27 | 05bb 05c3 cb
28 | 05f0 05f4 d4
29 | 05d0 05ea e0
30 | 200e 200f fd
31 | 2013 2014 96
32 | 2018 2019 91
33 | 201a 82
34 | 201c 201d 93
35 | 201e 84
36 | 2020 86
37 | 2021 87
38 | 2022 95
39 | 2026 85
40 | 2030 89
41 | 2039 8b
42 | 203a 9b
43 | 2044 2f
44 | 20aa a4
45 | 20ac 80
46 | 2122 99
47 | 2212 2d
48 | f6f9 4c
49 | f6fa 4f45
50 | f6fc b0
51 | f6fd 53
52 | f6fe 7e
53 | f6ff 5a
54 | f721 21
55 | f724 24
56 | f726 26
57 | f730 f739 30
58 | f73f 3f
59 | f761 f77a 41
60 | f7a1 f7a2 a1
61 | f7bf bf
62 | fb00 6666
63 | fb01 6669
64 | fb02 666c
65 | fb03 666669
66 | fb04 66666c
67 |
--------------------------------------------------------------------------------
/tool-runners/xpdffonts/xpdf/japanese/CMap/90pv-RKSJ-UCS2:
--------------------------------------------------------------------------------
1 | %!PS-Adobe-3.0 Resource-CMap
%%DocumentNeededResources: ProcSet (CIDInit)
%%DocumentNeededResources: CMap (90pv-RKSJ-UCS2C)
%%IncludeResource: ProcSet (CIDInit)
%%IncludeResource: CMap (90pv-RKSJ-UCS2C)
%%BeginResource: CMap (90pv-RKSJ-UCS2)
%%Title: (90pv-RKSJ-UCS2)
%%Version: 4.002
%%Copyright: -----------------------------------------------------------
%%Copyright: Copyright 1990-1997 Adobe Systems Incorporated.
%%Copyright: All Rights Reserved.
%%Copyright:
%%Copyright: Patents Pending
%%Copyright:
%%Copyright: NOTICE: All information contained herein is the property
%%Copyright: of Adobe Systems Incorporated.
%%Copyright:
%%Copyright: Permission is granted for redistribution of this file
%%Copyright: provided this copyright notice is maintained intact and
%%Copyright: that the contents of this file are not altered in any
%%Copyright: way from its original form.
%%Copyright:
%%Copyright: PostScript and Display PostScript are trademarks of
%%Copyright: Adobe Systems Incorporated which may be registered in
%%Copyright: certain jurisdictions.
%%Copyright: -----------------------------------------------------------
%%EndComments
/CIDInit /ProcSet findresource begin
12 dict begin
begincmap
/90pv-RKSJ-UCS2C usecmap
/CIDSystemInfo 3 dict dup begin
/Registry (Adobe) def
/Ordering (90pv_RKSJ_UCS2) def
/Supplement 2 def
end def
/CMapName /90pv-RKSJ-UCS2 def
/CMapVersion 4.002 def
/CMapType 1 def
/WMode 0 def
18 beginbfrange
<8591> <8591>
<85ab> <85ab>
<85ac> <85ac>
<85ad> <85ad>
<85bf> <85bf>
<85c0> <85c0>
<85c1> <85c1>
<865d> <865d>
<869e> <869e>
<86d4> <86d4> <21e6f87a>
<86d5> <86d5> <21e7f87a>
<86d6> <86d6> <21e9f87a>
<86ce> <86ce>
<8791> <8791> <592720dd>
<8792> <8792> <5c0f20dd>
<879d> <879d> <63a720dd>
<87fb> <87fb>
<87fc> <87fc>
endbfrange
endcmap
CMapName currentdict /CMap defineresource pop
end
end
%%EndResource
%%EOF
--------------------------------------------------------------------------------
/tool-runners/xpdffonts/xpdf/japanese/CMap/LICENSE.md:
--------------------------------------------------------------------------------
1 | Copyright 1990-2019 Adobe. All rights reserved.
2 |
3 | Redistribution and use in source and binary forms, with or without
4 | modification, are permitted provided that the following conditions are
5 | met:
6 |
7 | Redistributions of source code must retain the above copyright notice,
8 | this list of conditions and the following disclaimer.
9 |
10 | Redistributions in binary form must reproduce the above copyright
11 | notice, this list of conditions and the following disclaimer in the
12 | documentation and/or other materials provided with the distribution.
13 |
14 | Neither the name of Adobe nor the names of its contributors may be
15 | used to endorse or promote products derived from this software without
16 | specific prior written permission.
17 |
18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 |
--------------------------------------------------------------------------------
/tool-runners/xpdffonts/xpdf/korean/CMap/LICENSE.md:
--------------------------------------------------------------------------------
1 | Copyright 1990-2019 Adobe. All rights reserved.
2 |
3 | Redistribution and use in source and binary forms, with or without
4 | modification, are permitted provided that the following conditions are
5 | met:
6 |
7 | Redistributions of source code must retain the above copyright notice,
8 | this list of conditions and the following disclaimer.
9 |
10 | Redistributions in binary form must reproduce the above copyright
11 | notice, this list of conditions and the following disclaimer in the
12 | documentation and/or other materials provided with the distribution.
13 |
14 | Neither the name of Adobe nor the names of its contributors may be
15 | used to endorse or promote products derived from this software without
16 | specific prior written permission.
17 |
18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 |
--------------------------------------------------------------------------------
/tool-runners/xpdffonts/xpdf/latin2/Latin2.unicodeMap:
--------------------------------------------------------------------------------
1 | 000a 000a 0a
2 | 000c 000d 0c
3 | 0020 007e 20
4 | 00a0 00a0 20
5 | 00a4 a4
6 | 00a7 00a8 a7
7 | 00ad ad
8 | 00b0 b0
9 | 00b4 b4
10 | 00b8 b8
11 | 00c1 00c2 c1
12 | 00c4 c4
13 | 00c7 c7
14 | 00c9 c9
15 | 00cb cb
16 | 00cd 00ce cd
17 | 00d3 00d4 d3
18 | 00d6 00d7 d6
19 | 00da da
20 | 00dc 00dd dc
21 | 00df df
22 | 00e1 00e2 e1
23 | 00e4 e4
24 | 00e7 e7
25 | 00e9 e9
26 | 00eb eb
27 | 00ed 00ee ed
28 | 00f3 00f4 f3
29 | 00f6 00f7 f6
30 | 00fa fa
31 | 00fc 00fd fc
32 | 0102 c3
33 | 0103 e3
34 | 0104 a1
35 | 0105 b1
36 | 0106 c6
37 | 0107 e6
38 | 010c c8
39 | 010d e8
40 | 010e cf
41 | 010f ef
42 | 0110 d0
43 | 0111 f0
44 | 0118 ca
45 | 0119 ea
46 | 011a cc
47 | 011b ec
48 | 0131 69
49 | 0139 c5
50 | 013a e5
51 | 013d a5
52 | 013e b5
53 | 0141 a3
54 | 0142 b3
55 | 0143 d1
56 | 0144 f1
57 | 0147 d2
58 | 0148 f2
59 | 0150 d5
60 | 0151 f5
61 | 0152 4f45
62 | 0153 6f65
63 | 0154 c0
64 | 0155 e0
65 | 0158 d8
66 | 0159 f8
67 | 015a a6
68 | 015b b6
69 | 015e aa
70 | 015f ba
71 | 0160 a9
72 | 0161 b9
73 | 0162 de
74 | 0163 fe
75 | 0164 ab
76 | 0165 bb
77 | 016e d9
78 | 016f f9
79 | 0170 db
80 | 0171 fb
81 | 0178 59
82 | 0179 ac
83 | 017a bc
84 | 017b af
85 | 017c bf
86 | 017d ae
87 | 017e be
88 | 02c6 5e
89 | 02c7 b7
90 | 02d8 a2
91 | 02d9 ff
92 | 02da b0
93 | 02db b2
94 | 02dc 7e
95 | 02dd bd
96 | 2013 2013 ad
97 | 2014 2014 2d2d
98 | 2018 2018 60
99 | 2019 2019 27
100 | 201a 201a 2c
101 | 201c 201c 22
102 | 201d 201d 22
103 | 201e 201e 2c2c
104 | 2022 2022 b7
105 | 2026 2026 2e2e2e
106 | 2039 2039 3c
107 | 203a 203a 3e
108 | 2044 2044 2f
109 | 2122 2122 544d
110 | 2212 2212 2d
111 | f6f9 f6f9 4c
112 | f6fa f6fa 4f45
113 | f6fc f6fc b0
114 | f6fd f6fd 53
115 | f6fe f6fe 7e
116 | f6ff f6ff 5a
117 | f721 f721 21
118 | f724 f724 24
119 | f726 f726 26
120 | f730 f739 30
121 | f73f f73f 3f
122 | f761 f77a 41
123 | f7a1 f7a2 a1
124 | f7bf f7bf bf
125 | f7e0 f7f6 c0
126 | f7f8 f7fe d8
127 | f7ff f7ff 59
128 | fb00 fb00 6666
129 | fb01 fb01 6669
130 | fb02 fb02 666c
131 | fb03 fb03 666669
132 | fb04 fb04 66666c
133 |
--------------------------------------------------------------------------------
/tool-runners/xpdffonts/xpdf/thai/TIS-620.unicodeMap:
--------------------------------------------------------------------------------
1 | 000a 0a
2 | 000c 000d 0c
3 | 0020 007e 20
4 | 00a0 20
5 | 0131 69
6 | 0141 4c
7 | 0142 6c
8 | 0152 4f45
9 | 0153 6f65
10 | 0160 53
11 | 0161 73
12 | 0178 59
13 | 017d 5a
14 | 017e 7a
15 | 02c6 5e
16 | 02dc 7e
17 | 0e01 0e3a a1
18 | 0e3f 0e5b df
19 | 2013 2d2d
20 | 2014 2d2d
21 | 2018 60
22 | 2019 27
23 | 201a 2c
24 | 201c 22
25 | 201d 22
26 | 201e 2c2c
27 | 2022 2a
28 | 2026 2e2e2e
29 | 2039 3c
30 | 203a 3e
31 | 2044 2f
32 | 2122 544d
33 | 2212 2d
34 | f700 b0
35 | f701 f704 d4
36 | f705 f709 e8
37 | f70a f70e e8
38 | f70f ad
39 | f710 d1
40 | f711 ed
41 | f712 f717 e7
42 | f718 f71a d8
43 | fb00 6666
44 | fb01 6669
45 | fb02 666c
46 | fb03 666669
47 | fb04 66666c
48 |
--------------------------------------------------------------------------------
/tool-runners/xpdffonts/xpdf/turkish/ISO-8859-9.unicodeMap:
--------------------------------------------------------------------------------
1 | 000a 0a
2 | 000c 000d 0c
3 | 0020 007e 20
4 | 00a0 20
5 | 00a1 00ac a1
6 | 00ae 00cf ae
7 | 00d1 00dc d1
8 | 00df 00ef df
9 | 00f1 00fc f1
10 | 00ff ff
11 | 010c 43
12 | 010d 63
13 | 011e d0
14 | 011f f0
15 | 0130 dd
16 | 0131 fd
17 | 0141 4c
18 | 0142 6c
19 | 0152 4f45
20 | 0153 6f65
21 | 015e de
22 | 015f fe
23 | 0160 53
24 | 0161 73
25 | 0178 59
26 | 017d 5a
27 | 017e 7a
28 | 02c6 5e
29 | 02da b0
30 | 02dc 7e
31 | 2013 ad
32 | 2014 2d2d
33 | 2018 60
34 | 2019 27
35 | 201a 2c
36 | 201c 22
37 | 201d 22
38 | 201e 2c2c
39 | 2022 b7
40 | 2026 2e2e2e
41 | 2039 3c
42 | 203a 3e
43 | 2044 2f
44 | 2122 544d
45 | 2212 2d
46 | f6f9 4c
47 | f6fa 4f45
48 | f6fc b0
49 | f6fd 53
50 | f6fe 7e
51 | f6ff 5a
52 | f721 21
53 | f724 24
54 | f726 26
55 | f730 f739 30
56 | f73f 3f
57 | f761 f77a 41
58 | f7a1 f7a2 a1
59 | f7bf bf
60 | f7e0 f7f6 c0
61 | f7f8 f7fe d8
62 | f7ff 59
63 | fb00 6666
64 | fb01 6669
65 | fb02 666c
66 | fb03 666669
67 | fb04 66666c
68 |
--------------------------------------------------------------------------------
/utils-general/src/main/java/org/tallison/db/ExtractsToDB.java:
--------------------------------------------------------------------------------
1 | package org.tallison.db;
2 |
3 | import java.io.IOException;
4 | import java.nio.file.Path;
5 | import java.nio.file.Paths;
6 |
7 | import org.apache.tika.exception.TikaConfigException;
8 | import org.apache.tika.pipes.pipesiterator.PipesIterator;
9 |
10 | public class ExtractsToDB {
11 |
12 | public static void main(String[] args) throws Exception {
13 | Path tikaConfigFile = Paths.get(args[0]);
14 |
15 | PipesIterator it = PipesIterator.build(tikaConfigFile);
16 |
17 | }
18 | }
19 |
--------------------------------------------------------------------------------
/utils-general/src/main/java/org/tallison/db/FetchFilesFromDBPaths.java:
--------------------------------------------------------------------------------
1 | package org.tallison.db;
2 |
3 | import java.io.InputStream;
4 | import java.nio.file.Files;
5 | import java.nio.file.Path;
6 | import java.nio.file.Paths;
7 | import java.nio.file.StandardCopyOption;
8 | import java.util.regex.Matcher;
9 | import java.util.regex.Pattern;
10 |
11 | import org.apache.tika.metadata.Metadata;
12 | import org.apache.tika.pipes.FetchEmitTuple;
13 | import org.apache.tika.pipes.fetcher.Fetcher;
14 | import org.apache.tika.pipes.fetcher.FetcherManager;
15 | import org.apache.tika.pipes.pipesiterator.PipesIterator;
16 |
17 | public class FetchFilesFromDBPaths {
18 |
19 | public static void main(String[] args) throws Exception {
20 | Path tikaConfigFile = Paths.get("/Users/allison/Desktop/tika-config.xml");
21 | PipesIterator pipesIterator = PipesIterator.build(tikaConfigFile);
22 | Fetcher fetcher = FetcherManager.load(tikaConfigFile).getFetcher("s3f");
23 | Path outputRoot = Paths.get("/Users/allison/Desktop/clam-pdfs");
24 |
25 | for (FetchEmitTuple t : pipesIterator) {
26 | String clamav = t.getMetadata().get("clamav_detect");
27 | Matcher m = Pattern.compile("([0-9a-f]{10,})").matcher(t.getFetchKey().getFetchKey());
28 | String sha256 = "";
29 | if (m.find()) {
30 | sha256 = m.group(1);
31 | }
32 | Path targ = outputRoot.resolve(clamav).resolve(sha256);
33 | if (Files.isRegularFile(targ)) {
34 | continue;
35 | }
36 | Files.createDirectories(targ.getParent());
37 | try (InputStream is = fetcher.fetch(t.getFetchKey().getFetchKey(), new Metadata())) {
38 | Files.copy(is, targ, StandardCopyOption.REPLACE_EXISTING);
39 | }
40 | }
41 | }
42 | }
43 |
--------------------------------------------------------------------------------
/utils-general/src/main/java/org/tallison/digest/CSVLineCounter.java:
--------------------------------------------------------------------------------
1 | package org.tallison.digest;
2 |
3 | import java.nio.charset.StandardCharsets;
4 | import java.nio.file.Path;
5 | import java.nio.file.Paths;
6 |
7 | import org.apache.commons.csv.CSVFormat;
8 | import org.apache.commons.csv.CSVParser;
9 | import org.apache.commons.csv.CSVRecord;
10 |
11 | public class CSVLineCounter {
12 |
13 | public static void main(String[] args) throws Exception {
14 | Path path = Paths.get("/Users/allison/Desktop/size-pages-full.csv");
15 | int c = 0;
16 | for (CSVRecord r : CSVParser.parse(path, StandardCharsets.UTF_8, CSVFormat.EXCEL)) {
17 | c++;
18 | }
19 | System.out.println(c);
20 |
21 | }
22 | }
23 |
--------------------------------------------------------------------------------
/utils-general/src/main/java/org/tallison/digest/DigestChecker.java:
--------------------------------------------------------------------------------
1 | package org.tallison.digest;
2 |
3 | import java.io.BufferedWriter;
4 | import java.io.File;
5 | import java.io.IOException;
6 | import java.io.InputStream;
7 | import java.nio.charset.StandardCharsets;
8 | import java.nio.file.Files;
9 | import java.nio.file.Path;
10 | import java.nio.file.Paths;
11 | import java.util.concurrent.atomic.AtomicInteger;
12 |
13 | import org.apache.commons.codec.digest.DigestUtils;
14 |
15 | public class DigestChecker {
16 |
17 | AtomicInteger totalChecked = new AtomicInteger(0);
18 | public static void main(String[] args) throws Exception {
19 | Path dir = Paths.get(args[0]);
20 | try (BufferedWriter writer =
21 | Files.newBufferedWriter(Paths.get(args[1]), StandardCharsets.UTF_8)) {
22 | DigestChecker digestChecker = new DigestChecker();
23 | digestChecker.execute(dir, writer);
24 | }
25 | }
26 |
27 | private void execute(Path rootDir, BufferedWriter writer) {
28 | processDir(rootDir, writer);
29 | System.err.println("completed successfully");
30 | }
31 |
32 | private void processDir(Path path, BufferedWriter writer) {
33 | for (File f : path.toFile().listFiles()) {
34 | if (f.isFile()) {
35 | processFile(f, writer);
36 | } else {
37 | processDir(f.toPath(), writer);
38 | }
39 | }
40 | }
41 |
42 | private void processFile(File f, BufferedWriter writer) {
43 | String name = f.getName();
44 | String digest = null;
45 | try (InputStream is = Files.newInputStream(f.toPath())) {
46 | digest = DigestUtils.sha256Hex(is);
47 | } catch (IOException e) {
48 | e.printStackTrace();
49 | }
50 | if (! name.equals(digest)) {
51 | try {
52 | writer.write(name + "\t" + digest + "\n");
53 | } catch (IOException e) {
54 | e.printStackTrace();
55 | }
56 | }
57 | int checked = totalChecked.incrementAndGet();
58 | if (checked % 1000 == 0) {
59 | System.err.println(checked + " files processed");
60 | }
61 | }
62 | }
63 |
--------------------------------------------------------------------------------
/utils-general/src/main/java/org/tallison/digest/FileListNormalizer.java:
--------------------------------------------------------------------------------
1 | package org.tallison.digest;
2 |
3 | import java.io.BufferedReader;
4 | import java.io.BufferedWriter;
5 | import java.io.File;
6 | import java.nio.charset.StandardCharsets;
7 | import java.nio.file.Files;
8 | import java.nio.file.Path;
9 | import java.nio.file.Paths;
10 | import java.util.regex.Matcher;
11 | import java.util.regex.Pattern;
12 |
/**
 * For every list file in a directory, extracts the first
 * {@code xx/yy/hex...} path found on each line and writes it to a sibling file
 * whose name has {@code .txt} replaced by {@code -normed.txt}.
 *
 * <p>Lines without such a path are reported on stderr and left out of the output.
 */
public class FileListNormalizer {

    /** Two hex chars, slash, two hex chars, slash, one or more hex chars. */
    private static final Pattern DIGEST_PATH =
            Pattern.compile("([a-f0-9]{2,2}/[a-f0-9]{2,2}/[a-f0-9]+)");

    /**
     * @param args optional: {@code args[0]} is the directory holding the list files;
     *             falls back to the historical placeholder {@code "PATH"} when absent
     * @throws IllegalArgumentException if the directory cannot be listed
     * @throws Exception if a list file cannot be read or its output cannot be written
     */
    public static void main(String[] args) throws Exception {
        Path dir = Paths.get(args.length > 0 ? args[0] : "PATH");

        // listFiles() returns null when dir is not a directory or cannot be read.
        File[] files = dir.toFile().listFiles();
        if (files == null) {
            throw new IllegalArgumentException("Not a readable directory: " + dir);
        }

        for (File f : files) {
            String name = f.getName();
            if (!f.isFile() || name.endsWith("-normed.txt")) {
                continue;
            }
            String normedName = name.replace(".txt", "-normed.txt");
            // If the name has no ".txt" the output would be the input itself, and
            // opening the writer would truncate the file before it is read.
            if (normedName.equals(name)) {
                System.err.println("skipping file without .txt in its name: " + f);
                continue;
            }
            normalize(f.toPath(), dir.resolve(normedName));
        }
    }

    /** Writes the first digest path of each line of {@code input} to {@code output}. */
    private static void normalize(Path input, Path output) throws java.io.IOException {
        try (BufferedWriter w = Files.newBufferedWriter(output, StandardCharsets.UTF_8);
             BufferedReader r = Files.newBufferedReader(input, StandardCharsets.UTF_8)) {
            Matcher m = DIGEST_PATH.matcher("");
            String line;
            while ((line = r.readLine()) != null) {
                m.reset(line);
                if (m.find()) {
                    System.out.println(m.group(1));
                    w.write(m.group(1) + "\n");
                } else {
                    System.err.println("wtf: " + line);
                }
            }
        }
    }
}
42 |
--------------------------------------------------------------------------------
/utils-general/src/main/java/org/tallison/digest/S3ListCompare.java:
--------------------------------------------------------------------------------
1 | package org.tallison.digest;
2 |
3 | import java.io.BufferedReader;
4 | import java.io.Writer;
5 | import java.nio.file.Files;
6 | import java.nio.file.Path;
7 | import java.nio.file.Paths;
8 | import java.util.HashSet;
9 | import java.util.Set;
10 |
11 | import com.amazonaws.auth.AWSCredentialsProvider;
12 | import com.amazonaws.auth.profile.ProfileCredentialsProvider;
13 | import com.amazonaws.services.s3.AmazonS3;
14 | import com.amazonaws.services.s3.AmazonS3ClientBuilder;
15 | import com.amazonaws.services.s3.iterable.S3Objects;
16 | import com.amazonaws.services.s3.model.S3ObjectSummary;
17 |
/**
 * Compares two key lists and reports every key of the first list that is
 * missing from the second.
 *
 * <p>The key of a line is its first whitespace-delimited token.
 */
public class S3ListCompare {

    /**
     * Prints the size of both key sets, one line per missing key, and the
     * total number of missing keys.
     *
     * @param args optional: {@code args[0]} the list to check, {@code args[1]} the s3
     *             listing; each falls back to the historical location relative to the
     *             working directory when absent
     * @throws Exception if either list cannot be read
     */
    public static void main(String[] args) throws Exception {
        Path pwd = Paths.get("");
        Path oneMillion = args.length > 0 ? Paths.get(args[0]) : pwd.resolve("");
        Path s3 = args.length > 1 ? Paths.get(args[1]) : pwd.resolve("s3-files.txt");

        // Typed sets: with raw Set the enhanced for loop over String does not compile.
        Set<String> eval = load(oneMillion);
        Set<String> s3list = load(s3);
        System.out.println(eval.size());
        System.out.println(s3list.size());

        int missing = 0;
        for (String k : eval) {
            if (!s3list.contains(k)) {
                System.out.println("file missing in s3: " + k);
                missing++;
            }
        }

        System.out.println("missing: " + missing);
    }

    /**
     * Reads the first token of every non-blank line of {@code p} into a set.
     *
     * <p>The previous version also called {@code replaceFirst("", "")} twice on
     * each key; with an empty pattern and replacement those calls changed
     * nothing, so they are omitted.
     */
    private static Set<String> load(Path p) throws Exception {
        Set<String> set = new HashSet<>();
        try (BufferedReader r = Files.newBufferedReader(p)) {
            String line;
            while ((line = r.readLine()) != null) {
                // Trim first: a leading blank would otherwise make the first token "".
                String trimmed = line.trim();
                if (trimmed.isEmpty()) {
                    continue;
                }
                set.add(trimmed.split("\\s+")[0]);
            }
        }
        return set;
    }
}
55 |
--------------------------------------------------------------------------------
/utils-general/src/main/java/org/tallison/filter/CopyByMime.java:
--------------------------------------------------------------------------------
1 | package org.tallison.filter;
2 |
3 | import java.io.File;
4 | import java.io.IOException;
5 | import java.nio.file.Files;
6 | import java.nio.file.Path;
7 | import java.nio.file.Paths;
8 |
9 | import org.apache.tika.Tika;
10 |
11 | public class CopyByMime {
12 |
13 | public static void main(String[] args) {
14 | Path src = Paths.get(args[0]);
15 | Path target = Paths.get(args[1]);
16 | String mimePart = "nitf";
17 | Tika tika = new Tika();
18 | processDirectory(mimePart, src, src, target, tika);
19 |
20 | }
21 |
22 | private static void processDirectory(String mimePart, Path root, Path path, Path targetRoot,
23 | Tika tika) {
24 | for (File f : path.toFile().listFiles()) {
25 | if (f.isDirectory()) {
26 | processDirectory(mimePart, root, f.toPath(), targetRoot, tika);
27 | } else {
28 | processFile(mimePart, root, f.toPath(), targetRoot, tika);
29 | }
30 | }
31 | }
32 |
33 | private static void processFile(String mimePart, Path root, Path path, Path targetRoot,
34 | Tika tika) {
35 |
36 | try {
37 | String type = tika.detect(path);
38 | if (type.contains(mimePart)) {
39 | Path rel = root.relativize(path);
40 | Path target= targetRoot.resolve(rel);
41 | System.out.println(type + " : " + path);
42 | System.out.println(path + "-> " + target);
43 | if (!Files.isDirectory(target.getParent())) {
44 | Files.createDirectories(target.getParent());
45 | }
46 | Files.copy(path, target);
47 | }
48 | } catch (IOException e) {
49 | e.printStackTrace();
50 | }
51 |
52 | }
53 |
54 | }
55 |
--------------------------------------------------------------------------------
/utils-general/src/test/java/org/tallison/pdf/utils/TestPDFSplitter.java:
--------------------------------------------------------------------------------
1 | package org.tallison.pdf.utils;
2 |
3 |
4 | import java.nio.file.Path;
5 | import java.nio.file.Paths;
6 |
7 | import org.junit.Ignore;
8 | import org.junit.Test;
9 |
10 |
11 | public class TestPDFSplitter {
12 |
13 | @Test
14 | @Ignore
15 | public void testSimple() throws Exception {
16 |
17 | PDFSplitter.main(new String[]{
18 | "/docs",
19 | "/single-pages",
20 | "10"});
21 | }
22 | }
23 |
--------------------------------------------------------------------------------