├── .gitignore ├── LICENSE.txt ├── README.md ├── batchlite ├── pom.xml └── src │ └── main │ ├── java │ └── org │ │ └── tallison │ │ └── batchlite │ │ ├── AbstractDirectoryProcessor.java │ │ ├── AbstractFileProcessor.java │ │ ├── CommandlineFileProcessor.java │ │ ├── CommandlineFileToFileProcessor.java │ │ ├── CommandlineStdoutToFileProcessor.java │ │ ├── ConfigSrc.java │ │ ├── FileProcessResult.java │ │ ├── FileProcessor.java │ │ ├── FileToFileProcessor.java │ │ ├── MetadataWriter.java │ │ ├── ProcessExecutor.java │ │ ├── StreamEater.java │ │ ├── example │ │ ├── FileCommandExample.java │ │ ├── PDFChecker.java │ │ └── PDFStdoutChecker.java │ │ └── writer │ │ ├── CSVMetadataWriter.java │ │ ├── JDBCMetadataWriter.java │ │ ├── JSONMetadataWriter.java │ │ ├── MetadataWriterFactory.java │ │ ├── PathResultPair.java │ │ └── WriterResult.java │ └── resources │ └── log4j2.xml ├── commoncrawl-fetcher ├── pom.xml └── src │ ├── README.txt │ ├── main │ ├── java │ │ └── org │ │ │ └── tallison │ │ │ ├── cc │ │ │ ├── CCFileFetcher.java │ │ │ ├── CCIndexReaderCounter.java │ │ │ ├── CCIndexWGetter.java │ │ │ ├── IndexGrep.java │ │ │ ├── Refetcher.java │ │ │ ├── S3IndexGetter.java │ │ │ ├── fetcherlite │ │ │ │ ├── CCFileFetcherLiteCLI.java │ │ │ │ ├── FetchLiteRecordProcessor.java │ │ │ │ ├── FetcherLiteConfig.java │ │ │ │ └── FileFromCCWarcFetcher.java │ │ │ ├── index │ │ │ │ ├── AbstractRecordProcessor.java │ │ │ │ ├── CCIndexRecord.java │ │ │ │ ├── CCIndexWGetter.java │ │ │ │ ├── CompositeRecordFilter.java │ │ │ │ ├── IndexFileChecker.java │ │ │ │ ├── IndexRecordProcessor.java │ │ │ │ ├── LatLongAdder.java │ │ │ │ ├── MimeCounter.java │ │ │ │ ├── RecordFilter.java │ │ │ │ └── db │ │ │ │ │ ├── DBIndexer.java │ │ │ │ │ └── DBIndexerCLI.java │ │ │ └── pipes │ │ │ │ └── CCIndexPipesIterator.java │ │ │ └── util │ │ │ ├── DBUtil.java │ │ │ ├── HTTPFetchWrapper.java │ │ │ ├── HostUpsert.java │ │ │ ├── MapUtil.java │ │ │ └── ReloadFetchStatusTable.java │ └── resources │ │ ├── log4j2.xml │ 
│ ├── selectFetchAndFetchStatus.sql │ │ ├── selectFilesToFetchFromCC.sql │ │ ├── selectFilesToFetchPerWarcId.sql │ │ ├── selectIndexedAndFetchedData.sql │ │ ├── selectIndexedData.sql │ │ └── selectWarcFileIdsToFetchFromCC.sql │ └── test │ ├── java │ ├── CCIndexRecordTest.java │ ├── CompositeRecordFilterTest.java │ └── FetcherTest.java │ └── resources │ ├── examples │ ├── mpeg-filters.json │ ├── tika-config-fetch-fs.xml │ ├── tika-config-index-fs.xml │ ├── tika-config-index-s3.xml │ └── tika-config-refetch-fs.xml │ └── test-documents │ ├── mime-filters-av.json │ ├── mime-filters.json │ ├── pdf-filter-sample.json │ ├── pdf-filter.json │ ├── status-filter.json │ └── status-sample-filter.json ├── ingest-jdbc ├── pom.xml └── src │ └── main │ └── java │ └── org │ └── tallison │ └── ingest │ └── arlington │ └── ArlingtonIngest.java ├── ingest ├── pom.xml └── src │ ├── main │ ├── java │ │ └── org │ │ │ └── tallison │ │ │ └── ingest │ │ │ ├── CompositeFeatureMapper.java │ │ │ ├── FeatureMapper.java │ │ │ ├── IngesterCLI.java │ │ │ ├── IngesterToCSVCLI.java │ │ │ ├── mappers │ │ │ ├── ArlingtonMapper.java │ │ │ ├── CPUMapper.java │ │ │ ├── CaradocMapper.java │ │ │ ├── ClamAVMapper.java │ │ │ ├── ESUtil.java │ │ │ ├── MultiCompareMapper.java │ │ │ ├── MutoolMapper.java │ │ │ ├── PDFBytesMapper.java │ │ │ ├── PDFCheckerMapper.java │ │ │ ├── PDFFontsMapper.java │ │ │ ├── PDFInfoFeatureMapper.java │ │ │ ├── PDFMinerMapper.java │ │ │ ├── PDFResurrectMapper.java │ │ │ ├── ProfileFeatureMapper.java │ │ │ ├── QPDFFeatureMapper.java │ │ │ ├── StatusFeatureMapper.java │ │ │ ├── TikaFeatureMapper.java │ │ │ ├── UniverseMapper.java │ │ │ └── XPDFFontsMapper.java │ │ │ ├── qpdf │ │ │ ├── QPDFJsonExtractor.java │ │ │ └── QPDFResults.java │ │ │ ├── qpdf10 │ │ │ └── qpdf │ │ │ │ ├── QPDFJsonExtractor.java │ │ │ │ └── QPDFResults.java │ │ │ └── utils │ │ │ ├── CSVsToPostgres.java │ │ │ ├── ESToCSV.java │ │ │ └── FindMissing.java │ └── resources │ │ ├── META-INF │ │ └── services │ │ │ └── 
org.tallison.ingest.FeatureMapper │ │ ├── common-keys.txt │ │ ├── important-int-keys.txt │ │ ├── log4j.properties │ │ ├── observatory-mappings.json │ │ ├── selectStar-dev.sql │ │ ├── selectStar-lite.sql │ │ ├── selectStar-minimal.sql │ │ ├── selectStar-sample.sql │ │ └── selectStar.sql │ └── test │ ├── java │ └── org │ │ └── tallison │ │ └── ingest │ │ └── mappers │ │ ├── ArlingtonMapperTest.java │ │ ├── MapperTest.java │ │ ├── PDFCheckerMapperTest.java │ │ ├── PDFFontsMapperTest.java │ │ ├── PDFInfoMapperTest.java │ │ ├── QPDF10JsonExtractorTest.java │ │ ├── QPDFJsonExtractorTest.java │ │ └── XPDFFontsMapperTest.java │ └── resources │ └── test-documents │ ├── GHOSTSCRIPT-687771-0.pdf.json │ ├── GHOSTSCRIPT-690371-0.pdf.json │ ├── GHOSTSCRIPT-702993-0.pdf.json │ ├── arlington │ ├── GHOSTSCRIPT-687499-0.pdf.txt │ ├── GHOSTSCRIPT-687647-0.pdf.txt │ └── GHOSTSCRIPT-688076-1.pdf.txt │ ├── pdfchecker │ ├── GHOSTSCRIPT-696838-0.zip-0.pdf.json │ └── fonts-PDFBOX-1002-2.pdf.json │ ├── pdffonts │ └── test-basic.txt │ ├── qpdfv11 │ └── qpdf.json │ ├── simple.json │ ├── types.json │ └── xpdffonts │ └── test-basic.txt ├── pom.xml ├── simple-ingester ├── pom.xml └── src │ └── main │ └── java │ └── org │ └── tallison │ ├── ingester │ └── IngesterCLI.java │ └── tika │ └── parser │ ├── ConcatenatingParser.java │ └── TikaServerClient.java ├── tika-addons ├── pom.xml ├── tika-eval-multicomparer │ ├── pom.xml │ └── src │ │ └── main │ │ └── java │ │ └── org │ │ └── tallison │ │ └── tika │ │ └── eval │ │ └── multi │ │ ├── ListGenerator.java │ │ ├── MultiCompareWorker.java │ │ └── MultiComparerCLI.java ├── tika-pipes-reporter │ ├── pom.xml │ └── src │ │ └── main │ │ └── java │ │ └── org │ │ └── tallison │ │ └── tika │ │ └── pipes │ │ └── TikaPipesReporter.java └── tika-server-fuzzer │ ├── pom.xml │ └── src │ └── main │ └── java │ └── FuzzClient.java ├── tika-containers ├── pom.xml ├── tika-arlington │ ├── Dockerfile │ ├── my-tika-config.xml │ ├── pom.xml │ └── src │ │ └── main │ │ └── 
java │ │ └── org │ │ └── tallison │ │ └── observatory │ │ └── RegexCaptureParser.java ├── tika-exiftool │ ├── Dockerfile │ ├── my-tika-config.xml │ └── pom.xml ├── tika-pdfchecker │ ├── Dockerfile │ ├── my-tika-config.xml │ ├── pom.xml │ ├── src │ │ ├── main │ │ │ ├── java │ │ │ │ └── org │ │ │ │ │ └── tallison │ │ │ │ │ └── tika │ │ │ │ │ └── parsers │ │ │ │ │ └── pdfchecker │ │ │ │ │ └── PDFChecker.java │ │ │ └── resources │ │ │ │ └── META-INF │ │ │ │ └── services │ │ │ │ └── org.apache.tika.parser.Parser │ │ └── test │ │ │ ├── java │ │ │ └── TikaPDFToTextTest.java │ │ │ └── resources │ │ │ └── test-documents │ │ │ └── testPDF.pdf │ └── tika-server-core-2.0.0-SNAPSHOT.jar ├── tika-pdfium │ └── my-args.gn ├── tika-pdfjs-selenium │ ├── pom.xml │ └── src │ │ └── main │ │ └── java │ │ └── FirefoxSeleniumExample.java ├── tika-pdfjs │ ├── Dockerfile │ ├── js │ │ └── my-getinfo.js │ ├── my-tika-config.xml │ ├── pom.xml │ └── src │ │ ├── main │ │ └── java │ │ │ └── org │ │ │ └── tallison │ │ │ └── observatory │ │ │ └── pdfjs │ │ │ └── PDFJSOutputParser.java │ │ └── test │ │ ├── java │ │ └── org │ │ │ └── tallison │ │ │ └── observatory │ │ │ └── pdfjs │ │ │ └── PDFJSOutputParserTest.java │ │ └── resources │ │ └── test-documents │ │ ├── test-basic.txt │ │ ├── test-xmp.txt │ │ └── test-xmp2.txt ├── tika-pdfspelunker │ ├── Dockerfile │ ├── my-tika-config.xml │ ├── pom.xml │ └── src │ │ ├── main │ │ ├── java │ │ │ └── org │ │ │ │ └── tallison │ │ │ │ └── tika │ │ │ │ ├── parsers │ │ │ │ ├── image │ │ │ │ │ ├── ICCImageParser.java │ │ │ │ │ └── IccMaxParser.java │ │ │ │ └── pdf │ │ │ │ │ ├── ImageGraphicsEngine.java │ │ │ │ │ ├── PDFImageStreamUtil.java │ │ │ │ │ ├── PDFSpelunker.java │ │ │ │ │ └── ParseState.java │ │ │ │ └── spelunker │ │ │ │ └── tools │ │ │ │ └── ExtractICCs.java │ │ └── resources │ │ │ ├── META-INF │ │ │ └── services │ │ │ │ └── org.apache.tika.parser.Parser │ │ │ └── org │ │ │ └── apache │ │ │ └── tika │ │ │ └── mime │ │ │ └── custom-mimetypes.xml │ │ └── 
test │ │ ├── java │ │ └── org │ │ │ └── tallison │ │ │ └── tika │ │ │ └── parsers │ │ │ ├── image │ │ │ └── ICCImageParserTest.java │ │ │ └── pdf │ │ │ └── PDFSpelunkerTest.java │ │ └── resources │ │ ├── config │ │ └── my-tika-config.xml │ │ └── test-documents │ │ ├── baseball.jpg │ │ ├── icc-reports │ │ ├── non-compliant1.txt │ │ ├── not-icc1.txt │ │ └── not-icc2.txt │ │ └── testPDF.pdf ├── tika-pdftotext │ ├── Dockerfile │ ├── my-tika-config.xml │ └── pom.xml ├── tika-pipes-pdfinfo │ ├── Dockerfile │ ├── log4j2.xml │ ├── my-tika-config.xml │ ├── pipes-log4j2.xml │ └── pom.xml ├── tika-pipes-siegfried │ ├── Dockerfile │ ├── log4j2.xml │ ├── my-tika-config.xml │ ├── pipes-log4j2.xml │ └── pom.xml └── tika-pypdf2 │ ├── Dockerfile │ ├── my-tika-config.xml │ ├── pom.xml │ └── scripts │ └── PyPDF2Cli.py ├── tool-runners ├── arlington │ ├── Dockerfile │ ├── env.properties │ ├── pom.xml │ └── src │ │ └── main │ │ ├── java │ │ └── org │ │ │ └── tallison │ │ │ └── fileutils │ │ │ └── arlington │ │ │ └── TestGrammarRunner.java │ │ └── resources │ │ └── log4j.properties ├── caradoc │ ├── Dockerfile │ ├── pom.xml │ └── src │ │ └── main │ │ ├── java │ │ └── org │ │ │ └── tallison │ │ │ └── fileutils │ │ │ └── caradoc │ │ │ └── Caradoc.java │ │ └── resources │ │ └── log4j.properties ├── clamav │ ├── Dockerfile │ ├── conf │ │ ├── clam.conf │ │ └── freshclam.conf │ ├── exec.sh │ ├── pom.xml │ └── src │ │ └── main │ │ ├── java │ │ └── org │ │ │ └── tallison │ │ │ └── fileutils │ │ │ └── clamav │ │ │ └── ClamAVRunner.java │ │ └── resources │ │ └── log4j.properties ├── env.properties ├── fileprofiler │ ├── Dockerfile │ ├── README.txt │ ├── pom.xml │ └── src │ │ └── main │ │ ├── java │ │ └── org │ │ │ └── tallison │ │ │ └── fileutils │ │ │ └── profiler │ │ │ └── FileProfiler.java │ │ └── resources │ │ └── log4j2.xml ├── gstotext │ ├── Dockerfile │ ├── pom.xml │ └── src │ │ └── main │ │ ├── java │ │ └── org │ │ │ └── tallison │ │ │ └── fileutils │ │ │ └── gstotext │ │ │ └── 
GhostScriptToTextRunner.java │ │ └── resources │ │ └── log4j.properties ├── itext │ ├── README.md │ ├── pom.xml │ └── src │ │ ├── main │ │ ├── java │ │ │ └── org │ │ │ │ └── tallison │ │ │ │ └── tika │ │ │ │ └── parser │ │ │ │ └── itext │ │ │ │ └── ITextParser.java │ │ └── resources │ │ │ └── META-INF │ │ │ └── services │ │ │ └── org.apache.tika.parser.Parser │ │ └── test │ │ ├── java │ │ └── org │ │ │ └── tallison │ │ │ └── tika │ │ │ └── parser │ │ │ └── itext │ │ │ └── ITextParserTest.java │ │ └── resources │ │ └── test-documents │ │ └── testPDF.pdf ├── mutoolclean │ ├── Dockerfile │ ├── pom.xml │ └── src │ │ └── main │ │ ├── java │ │ └── org │ │ │ └── tallison │ │ │ └── fileutils │ │ │ └── mutool │ │ │ └── MutoolClean.java │ │ └── resources │ │ └── log4j.properties ├── mutooltext │ ├── Dockerfile │ ├── pom.xml │ └── src │ │ └── main │ │ ├── java │ │ └── org │ │ │ └── tallison │ │ │ └── fileutils │ │ │ └── mutool │ │ │ └── MutoolTextRunner.java │ │ └── resources │ │ └── log4j.properties ├── pdfbytes │ ├── Dockerfile │ ├── pom.xml │ └── src │ │ ├── main │ │ └── java │ │ │ └── org │ │ │ └── tallison │ │ │ └── pdfutils │ │ │ ├── PDFByteSniffer.java │ │ │ ├── PDFVersionator.java │ │ │ └── StreamSearcher.java │ │ └── test │ │ ├── java │ │ └── org │ │ │ └── tallison │ │ │ └── pdfutils │ │ │ └── TestVersionUnpacker.java │ │ └── resources │ │ └── pdf-puzzle.pdf ├── pdfchecker │ ├── Dockerfile │ ├── pom.xml │ └── src │ │ └── main │ │ ├── java │ │ └── org │ │ │ └── tallison │ │ │ └── fileutils │ │ │ └── pdfchecker │ │ │ └── PDFCheckerRunner.java │ │ └── resources │ │ └── log4j.properties ├── pdfcpu │ ├── Dockerfile │ ├── pom.xml │ └── src │ │ └── main │ │ ├── java │ │ └── org │ │ │ └── tallison │ │ │ └── fileutils │ │ │ └── pdfcpu │ │ │ └── PDFCPURunner.java │ │ └── resources │ │ └── log4j.properties ├── pdffonts │ ├── Dockerfile │ ├── pom.xml │ └── src │ │ └── main │ │ ├── java │ │ └── org │ │ │ └── tallison │ │ │ └── fileutils │ │ │ └── pdffonts │ │ │ └── 
PDFFontsRunner.java │ │ └── resources │ │ └── log4j.properties ├── pdfid │ ├── Dockerfile │ ├── pom.xml │ └── src │ │ └── main │ │ ├── java │ │ └── org │ │ │ └── tallison │ │ │ └── fileutils │ │ │ └── pdfid │ │ │ └── PDFIdRunner.java │ │ └── resources │ │ └── log4j.properties ├── pdfimages │ ├── Dockerfile │ ├── pom.xml │ └── src │ │ └── main │ │ ├── java │ │ └── org │ │ │ └── tallison │ │ │ └── fileutils │ │ │ └── pdfimages │ │ │ └── PDFImagesRunner.java │ │ └── resources │ │ └── log4j.properties ├── pdfinfo │ ├── Dockerfile │ ├── pom.xml │ └── src │ │ └── main │ │ ├── java │ │ └── org │ │ │ └── tallison │ │ │ └── fileutils │ │ │ └── pdfinfo │ │ │ └── PDFInfo.java │ │ └── resources │ │ └── log4j.properties ├── pdfminerdump │ ├── Dockerfile │ ├── pom.xml │ └── src │ │ └── main │ │ ├── java │ │ └── org │ │ │ └── tallison │ │ │ └── fileutils │ │ │ └── pdfminer │ │ │ └── PDFMinerDump.java │ │ └── resources │ │ └── log4j.properties ├── pdfminertext │ ├── Dockerfile │ ├── pom.xml │ └── src │ │ └── main │ │ ├── java │ │ └── org │ │ │ └── tallison │ │ │ └── fileutils │ │ │ └── pdfminer │ │ │ └── PDFMinerText.java │ │ └── resources │ │ └── log4j.properties ├── pdfresurrect │ ├── Dockerfile │ ├── env.properties │ ├── pom.xml │ └── src │ │ └── main │ │ ├── java │ │ └── org │ │ │ └── tallison │ │ │ └── fileutils │ │ │ └── pdfresurrect │ │ │ └── PDFResurrect.java │ │ └── resources │ │ └── log4j.properties ├── pdftoppm │ ├── Dockerfile │ ├── pom.xml │ └── src │ │ └── main │ │ ├── java │ │ └── org │ │ │ └── tallison │ │ │ └── fileutils │ │ │ └── pdftoppm │ │ │ └── PDFToPPMRunner.java │ │ └── resources │ │ └── log4j.properties ├── pdftops │ ├── Dockerfile │ ├── pom.xml │ └── src │ │ └── main │ │ ├── java │ │ └── org │ │ │ └── tallison │ │ │ └── fileutils │ │ │ └── pdftops │ │ │ └── PDFToPSRunner.java │ │ └── resources │ │ └── log4j.properties ├── pdftotext │ ├── Dockerfile │ ├── pom.xml │ └── src │ │ └── main │ │ ├── java │ │ └── org │ │ │ └── tallison │ │ │ └── fileutils │ │ │ 
└── pdftotext │ │ │ └── PDFToTextRunner.java │ │ └── resources │ │ └── log4j.properties ├── polyfile │ ├── Dockerfile │ ├── pom.xml │ └── src │ │ └── main │ │ ├── java │ │ └── org │ │ │ └── tallison │ │ │ └── fileutils │ │ │ └── polyfile │ │ │ ├── PolyFile.java │ │ │ └── PolyFilePolyglot.java │ │ └── resources │ │ └── log4j.properties ├── pom.xml ├── qpdf │ ├── Dockerfile │ ├── pom.xml │ └── src │ │ └── main │ │ ├── java │ │ └── org │ │ │ └── tallison │ │ │ └── fileutils │ │ │ └── qpdf │ │ │ └── QPDFToJson.java │ │ └── resources │ │ └── log4j.properties ├── tika-client │ ├── pom.xml │ └── src │ │ └── main │ │ ├── java │ │ └── org │ │ │ └── tallison │ │ │ └── tika │ │ │ └── client │ │ │ ├── TikaClient.java │ │ │ └── TikaLoadTester.java │ │ └── resources │ │ └── log4j2.xml ├── tika │ ├── pom.xml │ └── src │ │ └── main │ │ ├── java │ │ └── org │ │ │ └── tallison │ │ │ └── fileutils │ │ │ └── tika │ │ │ └── TikaBatch.java │ │ └── resources │ │ └── log4j2.xml └── xpdffonts │ ├── Dockerfile │ ├── pom.xml │ ├── src │ └── main │ │ ├── java │ │ └── org │ │ │ └── tallison │ │ │ └── fileutils │ │ │ └── pdffonts │ │ │ └── PDFFontsRunner.java │ │ └── resources │ │ └── log4j.properties │ ├── tgzs │ ├── xpdf-arabic.tar.gz │ ├── xpdf-arabic │ │ ├── README │ │ └── add-to-xpdfrc │ ├── xpdf-chinese-simplified.tar.gz │ ├── xpdf-chinese-simplified │ │ ├── README │ │ └── add-to-xpdfrc │ ├── xpdf-chinese-traditional.tar.gz │ ├── xpdf-chinese-traditional │ │ ├── README │ │ └── add-to-xpdfrc │ ├── xpdf-cyrillic.tar.gz │ ├── xpdf-cyrillic │ │ ├── README │ │ └── add-to-xpdfrc │ ├── xpdf-greek.tar.gz │ ├── xpdf-greek │ │ ├── README │ │ └── add-to-xpdfrc │ ├── xpdf-hebrew.tar.gz │ ├── xpdf-hebrew │ │ ├── README │ │ └── add-to-xpdfrc │ ├── xpdf-japanese.tar.gz │ ├── xpdf-japanese │ │ ├── README │ │ └── add-to-xpdfrc │ ├── xpdf-korean.tar.gz │ ├── xpdf-korean │ │ ├── README │ │ └── add-to-xpdfrc │ ├── xpdf-latin2.tar.gz │ ├── xpdf-latin2 │ │ ├── README │ │ └── add-to-xpdfrc │ ├── 
xpdf-t1fonts.tar.gz │ ├── xpdf-t1fonts │ │ ├── COPYING │ │ ├── README │ │ ├── d050000l.pfb │ │ └── s050000l.pfb │ ├── xpdf-thai.tar.gz │ ├── xpdf-thai │ │ ├── README │ │ └── add-to-xpdfrc │ ├── xpdf-turkish.tar.gz │ └── xpdf-turkish │ │ ├── README │ │ └── add-to-xpdfrc │ ├── xpdf │ ├── arabic │ │ └── ISO-8859-6.unicodeMap │ ├── chinese-simplified │ │ ├── Adobe-GB1.cidToUnicode │ │ ├── CMap │ │ │ ├── Adobe-GB1-0 │ │ │ ├── Adobe-GB1-1 │ │ │ ├── Adobe-GB1-2 │ │ │ ├── Adobe-GB1-3 │ │ │ ├── Adobe-GB1-4 │ │ │ ├── Adobe-GB1-5 │ │ │ ├── Adobe-GB1-UCS2 │ │ │ ├── GB-EUC-H │ │ │ ├── GB-EUC-V │ │ │ ├── GB-H │ │ │ ├── GB-V │ │ │ ├── GBK-EUC-H │ │ │ ├── GBK-EUC-UCS2 │ │ │ ├── GBK-EUC-V │ │ │ ├── GBK2K-H │ │ │ ├── GBK2K-V │ │ │ ├── GBKp-EUC-H │ │ │ ├── GBKp-EUC-V │ │ │ ├── GBT-EUC-H │ │ │ ├── GBT-EUC-V │ │ │ ├── GBT-H │ │ │ ├── GBT-V │ │ │ ├── GBTpc-EUC-H │ │ │ ├── GBTpc-EUC-V │ │ │ ├── GBpc-EUC-H │ │ │ ├── GBpc-EUC-UCS2 │ │ │ ├── GBpc-EUC-UCS2C │ │ │ ├── GBpc-EUC-V │ │ │ ├── LICENSE.md │ │ │ ├── UniGB-UCS2-H │ │ │ ├── UniGB-UCS2-V │ │ │ ├── UniGB-UTF16-H │ │ │ ├── UniGB-UTF16-V │ │ │ ├── UniGB-UTF32-H │ │ │ ├── UniGB-UTF32-V │ │ │ ├── UniGB-UTF8-H │ │ │ └── UniGB-UTF8-V │ │ ├── EUC-CN.unicodeMap │ │ ├── GBK.unicodeMap │ │ └── ISO-2022-CN.unicodeMap │ ├── chinese-traditional │ │ ├── Adobe-CNS1.cidToUnicode │ │ ├── Big5.unicodeMap │ │ ├── Big5ascii.unicodeMap │ │ └── CMap │ │ │ ├── Adobe-CNS1-0 │ │ │ ├── Adobe-CNS1-1 │ │ │ ├── Adobe-CNS1-2 │ │ │ ├── Adobe-CNS1-3 │ │ │ ├── Adobe-CNS1-4 │ │ │ ├── Adobe-CNS1-5 │ │ │ ├── Adobe-CNS1-6 │ │ │ ├── Adobe-CNS1-7 │ │ │ ├── Adobe-CNS1-UCS2 │ │ │ ├── B5-H │ │ │ ├── B5-V │ │ │ ├── B5pc-H │ │ │ ├── B5pc-UCS2 │ │ │ ├── B5pc-UCS2C │ │ │ ├── B5pc-V │ │ │ ├── CNS-EUC-H │ │ │ ├── CNS-EUC-V │ │ │ ├── CNS1-H │ │ │ ├── CNS1-V │ │ │ ├── CNS2-H │ │ │ ├── CNS2-V │ │ │ ├── ETHK-B5-H │ │ │ ├── ETHK-B5-V │ │ │ ├── ETen-B5-H │ │ │ ├── ETen-B5-UCS2 │ │ │ ├── ETen-B5-V │ │ │ ├── ETenms-B5-H │ │ │ ├── ETenms-B5-V │ │ │ ├── HKdla-B5-H │ │ │ ├── HKdla-B5-V │ │ │ 
├── HKdlb-B5-H │ │ │ ├── HKdlb-B5-V │ │ │ ├── HKgccs-B5-H │ │ │ ├── HKgccs-B5-V │ │ │ ├── HKm314-B5-H │ │ │ ├── HKm314-B5-V │ │ │ ├── HKm471-B5-H │ │ │ ├── HKm471-B5-V │ │ │ ├── HKscs-B5-H │ │ │ ├── HKscs-B5-V │ │ │ ├── LICENSE.md │ │ │ ├── UniCNS-UCS2-H │ │ │ ├── UniCNS-UCS2-V │ │ │ ├── UniCNS-UTF16-H │ │ │ ├── UniCNS-UTF16-V │ │ │ ├── UniCNS-UTF32-H │ │ │ ├── UniCNS-UTF32-V │ │ │ ├── UniCNS-UTF8-H │ │ │ └── UniCNS-UTF8-V │ ├── cyrillic │ │ ├── Bulgarian.nameToUnicode │ │ └── KOI8-R.unicodeMap │ ├── greek │ │ ├── Greek.nameToUnicode │ │ └── ISO-8859-7.unicodeMap │ ├── hebrew │ │ ├── ISO-8859-8.unicodeMap │ │ └── Windows-1255.unicodeMap │ ├── japanese │ │ ├── Adobe-Japan1.cidToUnicode │ │ ├── CMap │ │ │ ├── 78-EUC-H │ │ │ ├── 78-EUC-V │ │ │ ├── 78-H │ │ │ ├── 78-RKSJ-H │ │ │ ├── 78-RKSJ-V │ │ │ ├── 78-V │ │ │ ├── 78ms-RKSJ-H │ │ │ ├── 78ms-RKSJ-V │ │ │ ├── 83pv-RKSJ-H │ │ │ ├── 90ms-RKSJ-H │ │ │ ├── 90ms-RKSJ-UCS2 │ │ │ ├── 90ms-RKSJ-V │ │ │ ├── 90msp-RKSJ-H │ │ │ ├── 90msp-RKSJ-V │ │ │ ├── 90pv-RKSJ-H │ │ │ ├── 90pv-RKSJ-UCS2 │ │ │ ├── 90pv-RKSJ-UCS2C │ │ │ ├── 90pv-RKSJ-V │ │ │ ├── Add-H │ │ │ ├── Add-RKSJ-H │ │ │ ├── Add-RKSJ-V │ │ │ ├── Add-V │ │ │ ├── Adobe-Japan1-0 │ │ │ ├── Adobe-Japan1-1 │ │ │ ├── Adobe-Japan1-2 │ │ │ ├── Adobe-Japan1-3 │ │ │ ├── Adobe-Japan1-4 │ │ │ ├── Adobe-Japan1-5 │ │ │ ├── Adobe-Japan1-6 │ │ │ ├── Adobe-Japan1-7 │ │ │ ├── Adobe-Japan1-UCS2 │ │ │ ├── EUC-H │ │ │ ├── EUC-V │ │ │ ├── Ext-H │ │ │ ├── Ext-RKSJ-H │ │ │ ├── Ext-RKSJ-V │ │ │ ├── Ext-V │ │ │ ├── H │ │ │ ├── Hankaku │ │ │ ├── Hiragana │ │ │ ├── Katakana │ │ │ ├── LICENSE.md │ │ │ ├── NWP-H │ │ │ ├── NWP-V │ │ │ ├── RKSJ-H │ │ │ ├── RKSJ-V │ │ │ ├── Roman │ │ │ ├── UniJIS-UCS2-H │ │ │ ├── UniJIS-UCS2-HW-H │ │ │ ├── UniJIS-UCS2-HW-V │ │ │ ├── UniJIS-UCS2-V │ │ │ ├── UniJIS-UTF16-H │ │ │ ├── UniJIS-UTF16-V │ │ │ ├── UniJIS-UTF32-H │ │ │ ├── UniJIS-UTF32-V │ │ │ ├── UniJIS-UTF8-H │ │ │ ├── UniJIS-UTF8-V │ │ │ ├── UniJIS2004-UTF16-H │ │ │ ├── UniJIS2004-UTF16-V │ │ │ ├── 
UniJIS2004-UTF32-H │ │ │ ├── UniJIS2004-UTF32-V │ │ │ ├── UniJIS2004-UTF8-H │ │ │ ├── UniJIS2004-UTF8-V │ │ │ ├── UniJISPro-UCS2-HW-V │ │ │ ├── UniJISPro-UCS2-V │ │ │ ├── UniJISPro-UTF8-V │ │ │ ├── UniJISX0213-UTF32-H │ │ │ ├── UniJISX0213-UTF32-V │ │ │ ├── UniJISX02132004-UTF32-H │ │ │ ├── UniJISX02132004-UTF32-V │ │ │ ├── V │ │ │ └── WP-Symbol │ │ ├── EUC-JP.unicodeMap │ │ ├── ISO-2022-JP.unicodeMap │ │ └── Shift-JIS.unicodeMap │ ├── korean │ │ ├── Adobe-KR.cidToUnicode │ │ ├── Adobe-Korea1.cidToUnicode │ │ ├── CMap │ │ │ ├── Adobe-KR-0 │ │ │ ├── Adobe-KR-1 │ │ │ ├── Adobe-KR-2 │ │ │ ├── Adobe-KR-3 │ │ │ ├── Adobe-KR-4 │ │ │ ├── Adobe-KR-5 │ │ │ ├── Adobe-KR-6 │ │ │ ├── Adobe-KR-7 │ │ │ ├── Adobe-KR-8 │ │ │ ├── Adobe-KR-9 │ │ │ ├── Adobe-Korea1-0 │ │ │ ├── Adobe-Korea1-1 │ │ │ ├── Adobe-Korea1-2 │ │ │ ├── Adobe-Korea1-UCS2 │ │ │ ├── KSC-EUC-H │ │ │ ├── KSC-EUC-V │ │ │ ├── KSC-H │ │ │ ├── KSC-Johab-H │ │ │ ├── KSC-Johab-V │ │ │ ├── KSC-V │ │ │ ├── KSCms-UHC-H │ │ │ ├── KSCms-UHC-HW-H │ │ │ ├── KSCms-UHC-HW-V │ │ │ ├── KSCms-UHC-UCS2 │ │ │ ├── KSCms-UHC-V │ │ │ ├── KSCpc-EUC-H │ │ │ ├── KSCpc-EUC-UCS2 │ │ │ ├── KSCpc-EUC-UCS2C │ │ │ ├── KSCpc-EUC-V │ │ │ ├── LICENSE.md │ │ │ ├── UniAKR-UTF16-H │ │ │ ├── UniAKR-UTF32-H │ │ │ ├── UniAKR-UTF8-H │ │ │ ├── UniKS-UCS2-H │ │ │ ├── UniKS-UCS2-V │ │ │ ├── UniKS-UTF16-H │ │ │ ├── UniKS-UTF16-V │ │ │ ├── UniKS-UTF32-H │ │ │ ├── UniKS-UTF32-V │ │ │ ├── UniKS-UTF8-H │ │ │ └── UniKS-UTF8-V │ │ └── ISO-2022-KR.unicodeMap │ ├── latin2 │ │ └── Latin2.unicodeMap │ ├── thai │ │ ├── TIS-620.unicodeMap │ │ └── Thai.nameToUnicode │ └── turkish │ │ └── ISO-8859-9.unicodeMap │ └── xpdfrc └── utils-general ├── pom.xml └── src ├── main └── java │ └── org │ └── tallison │ ├── db │ ├── CustomCSVToPG.java │ ├── ExtractsToDB.java │ ├── FetchFilesFromDBPaths.java │ └── PGToCSV.java │ ├── digest │ ├── CSVLineCounter.java │ ├── CompareLists.java │ ├── DigestChecker.java │ ├── FileListNormalizer.java │ ├── RemoveExtras.java │ ├── S3Compare.java │ 
├── S3DigestChecker.java │ └── S3ListCompare.java │ ├── filter │ ├── CopyByMime.java │ └── CopyFilterDigest.java │ └── pdf │ └── utils │ └── PDFSplitter.java └── test └── java └── org └── tallison └── pdf └── utils └── TestPDFSplitter.java /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | target/ 3 | *.iml 4 | /tool-runners/pdfchecker/pdf-checker.tgz 5 | /tool-runners/arlington 6 | /tool-runners/arlington/grammar/ 7 | /tika-containers/tika-pdfchecker/pdf-checker.tgz 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # File Observatory 2 | This repo hosts development code used on the backend to support data ingestion into 3 | an ElasticSearch index for the [SafeDocs File Observatory app](https://github.com/jpl-safedocs). 4 | 5 | This repo contains pre-ALPHA grade code for demonstration purposes only. 6 | 7 | Some capabilities demonstrated within have been integrated into Apache Tika. 8 | Some have been spun off into standalone projects, e.g. [commoncrawl-fetcher-lite](https://github.com/tballison/commoncrawl-fetcher-lite). 9 | 10 | # Attribution 11 | The commoncrawl-fetcher module includes code that relies on GeoLite2 data created by MaxMind, available from 12 | [https://www.maxmind.com](https://www.maxmind.com). -------------------------------------------------------------------------------- /batchlite/src/main/java/org/tallison/batchlite/writer/JSONMetadataWriter.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.tallison.batchlite.writer; 18 | 19 | import com.google.gson.Gson; 20 | import org.tallison.batchlite.MetadataWriter; 21 | 22 | import java.io.IOException; 23 | import java.nio.charset.StandardCharsets; 24 | import java.nio.file.Files; 25 | import java.nio.file.Path; 26 | 27 | public class JSONMetadataWriter extends MetadataWriter { 28 | 29 | private final static Gson GSON = new Gson(); 30 | 31 | private final Path metadataRootDir; 32 | 33 | public JSONMetadataWriter(String name, 34 | Path metadataRootDir, int stdoutLimit, int stderrLimit) { 35 | super(name, stdoutLimit, stderrLimit); 36 | this.metadataRootDir = metadataRootDir; 37 | } 38 | 39 | @Override 40 | protected void write(PathResultPair pair) throws IOException { 41 | Path target = metadataRootDir.resolve(pair.getRelPath() + ".json"); 42 | if (! 
Files.isDirectory(target.getParent())) { 43 | Files.createDirectories(target.getParent()); 44 | } 45 | Files.write(target, GSON.toJson(pair.getResult()).getBytes(StandardCharsets.UTF_8)); 46 | } 47 | 48 | @Override 49 | public void close() throws IOException { 50 | //no-op 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /batchlite/src/main/java/org/tallison/batchlite/writer/MetadataWriterFactory.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package org.tallison.batchlite.writer; 18 | 19 | import org.tallison.batchlite.MetadataWriter; 20 | 21 | import java.io.IOException; 22 | import java.nio.file.Paths; 23 | 24 | public class MetadataWriterFactory { 25 | 26 | public static MetadataWriter build(String name, String writerString, 27 | boolean isDelta, 28 | int maxStdout, int maxStderr) throws IOException { 29 | if (writerString.startsWith("jdbc:")) { 30 | return new JDBCMetadataWriter(name, writerString, isDelta, maxStdout, maxStderr); 31 | } else if (writerString.endsWith(".csv") || writerString.endsWith(".tsv")) { 32 | return new CSVMetadataWriter(name, Paths.get(writerString), maxStdout, maxStderr); 33 | } else { 34 | return new JSONMetadataWriter(name, Paths.get(writerString), maxStdout, maxStderr); 35 | } 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /batchlite/src/main/java/org/tallison/batchlite/writer/PathResultPair.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package org.tallison.batchlite.writer; 18 | 19 | import org.tallison.batchlite.FileProcessResult; 20 | 21 | import java.nio.file.Path; 22 | 23 | public class PathResultPair { 24 | 25 | 26 | 27 | private final String relPath; 28 | private final FileProcessResult result; 29 | 30 | public PathResultPair(String relPath, FileProcessResult result) { 31 | this.relPath = relPath; 32 | this.result = result; 33 | } 34 | 35 | public String getRelPath() { 36 | return relPath; 37 | } 38 | 39 | public FileProcessResult getResult() { 40 | return result; 41 | } 42 | 43 | @Override 44 | public String toString() { 45 | return "PathResultPair{" + 46 | "relPath='" + relPath + '\'' + 47 | ", result=" + result + 48 | '}'; 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /batchlite/src/main/java/org/tallison/batchlite/writer/WriterResult.java: -------------------------------------------------------------------------------- 1 | package org.tallison.batchlite.writer; 2 | 3 | public class WriterResult { 4 | 5 | private final int recordsWritten; 6 | public WriterResult(int recordsWritten) { 7 | this.recordsWritten = recordsWritten; 8 | } 9 | public int getRecordsWritten() { 10 | return recordsWritten; 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /batchlite/src/main/resources/log4j2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /commoncrawl-fetcher/src/README.txt: -------------------------------------------------------------------------------- 1 | This is a set of utilities for extracting files from Common Crawl. 2 | 3 | The assumption is that you don't have direct access to S3 and you 4 | need to pull data. 
/**
 * Bundle of three {@link AtomicLong} counters that can be updated from
 * several threads.
 * <p>
 * The counters start at zero and are exposed directly, so callers mutate
 * them through the returned {@link AtomicLong} instances. The fields are
 * {@code final}: the same counter objects live for the lifetime of this
 * instance and cannot be swapped out after construction.
 * <p>
 * NOTE(review): the exact events each counter tracks are decided by the
 * callers; the names suggest index records read, files extracted, and
 * truncated files written -- confirm against the fetcher code.
 */
public class CCIndexReaderCounter {
    final AtomicLong recordsRead = new AtomicLong(0);
    final AtomicLong filesExtracted = new AtomicLong(0);
    final AtomicLong truncatedWritten = new AtomicLong(0);

    /**
     * @return the live "records read" counter (not a copy)
     */
    public AtomicLong getRecordsRead() {
        return recordsRead;
    }

    /**
     * @return the live "files extracted" counter (not a copy)
     */
    public AtomicLong getFilesExtracted() {
        return filesExtracted;
    }

    /**
     * @return the live "truncated written" counter (not a copy)
     */
    public AtomicLong getTruncatedWritten() {
        return truncatedWritten;
    }

    /**
     * @return the current value of all three counters in the form
     *         {@code CCIndexReaderCounter{recordsRead=N, filesExtracted=N, truncatedWritten=N}}
     */
    @Override
    public String toString() {
        return "CCIndexReaderCounter{" +
                "recordsRead=" + recordsRead +
                ", filesExtracted=" + filesExtracted +
                ", truncatedWritten=" + truncatedWritten +
                '}';
    }
}
/**
 * Contract for a component that is initialised from command-line style
 * arguments, is then fed records one JSON string at a time, and is finally
 * closed.
 */
public interface IndexRecordProcessor {

    /**
     * Prepares the processor for use.
     *
     * @param args arguments for the implementation to interpret
     * @throws Exception if initialisation fails
     */
    void init(String[] args) throws Exception;

    /**
     * Handles a single record.
     *
     * @param json the record as a JSON string
     * @return a flag whose meaning is defined by implementations and callers
     *         (NOTE(review): presumably "keep going" -- confirm against callers)
     * @throws IOException if handling the record fails on I/O
     */
    boolean process(String json) throws IOException;

    /**
     * Releases whatever the processor holds.
     *
     * @throws IOException if closing fails
     */
    void close() throws IOException;
}
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.tallison.cc.index; 18 | 19 | public interface RecordFilter { 20 | 21 | boolean accept(CCIndexRecord record); 22 | } 23 | -------------------------------------------------------------------------------- /commoncrawl-fetcher/src/main/java/org/tallison/util/MapUtil.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
/**
 * Static helpers for working with {@link Map}s.
 */
public class MapUtil {

    /**
     * Copies the entries of {@code map} into a new insertion-ordered map, sorted by
     * value from largest to smallest. Entries whose values compare as equal are
     * ordered by ascending key, so the result is deterministic regardless of the
     * iteration order of the input. The input map is not modified.
     *
     * @param map the entries to sort; keys and values are compared with
     *            {@code compareTo}, so neither may be {@code null}
     * @param <K> key type, used as the tie-breaker
     * @param <V> value type, used as the primary sort key
     * @return a new {@link LinkedHashMap} iterating in descending-value order
     */
    public static <K extends Comparable<? super K>, V extends Comparable<? super V>> Map<K, V>
            sortByDescendingValue(Map<K, V> map) {
        List<Map.Entry<K, V>> entries = new LinkedList<>(map.entrySet());
        Collections.sort(entries, new Comparator<Map.Entry<K, V>>() {
            @Override
            public int compare(Map.Entry<K, V> o1, Map.Entry<K, V> o2) {
                // operands are swapped on purpose: larger values sort first
                int c = o2.getValue().compareTo(o1.getValue());
                if (c == 0) {
                    // equal values: fall back to ascending key order
                    return o1.getKey().compareTo(o2.getKey());
                }
                return c;
            }
        });

        Map<K, V> result = new LinkedHashMap<>();
        for (Map.Entry<K, V> entry : entries) {
            result.put(entry.getKey(), entry.getValue());
        }
        return result;
    }
}
16 | */ 17 | package org.tallison.util; 18 | 19 | import java.sql.Connection; 20 | import java.sql.DriverManager; 21 | import java.sql.Statement; 22 | 23 | import org.tallison.cc.CCFileFetcher; 24 | 25 | /** 26 | * For dev use only. This loads a new status table for when there are changes 27 | * to CCFileFetcher.STATUS 28 | */ 29 | public class ReloadFetchStatusTable { 30 | 31 | public static void main(String[] args) throws Exception { 32 | Connection connection = DriverManager.getConnection(args[0]); 33 | try (Statement st = connection.createStatement()) { 34 | String sql = "drop table if exists cc_fetch_status"; 35 | st.execute(sql); 36 | 37 | sql = "create table cc_fetch_status " + "(id integer primary key, status varchar(64));"; 38 | st.execute(sql); 39 | 40 | 41 | for (CCFileFetcher.FETCH_STATUS status : CCFileFetcher.FETCH_STATUS.values()) { 42 | 43 | sql = "insert into cc_fetch_status values (" + status.ordinal() + ",'" + 44 | status.name() + "');"; 45 | st.execute(sql); 46 | } 47 | } 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /commoncrawl-fetcher/src/main/resources/log4j2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /commoncrawl-fetcher/src/main/resources/selectFetchAndFetchStatus.sql: -------------------------------------------------------------------------------- 1 | select f.id, f.fetched_digest, f.fetched_length, s.status 2 | from cc_fetch f 3 | join cc_fetch_status s on f.status_id=s.id -------------------------------------------------------------------------------- /commoncrawl-fetcher/src/main/resources/selectFilesToFetchFromCC.sql: -------------------------------------------------------------------------------- 1 | --limited query used to pull enough info back to 2 | --extract the right files from 
common crawl's warc files 3 | select u.id, 4 | digest as cc_index_digest, 5 | w.name as warc_file_name, 6 | warc_offset, warc_length 7 | from cc_urls u 8 | join cc_warc_file_name w on u.warc_file_name = w.id 9 | join cc_truncated t on u.truncated = t.id 10 | left join cc_fetch f on f.id = u.id 11 | where f.id is null and u.status = 200 and length(t.name) = 0 12 | order by w.name, warc_offset -------------------------------------------------------------------------------- /commoncrawl-fetcher/src/main/resources/selectFilesToFetchPerWarcId.sql: -------------------------------------------------------------------------------- 1 | select u.id, 2 | digest as cc_index_digest, 3 | w.name as warc_file_name, 4 | warc_offset, warc_length, 5 | t.name as cc_truncated 6 | from cc_urls u 7 | join cc_warc_file_name w on u.warc_file_name = w.id 8 | join cc_truncated t on u.truncated = t.id 9 | left join cc_fetch f on f.id = u.id 10 | where f.id is null and u.status = 200 11 | order by w.id, warc_offset -------------------------------------------------------------------------------- /commoncrawl-fetcher/src/main/resources/selectIndexedAndFetchedData.sql: -------------------------------------------------------------------------------- 1 | --full query of the useful information gathered 2 | --from the indices 3 | select u.id, url, 4 | digest as cc_index_digest, 5 | f.fetched_digest, 6 | u.status as http_status, 7 | m.name as mime, 8 | dm.name as detected_mime, 9 | t.name as truncated, 10 | w.name as warc_file_name, 11 | warc_offset, warc_length, 12 | l.name as languages, 13 | f.fetched_length, 14 | s.status as fetched_status 15 | from cc_urls u 16 | join cc_warc_file_name w on u.warc_file_name = w.id 17 | join cc_mimes m on u.mime = m.id 18 | join cc_detected_mimes dm on u.detected_mime=dm.id 19 | join cc_truncated t on u.truncated = t.id 20 | join cc_languages l on u.languages = l.id 21 | left join cc_fetch f on f.id=u.id 22 | left join cc_fetch_status s on f.status_id=s.id 23 | 
where u.status = 200 and length(t.name) = 0 24 | order by w.name, warc_offset -------------------------------------------------------------------------------- /commoncrawl-fetcher/src/main/resources/selectIndexedData.sql: -------------------------------------------------------------------------------- 1 | --full query of the useful information gathered 2 | --from the indices 3 | select u.id, url, 4 | digest as cc_index_digest, 5 | status as http_status, 6 | m.name as mime, 7 | dm.name as detected_mime, 8 | t.name as truncated, 9 | w.name as warc_file_name, 10 | warc_offset, warc_length, 11 | l.name as languages 12 | from cc_urls u 13 | join cc_warc_file_name w on u.warc_file_name = w.id 14 | join cc_mimes m on u.mime = m.id 15 | join cc_detected_mimes dm on u.detected_mime=dm.id 16 | join cc_truncated t on u.truncated = t.id 17 | join cc_languages l on u.languages = l.id 18 | where status = 200 and length(t.name) = 0 19 | order by w.name, warc_offset -------------------------------------------------------------------------------- /commoncrawl-fetcher/src/main/resources/selectWarcFileIdsToFetchFromCC.sql: -------------------------------------------------------------------------------- 1 | select w.id 2 | from cc_warc_file_name w 3 | order by w.id -------------------------------------------------------------------------------- /commoncrawl-fetcher/src/test/resources/examples/mpeg-filters.json: -------------------------------------------------------------------------------- 1 | { 2 | "status": [200,300,400], 3 | "exact" : { 4 | "detected_mimes": [ 5 | "video/mp4", 6 | "video/quicktime" 7 | ], 8 | "case_sensitive" : false 9 | } 10 | } -------------------------------------------------------------------------------- /commoncrawl-fetcher/src/test/resources/examples/tika-config-fetch-fs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | id 5 | warc_file_name 6 | warc_offset 7 | warc_end_offset 8 | hf 9 | fse 10 | 
jdbc:sqlite:/Users/allison/Desktop/demo-backup.db 11 | 13 | 16 | 31 | 32 | 33 | 34 | 35 | 36 | hf 37 | 38 | 39 | 40 | 41 | 42 | 43 | fse 44 | /Users/allison/data/cc/docs 45 | skip 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /commoncrawl-fetcher/src/test/resources/examples/tika-config-index-fs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | fs1 5 | /Users/allison/data/cc/CC-MAIN-2022-27 6 | 7 | 8 | 9 | 10 | 11 | fs1 12 | /Users/allison/data/cc/CC-MAIN-2022-27 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /commoncrawl-fetcher/src/test/resources/examples/tika-config-index-s3.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | fs1 5 | 6 | https://data.commoncrawl.org/crawl-data/CC-MAIN-2022-27/cc-index.paths.gz 7 | https://data.commoncrawl.org/crawl-data/CC-MAIN-2022-21/cc-index.paths.gz 8 | https://data.commoncrawl.org/crawl-data/CC-MAIN-2022-05/cc-index.paths.gz 9 | https://data.commoncrawl.org/crawl-data/CC-MAIN-2021-49/cc-index.paths.gz 10 | https://data.commoncrawl.org/crawl-data/CC-MAIN-2021-43/cc-index.paths.gz 11 | https://data.commoncrawl.org/crawl-data/CC-MAIN-2021-39/cc-index.paths.gz 12 | https://data.commoncrawl.org/crawl-data/CC-MAIN-2021-31/cc-index.paths.gz 13 | https://data.commoncrawl.org/crawl-data/CC-MAIN-2021-25/cc-index.paths.gz 14 | https://data.commoncrawl.org/crawl-data/CC-MAIN-2021-21/cc-index.paths.gz 15 | https://data.commoncrawl.org/crawl-data/CC-MAIN-2021-17/cc-index.paths.gz 16 | https://data.commoncrawl.org/crawl-data/CC-MAIN-2021-10/cc-index.paths.gz 17 | https://data.commoncrawl.org/crawl-data/CC-MAIN-2021-04/cc-index.paths.gz 18 | 19 | 20 | 21 | 22 | 23 | 24 | fs1 25 | commoncrawl 26 | profile 27 | saml-pub 28 | us-east-1 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- 
/commoncrawl-fetcher/src/test/resources/examples/tika-config-refetch-fs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | hf 6 | 10 7 | 8 | 10000000000 9 | 300000 10 | 11 | 12 | 13 | 14 | 15 | 16 | fse 17 | /Users/allison/data/cc/docs/CC-MAIN-2022-27 18 | skip 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /commoncrawl-fetcher/src/test/resources/test-documents/mime-filters-av.json: -------------------------------------------------------------------------------- 1 | { 2 | "status": [200,300,400], 3 | "regex" : { 4 | "mimes": [ 5 | "\\Aaudio", 6 | "\\Avideo" 7 | ], 8 | "detected_mimes": [ 9 | "\\Aaudio", 10 | "\\Avideo" 11 | ] 12 | } 13 | } -------------------------------------------------------------------------------- /commoncrawl-fetcher/src/test/resources/test-documents/mime-filters.json: -------------------------------------------------------------------------------- 1 | { 2 | "status": [200,300,400], 3 | "exact" : { 4 | "mimes": [ 5 | "application/pdf" 6 | ], 7 | "detected_mimes": [ 8 | "application/pdf" 9 | ], 10 | "case_sensitive" : false 11 | }, 12 | "regex" : { 13 | "mimes": [ 14 | "(?i)pdf\\Z" 15 | ], 16 | "detected_mimes": [ 17 | "(?i)pdf\\Z" 18 | ] 19 | } 20 | } -------------------------------------------------------------------------------- /commoncrawl-fetcher/src/test/resources/test-documents/pdf-filter-sample.json: -------------------------------------------------------------------------------- 1 | { 2 | "exact" : { 3 | "mimes": [ 4 | {"pattern": "application/pdf", "probability": 0.1} 5 | ], 6 | "detected_mimes": [ 7 | {"pattern": "application/pdf", "probability": 0.1} 8 | ], 9 | "case_sensitive" : false 10 | }, 11 | "status": 200 12 | } -------------------------------------------------------------------------------- /commoncrawl-fetcher/src/test/resources/test-documents/pdf-filter.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "exact" : { 3 | "mimes": [ 4 | "application/pdf" 5 | ], 6 | "detected_mimes": [ 7 | "application/pdf" 8 | ], 9 | "case_sensitive" : false 10 | }, 11 | "defaultInclude": false 12 | } -------------------------------------------------------------------------------- /commoncrawl-fetcher/src/test/resources/test-documents/status-filter.json: -------------------------------------------------------------------------------- 1 | { 2 | "status": 200 3 | } -------------------------------------------------------------------------------- /commoncrawl-fetcher/src/test/resources/test-documents/status-sample-filter.json: -------------------------------------------------------------------------------- 1 | { 2 | "status": 200, 3 | "probability": 0.001 4 | } -------------------------------------------------------------------------------- /ingest/src/main/java/org/tallison/ingest/CompositeFeatureMapper.java: -------------------------------------------------------------------------------- 1 | package org.tallison.ingest; 2 | 3 | import org.apache.tika.config.ServiceLoader; 4 | import org.apache.tika.pipes.fetcher.Fetcher; 5 | import org.tallison.quaerite.core.StoredDocument; 6 | 7 | import java.nio.file.Path; 8 | import java.sql.ResultSet; 9 | import java.sql.SQLException; 10 | import java.util.List; 11 | import java.util.Map; 12 | 13 | public class CompositeFeatureMapper implements FeatureMapper { 14 | private static final ServiceLoader DEFAULT_LOADER = 15 | new ServiceLoader(FeatureMapper.class.getClassLoader()); 16 | 17 | List mappers; 18 | 19 | public CompositeFeatureMapper() { 20 | this(DEFAULT_LOADER.loadServiceProviders(FeatureMapper.class)); 21 | } 22 | 23 | public CompositeFeatureMapper(List mappers) { 24 | this.mappers = mappers; 25 | } 26 | 27 | @Override 28 | public void addFeatures(Map row, Fetcher fetcher, 29 | StoredDocument storedDocument) throws SQLException { 30 | for 
(FeatureMapper mapper : mappers) { 31 | mapper.addFeatures(row, fetcher, storedDocument); 32 | } 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /ingest/src/main/java/org/tallison/ingest/FeatureMapper.java: -------------------------------------------------------------------------------- 1 | package org.tallison.ingest; 2 | 3 | import org.apache.tika.pipes.fetcher.Fetcher; 4 | import org.tallison.quaerite.core.StoredDocument; 5 | 6 | import java.nio.file.Path; 7 | import java.sql.ResultSet; 8 | import java.sql.SQLException; 9 | import java.util.Map; 10 | 11 | public interface FeatureMapper { 12 | 13 | public static final String REL_PATH_KEY = "relpath"; 14 | public static final String ID_KEY = "id"; 15 | void addFeatures(Map row, Fetcher fetcher, StoredDocument storedDocument) throws SQLException; 16 | } 17 | -------------------------------------------------------------------------------- /ingest/src/main/java/org/tallison/ingest/mappers/CPUMapper.java: -------------------------------------------------------------------------------- 1 | package org.tallison.ingest.mappers; 2 | 3 | import org.apache.tika.pipes.fetcher.Fetcher; 4 | import org.tallison.ingest.FeatureMapper; 5 | import org.tallison.quaerite.core.StoredDocument; 6 | 7 | import java.nio.file.Path; 8 | import java.sql.ResultSet; 9 | import java.sql.SQLException; 10 | import java.util.Map; 11 | 12 | /** 13 | * 14 | */ 15 | public class CPUMapper implements FeatureMapper { 16 | 17 | @Override 18 | public void addFeatures(Map row, Fetcher fetcher, 19 | StoredDocument storedDocument) throws SQLException { 20 | String val = row.get("cpu_warn"); 21 | storedDocument.addNonBlankField("cpu_warn", val); 22 | 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /ingest/src/main/java/org/tallison/ingest/mappers/CaradocMapper.java: -------------------------------------------------------------------------------- 1 | 
/**
 * String clean-up helper.
 */
public class ESUtil {

    /**
     * Rewrites the control characters U+0000, U+001F and U+001E as the visible
     * text {@code u0000}, {@code u001f} and {@code u001e} respectively.
     *
     * @param s text to clean; may be {@code null}
     * @return the cleaned text, or the empty string when {@code s} is {@code null}
     */
    public static String stripIllegalUnicode(String s) {
        if (s == null) {
            return "";
        }
        // the targets are single literal characters, so plain replace is sufficient
        String cleaned = s.replace("\u0000", "u0000");
        cleaned = cleaned.replace("\u001f", "u001f");
        cleaned = cleaned.replace("\u001e", "u001e");
        return cleaned;
    }
}
addFeatures(Map row, Fetcher fetcher, StoredDocument storedDocument) throws SQLException { 20 | String val = row.get("pmd_warn"); 21 | storedDocument.addNonBlankField("pmd_warn", val); 22 | val = row.get("pmt_warn"); 23 | storedDocument.addNonBlankField("pmt_warn", val); 24 | 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /ingest/src/main/java/org/tallison/ingest/mappers/PDFResurrectMapper.java: -------------------------------------------------------------------------------- 1 | package org.tallison.ingest.mappers; 2 | 3 | import java.sql.SQLException; 4 | import java.util.Map; 5 | import java.util.regex.Matcher; 6 | import java.util.regex.Pattern; 7 | 8 | import org.apache.tika.pipes.fetcher.Fetcher; 9 | import org.tallison.ingest.FeatureMapper; 10 | import org.tallison.quaerite.core.StoredDocument; 11 | 12 | public class PDFResurrectMapper implements FeatureMapper { 13 | @Override 14 | public void addFeatures(Map row, Fetcher fetcher, StoredDocument storedDocument) 15 | throws SQLException { 16 | String stdout = row.get("pr"); 17 | if (stdout == null) { 18 | return; 19 | } 20 | Matcher m = Pattern.compile(": (\\d+)").matcher(stdout); 21 | if (m.find()) { 22 | storedDocument.addNonBlankField("pr_updates", m.group(1)); 23 | } 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /ingest/src/main/java/org/tallison/ingest/mappers/ProfileFeatureMapper.java: -------------------------------------------------------------------------------- 1 | package org.tallison.ingest.mappers; 2 | 3 | import org.apache.tika.pipes.fetcher.Fetcher; 4 | import org.tallison.ingest.FeatureMapper; 5 | import org.tallison.quaerite.core.StoredDocument; 6 | 7 | import java.nio.file.Path; 8 | import java.sql.ResultSet; 9 | import java.sql.SQLException; 10 | import java.util.ArrayList; 11 | import java.util.Collections; 12 | import java.util.HashMap; 13 | import java.util.List; 14 | import 
java.util.Map; 15 | import java.util.regex.Matcher; 16 | import java.util.regex.Pattern; 17 | 18 | import static org.tallison.ingest.mappers.QPDFFeatureMapper.joinWith; 19 | 20 | public class ProfileFeatureMapper implements FeatureMapper { 21 | @Override 22 | public void addFeatures(Map row, Fetcher fetcher, StoredDocument storedDocument) throws SQLException { 23 | 24 | storedDocument.addNonBlankField("fname", row.get("fname")); 25 | storedDocument.addNonBlankField("original_fname", row.get("fname")); 26 | storedDocument.addNonBlankField("shasum_256", row.get("shasum_256")); 27 | storedDocument.addNonBlankField("size", row.get("size")); 28 | storedDocument.addNonBlankField("collection", row.get("collection")); 29 | //these are all commoncrawl/web crawl specific... factor into another mapper? 30 | storedDocument.addNonBlankField("host_location", row.get("host_location")); 31 | storedDocument.addNonBlankField("country", row.get("country")); 32 | storedDocument.addNonBlankField("tld", row.get("tld")); 33 | storedDocument.addNonBlankField("detected_mime", row.get("detected_mime")); 34 | storedDocument.addNonBlankField("url", row.get("url")); 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /ingest/src/main/java/org/tallison/ingest/mappers/UniverseMapper.java: -------------------------------------------------------------------------------- 1 | package org.tallison.ingest.mappers; 2 | 3 | import java.sql.SQLException; 4 | import java.util.Map; 5 | 6 | import org.tallison.ingest.FeatureMapper; 7 | import org.tallison.quaerite.core.StoredDocument; 8 | 9 | import org.apache.tika.pipes.fetcher.Fetcher; 10 | 11 | public class UniverseMapper implements FeatureMapper { 12 | @Override 13 | public void addFeatures(Map row, Fetcher fetcher, StoredDocument storedDocument) 14 | throws SQLException { 15 | storedDocument.addNonBlankField("universe", row.get("universe")); 16 | storedDocument.addNonBlankField("universe_validity", 17 | 
/**
 * Mutable bag of values gathered for one document: five sets plus the largest
 * filter count seen. All members are public and are filled in directly by callers.
 */
public class QPDFResults {

    // NOTE(review): the element types of these sets are not visible here
    // (presumably Set<String>) -- confirm before parameterizing.
    public Set keys = new HashSet<>();
    public Set parentAndKeys = new HashSet<>();
    public Set typeKeys = new HashSet<>();
    public Set keyValues = new HashSet<>();
    public Set filters = new HashSet<>();
    public int maxFilterCount = 0;

    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("QPDFResults{");
        text.append("keys=").append(keys);
        text.append(", parentAndKeys=").append(parentAndKeys);
        text.append(", typeKeys=").append(typeKeys);
        text.append(", keyValues=").append(keyValues);
        text.append(", filters=").append(filters);
        text.append(", maxFilterCount=").append(maxFilterCount);
        return text.append('}').toString();
    }
}
/ingest/src/main/resources/META-INF/services/org.tallison.ingest.FeatureMapper: -------------------------------------------------------------------------------- 1 | #org.tallison.ingest.mappers.ArlingtonMapper 2 | #org.tallison.ingest.mappers.CaradocMapper 3 | #org.tallison.ingest.mappers.ClamAVMapper 4 | #org.tallison.ingest.mappers.CPUMapper 5 | #org.tallison.ingest.mappers.MutoolMapper 6 | #org.tallison.ingest.mappers.PDFBytesMapper 7 | #org.tallison.ingest.mappers.PDFCheckerMapper 8 | org.tallison.ingest.mappers.PDFInfoFeatureMapper 9 | #org.tallison.ingest.mappers.PDFMinerMapper 10 | org.tallison.ingest.mappers.ProfileFeatureMapper 11 | org.tallison.ingest.mappers.QPDFFeatureMapper 12 | org.tallison.ingest.mappers.StatusFeatureMapper 13 | #org.tallison.ingest.mappers.TikaFeatureMapper 14 | #org.tallison.ingest.mappers.MultiCompareMapper 15 | #org.tallison.ingest.mappers.PDFResurrectMapper 16 | #org.tallison.ingest.mappers.PDFFontsMapper 17 | #org.tallison.ingest.mappers.XPDFFontsMapper 18 | #org.tallison.ingest.mappers.UniverseMapper -------------------------------------------------------------------------------- /ingest/src/main/resources/important-int-keys.txt: -------------------------------------------------------------------------------- 1 | /BitsPerComponent 2 | /BitsPerCoordinate 3 | /BitsPerSample 4 | /ca 5 | /CA 6 | /Colors 7 | /ColorTransform 8 | /Count 9 | /Descent 10 | /EarlyChange 11 | /F 12 | /Ff 13 | /FL 14 | /FontWeight 15 | /FormType 16 | /FunctionType 17 | /Gamma 18 | /HalftoneType 19 | /I 20 | /LC 21 | /Length 22 | /LJ 23 | /LW 24 | /M 25 | /ML 26 | /N 27 | /O 28 | /OPM 29 | /Order 30 | /P 31 | /PaintType 32 | /PatternType 33 | /Penalty 34 | /Position 35 | /Predictor 36 | /Q 37 | /R 38 | /Rotate 39 | /RT 40 | /S 41 | /ShadingType 42 | /SM 43 | /SMaskInData 44 | /St 45 | /TilingType 46 | /TP 47 | /UserUnit 48 | /V 49 | /Version 50 | /VerticesPerRow 51 | /Volume 52 | /W 53 | /WMode 
-------------------------------------------------------------------------------- /ingest/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Root logger option 2 | log4j.rootLogger=INFO, stdout 3 | 4 | # Direct log messages to stdout 5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 6 | log4j.appender.stdout.Target=System.out 7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %t %-5p %c{1}:%L - %m%n 9 | -------------------------------------------------------------------------------- /ingest/src/main/resources/selectStar-minimal.sql: -------------------------------------------------------------------------------- 1 | select u.id, 2 | u.url as url, 3 | 's3://safedocs-cc-202109/'||p.path as fname, 4 | p.path as relpath, 5 | fetched_digest as shasum_256, 6 | 'CC-MAIN-2021-31' as collection, 7 | fetched_length as size, 8 | case 9 | when latitude is null 10 | then '' 11 | else latitude||','||longitude 12 | end as host_location, 13 | h.tld, h.country, 14 | pinfo.stderr pinfo_stderr, 15 | pinfo.stdout pinfo_stdout, 16 | pinfo.exit_value pinfo_exit, 17 | case 18 | when pinfo.stderr like 'Command Line Error: Incorrect password%' then 'encrypted' 19 | when pinfo.path is null then 'missing' 20 | when pinfo.timeout=true then 'timeout' 21 | when pinfo.exit_value <> 0 then 'crash' 22 | when length(pinfo.stderr) > 5 then 'warn' 23 | else 'success' 24 | end as pinfo_status, 25 | q.stderr q_stderr, 26 | q.exit_value q_exit, 27 | case 28 | when q.path is null then 'missing' 29 | when q.timeout=true then 'timeout' 30 | when q.exit_value <> 0 then 'crash' 31 | when length(q.stderr) > 5 then 'warn' 32 | else 'success' 33 | end as q_status 34 | from profiles p 35 | join cc_fetch f on p.path = f.path 36 | join cc_fetch_status s on f.status_id=s.id 37 | join cc_urls u on f.id=u.id 38 | join cc_hosts h on u.host=h.id 39 | 
join pdfinfo pinfo on pinfo.path=p.path 40 | join qpdf q on q.path = p.path 41 | order by u.id -------------------------------------------------------------------------------- /ingest/src/main/resources/selectStar-sample.sql: -------------------------------------------------------------------------------- 1 | select u.id as id, 2 | 'CC-MAIN-2021-31-sample' as collection, 3 | case 4 | when m.name is null or length(m.name) = 0 5 | then 'UNKNOWN' 6 | else m.name 7 | end as detected_mime, 8 | case 9 | when latitude is not null 10 | then latitude||','||longitude 11 | else '' 12 | end as host_location, 13 | h.tld, 14 | case 15 | when h.country is not null 16 | then h.country 17 | else 'UNKNOWN' 18 | end as country 19 | from sample.cc_urls u 20 | join sample.cc_hosts h on u.host=h.id 21 | join sample.cc_detected_mimes m on u.detected_mime=m.id 22 | -------------------------------------------------------------------------------- /ingest/src/test/java/org/tallison/ingest/mappers/ArlingtonMapperTest.java: -------------------------------------------------------------------------------- 1 | package org.tallison.ingest.mappers; 2 | 3 | import org.junit.Test; 4 | import org.tallison.quaerite.core.StoredDocument; 5 | 6 | import java.util.List; 7 | 8 | import static org.junit.Assert.assertEquals; 9 | import static org.junit.Assert.assertTrue; 10 | 11 | public class ArlingtonMapperTest extends MapperTest { 12 | 13 | @Test 14 | public void testBasic() throws Exception { 15 | ArlingtonMapper mapper = new ArlingtonMapper(); 16 | StoredDocument sd = new StoredDocument(""); 17 | mapper._processFile(getPath("arlington/GHOSTSCRIPT-687647-0.pdf.txt"), sd); 18 | assertEquals("Can't select any link", sd.getFields().get("a_warn")); 19 | } 20 | 21 | @Test 22 | public void testFailedToOpen() throws Exception { 23 | ArlingtonMapper mapper = new ArlingtonMapper(); 24 | StoredDocument sd = new StoredDocument(""); 25 | mapper._processFile(getPath("arlington/GHOSTSCRIPT-688076-1.pdf.txt"), sd); 26 | 
assertEquals("fail", sd.getFields().get("a_status")); 27 | } 28 | 29 | @Test 30 | public void testDiffContexts() throws Exception { 31 | //GHOSTSCRIPT-687499-0.pdf.txt 32 | ArlingtonMapper mapper = new ArlingtonMapper(); 33 | StoredDocument sd = new StoredDocument(""); 34 | mapper._processFile(getPath("arlington/GHOSTSCRIPT-687499-0.pdf.txt"), sd); 35 | boolean success = false; 36 | for (String s : (List)sd.getFields().get("a_warn")) { 37 | if (s.equals("object validated in two different contexts")) { 38 | success = true; 39 | } 40 | } 41 | assertTrue(success); 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /ingest/src/test/java/org/tallison/ingest/mappers/MapperTest.java: -------------------------------------------------------------------------------- 1 | package org.tallison.ingest.mappers; 2 | 3 | import java.io.IOException; 4 | import java.io.InputStream; 5 | import java.net.URISyntaxException; 6 | 7 | import java.nio.file.Files; 8 | import java.nio.file.Paths; 9 | import java.time.Instant; 10 | import java.time.LocalDateTime; 11 | import java.time.ZoneId; 12 | import java.time.format.DateTimeFormatter; 13 | import java.util.Locale; 14 | 15 | import org.junit.Test; 16 | 17 | public class MapperTest { 18 | 19 | InputStream getPath(String relPath) throws IOException { 20 | try { 21 | String path = "/test-documents/"+relPath; 22 | return Files.newInputStream(Paths.get(this.getClass().getResource(path).toURI())); 23 | } catch (URISyntaxException e) { 24 | throw new IOException(e); 25 | } 26 | } 27 | 28 | @Test 29 | public void testDateParsing() throws Exception { 30 | String v = "Mon Apr 1 22:12:30 2013 UTC"; 31 | v = v.replaceAll("\\s+", " ").trim(); 32 | Instant instant = LocalDateTime.parse(v, 33 | DateTimeFormatter.ofPattern( "EEE MMM d HH:mm:ss yyyy z", 34 | Locale.US ) 35 | ) 36 | .atZone(ZoneId.of("UTC")).toInstant(); 37 | System.out.println(instant); 38 | 39 | } 40 | } 41 | 
-------------------------------------------------------------------------------- /ingest/src/test/java/org/tallison/ingest/mappers/PDFCheckerMapperTest.java: -------------------------------------------------------------------------------- 1 | package org.tallison.ingest.mappers; 2 | 3 | import org.apache.tika.io.TikaInputStream; 4 | import org.junit.Test; 5 | import org.tallison.ingest.mappers.PDFCheckerMapper; 6 | import org.tallison.quaerite.core.StoredDocument; 7 | 8 | import java.io.InputStream; 9 | import java.nio.file.Path; 10 | import java.nio.file.Paths; 11 | 12 | import static org.junit.Assert.assertTrue; 13 | 14 | public class PDFCheckerMapperTest { 15 | 16 | @Test 17 | public void testBasic() throws Exception { 18 | PDFCheckerMapper mapper = new PDFCheckerMapper(); 19 | Path p = Paths.get( 20 | PDFCheckerMapperTest.class.getResource( 21 | "/test-documents/pdfchecker/GHOSTSCRIPT-696838-0.zip-0.pdf.json").toURI()); 22 | StoredDocument sd = new StoredDocument("id"); 23 | try (InputStream is = TikaInputStream.get(p)) { 24 | mapper.processJson(is, sd); 25 | } 26 | String summaryInfo = sd.getFields().get("pc_summary_info").toString(); 27 | assertTrue(summaryInfo.contains("can-be-optimized")); 28 | assertTrue(summaryInfo.contains("born-digital")); 29 | } 30 | 31 | @Test 32 | public void testFonts() throws Exception { 33 | PDFCheckerMapper mapper = new PDFCheckerMapper(); 34 | Path p = Paths.get( 35 | PDFCheckerMapperTest.class.getResource( 36 | "/test-documents/pdfchecker/fonts-PDFBOX-1002-2.pdf.json").toURI()); 37 | StoredDocument sd = new StoredDocument("id"); 38 | try (InputStream is = TikaInputStream.get(p)) { 39 | mapper.processJson(is, sd); 40 | } 41 | String summaryInfo = sd.getFields().get("pc_summary_info").toString(); 42 | assertTrue(summaryInfo.contains("can-be-optimized")); 43 | assertTrue(summaryInfo.contains("born-digital")); 44 | } 45 | 46 | } 47 | -------------------------------------------------------------------------------- 
/ingest/src/test/java/org/tallison/ingest/mappers/PDFFontsMapperTest.java: -------------------------------------------------------------------------------- 1 | package org.tallison.ingest.mappers; 2 | 3 | 4 | import java.nio.charset.StandardCharsets; 5 | import java.util.HashMap; 6 | import java.util.Map; 7 | 8 | import org.apache.commons.io.IOUtils; 9 | import org.junit.Test; 10 | import org.tallison.quaerite.core.StoredDocument; 11 | 12 | public class PDFFontsMapperTest extends MapperTest { 13 | 14 | @Test 15 | public void testBasic() throws Exception { 16 | String stdout = IOUtils.toString( 17 | getPath("pdffonts/test-basic.txt"), StandardCharsets.UTF_8); 18 | 19 | PDFFontsMapper mapper = new PDFFontsMapper(); 20 | StoredDocument sd = new StoredDocument("id"); 21 | Map row = new HashMap<>(); 22 | row.put("pdffonts_stdout", stdout); 23 | mapper.addFeatures(row, null, sd); 24 | System.out.println(sd); 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /ingest/src/test/java/org/tallison/ingest/mappers/QPDFJsonExtractorTest.java: -------------------------------------------------------------------------------- 1 | package org.tallison.ingest.mappers; 2 | 3 | import static org.junit.Assert.assertTrue; 4 | 5 | import java.io.IOException; 6 | import java.io.Reader; 7 | import java.net.URISyntaxException; 8 | import java.nio.charset.StandardCharsets; 9 | import java.nio.file.Files; 10 | import java.nio.file.Path; 11 | import java.nio.file.Paths; 12 | 13 | import org.junit.Test; 14 | import org.tallison.ingest.qpdf.QPDFJsonExtractor; 15 | import org.tallison.ingest.qpdf.QPDFResults; 16 | 17 | //these are tests for qpdf 11.x json v2 18 | public class QPDFJsonExtractorTest { 19 | 20 | @Test 21 | public void testBasic() throws Exception { 22 | try (Reader reader = getReader("/qpdfv11/qpdf.json")) { 23 | QPDFJsonExtractor ex = new QPDFJsonExtractor(); 24 | QPDFResults results = ex.extract("id", reader); 25 | 
System.out.println(results); 26 | assertTrue(results.keyValues.contains("/Creator->Microsoft® Office Word 2007")); 27 | assertTrue(results.keyValues.contains(("/CreationDate->DATE"))); 28 | } 29 | } 30 | 31 | private Reader getReader(String file) throws IOException { 32 | return Files.newBufferedReader(getPath(file), StandardCharsets.UTF_8); 33 | } 34 | 35 | private Path getPath(String file) throws IOException { 36 | try { 37 | return Paths.get(this.getClass().getResource("/test-documents/"+file).toURI()); 38 | } catch (URISyntaxException e) { 39 | throw new IOException(e); 40 | } 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /ingest/src/test/java/org/tallison/ingest/mappers/XPDFFontsMapperTest.java: -------------------------------------------------------------------------------- 1 | package org.tallison.ingest.mappers; 2 | 3 | 4 | import java.nio.charset.StandardCharsets; 5 | import java.util.HashMap; 6 | import java.util.Map; 7 | 8 | import org.apache.commons.io.IOUtils; 9 | import org.junit.Test; 10 | import org.tallison.quaerite.core.StoredDocument; 11 | 12 | public class XPDFFontsMapperTest extends MapperTest { 13 | 14 | @Test 15 | public void testBasic() throws Exception { 16 | String stdout = IOUtils.toString( 17 | getPath("xpdffonts/test-basic.txt"), StandardCharsets.UTF_8); 18 | 19 | XPDFFontsMapper mapper = new XPDFFontsMapper(); 20 | StoredDocument sd = new StoredDocument("id"); 21 | Map row = new HashMap<>(); 22 | row.put("xpdffonts_stdout", stdout); 23 | mapper.addFeatures(row, null, sd); 24 | System.out.println(sd); 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /ingest/src/test/resources/test-documents/arlington/GHOSTSCRIPT-687647-0.pdf.txt: -------------------------------------------------------------------------------- 1 | BEGIN - TestGrammar v0.4 built Dec 17 2020 22:39:13 - "/input/bugtrackers/GHOSTSCRIPT/GHOSTSCRIPT-687647-0.pdf" - PDFix 
v6.1.0 2 | Trailer 3 | Trailer->Root 4 | Trailer->Info 5 | Trailer->Root->Pages 6 | Trailer->Root->Outlines 7 | Trailer->Root->Pages->Kids 8 | Error: Can't select any link from [fn:SinceVersion(1.0,PageTreeNode),fn:SinceVersion(1.0,PageObject)] to validate provided object: [0] for object 4 9 | END 10 | -------------------------------------------------------------------------------- /ingest/src/test/resources/test-documents/arlington/GHOSTSCRIPT-688076-1.pdf.txt: -------------------------------------------------------------------------------- 1 | BEGIN - TestGrammar v0.4 built Dec 17 2020 22:39:13 - "/input/bugtrackers/GHOSTSCRIPT/GHOSTSCRIPT-688076-1.pdf" - PDFix v6.1.0 2 | Error: Failed to open: "/input/bugtrackers/GHOSTSCRIPT/GHOSTSCRIPT-688076-1.pdf" - PDFix GetError(): Failed to open document. 3 | END 4 | -------------------------------------------------------------------------------- /simple-ingester/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | file-observatory 7 | org.tallison 8 | 1.0.0-SNAPSHOT 9 | 10 | 4.0.0 11 | 12 | simple-ingester 13 | 14 | 15 | 11 16 | 11 17 | 18 | 19 | 20 | 21 | org.apache.tika 22 | tika-core 23 | 24 | 25 | org.apache.tika 26 | tika-serialization 27 | ${tika.version} 28 | 29 | 30 | org.apache.httpcomponents 31 | httpclient 32 | 4.5.13 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /simple-ingester/src/main/java/org/tallison/ingester/IngesterCLI.java: -------------------------------------------------------------------------------- 1 | package org.tallison.ingester; 2 | 3 | public class IngesterCLI { 4 | 5 | public static void main(String[] args) { 6 | 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /simple-ingester/src/main/java/org/tallison/tika/parser/ConcatenatingParser.java: -------------------------------------------------------------------------------- 1 | package 
org.tallison.tika.parser; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | import org.apache.tika.metadata.Metadata; 7 | import org.apache.tika.pipes.FetchEmitTuple; 8 | 9 | public class ConcatenatingParser { 10 | 11 | private List parsers = new ArrayList<>(); 12 | 13 | public List parse(FetchEmitTuple tuple) { 14 | List results = new ArrayList<>(); 15 | 16 | return results; 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /tika-addons/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | file-observatory 7 | org.tallison 8 | 1.0.0-SNAPSHOT 9 | 10 | 4.0.0 11 | 12 | tika-addons 13 | pom 14 | 15 | tika-pipes-reporter 16 | tika-eval-multicomparer 17 | tika-server-fuzzer 18 | 19 | 20 | 21 | 11 22 | 11 23 | 24 | 25 | -------------------------------------------------------------------------------- /tika-addons/tika-eval-multicomparer/src/main/java/org/tallison/tika/eval/multi/ListGenerator.java: -------------------------------------------------------------------------------- 1 | package org.tallison.tika.eval.multi; 2 | 3 | import java.io.File; 4 | import java.util.HashSet; 5 | import java.util.Set; 6 | 7 | public class ListGenerator { 8 | 9 | public static void main(String[] args) throws Exception { 10 | Set seen = new HashSet<>(); 11 | File tools = new File(".../data/extracts"); 12 | for (File tool : tools.listFiles()) { 13 | for (File c : tool.listFiles()) { 14 | for (File e : c.listFiles()) { 15 | String n = e.getName().replaceAll(".json", "").replaceAll(".txt", ""); 16 | if (! 
n.startsWith("._")) { 17 | seen.add(n); 18 | } 19 | } 20 | } 21 | } 22 | for (String n : seen) { 23 | System.out.println(n); 24 | } 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /tika-addons/tika-pipes-reporter/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | tika-addons 7 | org.tallison 8 | 1.0.0-SNAPSHOT 9 | 10 | 4.0.0 11 | 12 | tika-pipes-reporter 13 | 14 | 15 | 11 16 | 11 17 | 18 | 19 | 20 | 21 | org.postgresql 22 | postgresql 23 | 24 | 25 | org.apache.tika 26 | tika-core 27 | provided 28 | 29 | 30 | 31 | 32 | 33 | 34 | maven-shade-plugin 35 | ${maven.shade.version} 36 | 37 | 38 | package 39 | 40 | shade 41 | 42 | 43 | 44 | false 45 | 46 | 47 | 48 | *:* 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | -------------------------------------------------------------------------------- /tika-addons/tika-server-fuzzer/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | tika-addons 7 | org.tallison 8 | 1.0.0-SNAPSHOT 9 | 10 | 4.0.0 11 | 12 | tika-server-fuzzer 13 | 14 | 15 | 14 16 | 14 17 | 18 | 19 | 20 | 21 | org.apache.tika 22 | tika-core 23 | ${tika.version} 24 | 25 | 26 | org.apache.tika 27 | tika-fuzzing 28 | ${tika.version} 29 | 30 | 31 | org.apache.tika 32 | tika-serialization 33 | ${tika.version} 34 | 35 | 36 | org.apache.cxf 37 | cxf-rt-rs-client 38 | ${cxf.version} 39 | 40 | 41 | 42 | -------------------------------------------------------------------------------- /tika-containers/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | file-observatory 7 | org.tallison 8 | 1.0.0-SNAPSHOT 9 | 10 | 4.0.0 11 | 12 | tika-containers 13 | pom 14 | 15 | tika-pdftotext 16 | tika-pdfchecker 17 | tika-pdfspelunker 18 | tika-pdfjs 19 | tika-arlington 20 | tika-pipes-pdfinfo 21 | tika-pipes-siegfried 22 | 23 | 24 | 25 | 
-------------------------------------------------------------------------------- /tika-containers/tika-arlington/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM amd64/openjdk:11.0.8-slim-buster as GRAMMAR_CHECKER_BUILDER 2 | 3 | RUN apt-get update && apt-get install curl g++-8 gcc-8 cmake git -y 4 | 5 | RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-8 800 --slave /usr/bin/g++ g++ /usr/bin/g++-8 6 | 7 | RUN git clone https://github.com/pdf-association/arlington-pdf-model /arlington-pdf-model && \ 8 | cd /arlington-pdf-model && git checkout fab5b58 9 | 10 | RUN cd /arlington-pdf-model/TestGrammar && \ 11 | cmake -B cmake-linux/debug -DPDFSDK_PDFIUM=ON -DCMAKE_BUILD_TYPE=Debug . && \ 12 | cmake --build cmake-linux/debug --config Debug 13 | 14 | RUN mkdir /tika-bin && cd /tika-bin && \ 15 | curl https://repo1.maven.org/maven2/org/apache/tika/tika-server-core/2.4.1/tika-server-core-2.4.1.jar --output tika-server-core.jar 16 | 17 | 18 | FROM amd64/openjdk:11.0.8-slim-buster 19 | 20 | COPY --from=GRAMMAR_CHECKER_BUILDER /arlington-pdf-model/TestGrammar/bin/linux /arlington-pdf-model/bin 21 | 22 | COPY --from=GRAMMAR_CHECKER_BUILDER /arlington-pdf-model/tsv/latest /arlington-pdf-model/tsv/latest 23 | 24 | RUN mkdir /tika-bin 25 | COPY --from=GRAMMAR_CHECKER_BUILDER /tika-bin/tika-server-core.jar /tika-bin/tika-server-core.jar 26 | COPY my-tika-config.xml /tika-bin/my-tika-config.xml 27 | 28 | #once we upgrade to > tika 2.4.1, we can get rid of this custom regex parser 29 | COPY target/tika-arlington-1.0.0-SNAPSHOT.jar /tika-bin/tika-arlington.jar 30 | 31 | ENTRYPOINT ["java","-cp","/tika-bin/*", "org.apache.tika.server.core.TikaServerCli", "-h", "0.0.0.0", "-c", "/tika-bin/my-tika-config.xml"] 32 | -------------------------------------------------------------------------------- /tika-containers/tika-arlington/pom.xml: -------------------------------------------------------------------------------- 1 | 2 
| 5 | 6 | tika-containers 7 | org.tallison 8 | 1.0.0-SNAPSHOT 9 | 10 | 4.0.0 11 | 12 | tika-arlington 13 | 14 | 15 | 16 | org.apache.tika 17 | tika-core 18 | provided 19 | 20 | 21 | 22 | 23 | 24 | org.apache.maven.plugins 25 | maven-shade-plugin 26 | ${maven.shade.version} 27 | 28 | 29 | package 30 | 31 | shade 32 | 33 | 34 | 35 | false 36 | 37 | 38 | 39 | *:* 40 | 41 | 42 | 43 | 44 | org.tallison.observatory.RegexCaptureParser 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | -------------------------------------------------------------------------------- /tika-containers/tika-exiftool/Dockerfile: -------------------------------------------------------------------------------- 1 | #slight modification from: https://github.com/Miljar/exiftool-docker/blob/master/Dockerfile 2 | FROM amd64/openjdk:11.0.8-slim-buster 3 | ENV EXIFTOOL_VERSION=12.38 4 | ENV TIKA_VERSION=2.2.1 5 | 6 | RUN apk add --no-cache perl make 7 | RUN cd /tmp \ 8 | && wget http://www.sno.phy.queensu.ca/~phil/exiftool/Image-ExifTool-${EXIFTOOL_VERSION}.tar.gz \ 9 | && tar -zxvf Image-ExifTool-${EXIFTOOL_VERSION}.tar.gz \ 10 | && cd Image-ExifTool-${EXIFTOOL_VERSION} \ 11 | && perl Makefile.PL \ 12 | && make test \ 13 | && make install \ 14 | && cd .. \ 15 | && rm -rf Image-ExifTool-${EXIFTOOL_VERSION} 16 | 17 | RUN mkdir /tika-bin \ 18 | && cd /tika-bin \ 19 | && wget https://repo1.maven.org/maven2/org/apache/tika/tika-server-core/${TIKA_VERSION}/tika-server-core-{$TIKA_VERSION}.jar 20 | 21 | COPY my-tika-config.xml /tika-bin/my-tika-config.xml 22 | ENTRYPOINT ["java","-cp","/tika-bin/*", "org.apache.tika.server.core.TikaServerCli", "-h", "0.0.0.0", "-c", "/tika-bin/my-tika-config.xml"] 23 | 24 | #e.g. 
25 | #docker run -d -p 9998:9998 -------------------------------------------------------------------------------- /tika-containers/tika-pdfchecker/Dockerfile: -------------------------------------------------------------------------------- 1 | #slight modification from: 2 | #https://stackoverflow.com/questions/61272431/installing-poppler-utils-of-version-0-82-in-docker 3 | #See also: https://github.com/quarkness/docker-poppler/blob/master/Dockerfile 4 | FROM amd64/openjdk:11.0.8-slim-buster as POPPLER_BUILDER 5 | RUN mkdir /pdfchecker-bin 6 | 7 | COPY pdf-checker.tgz /pdfchecker-bin/pdf-checker.tgz 8 | RUN cd /pdfchecker-bin && tar -xzvf pdf-checker.tgz 9 | 10 | RUN mkdir /tika-bin 11 | COPY target/tika-pdfchecker-1.0.0-SNAPSHOT.jar /tika-bin/tika-pdfchecker-1.0.0-SNAPSHOT.jar 12 | 13 | #find a more elegant way of grabbing this after we release it 14 | COPY tika-server-core-2.0.0-SNAPSHOT.jar /tika-bin/tika-server-core-2.0.0-SNAPSHOT.jar 15 | COPY my-tika-config.xml /tika-bin/my-tika-config.xml 16 | ENTRYPOINT ["java","-cp","/tika-bin/*", "org.apache.tika.server.core.TikaServerCli", "-h", "0.0.0.0", "-c", "/tika-bin/my-tika-config.xml"] 17 | 18 | #e.g. 19 | #docker run -d -p 9998:9998 -------------------------------------------------------------------------------- /tika-containers/tika-pdfchecker/my-tika-config.xml: -------------------------------------------------------------------------------- 1 | 2 | 18 | 19 | 20 | 21 | 22 | 120000 23 | 24 | 25 | 26 | 27 | 9998 28 | 180000 29 | false 30 | 10000000 31 | 32 | -Xmx2g 33 | 34 | 35 | rmeta 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /tika-containers/tika-pdfchecker/src/main/resources/META-INF/services/org.apache.tika.parser.Parser: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. 
See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | org.tallison.tika.parsers.pdfchecker.PDFChecker -------------------------------------------------------------------------------- /tika-containers/tika-pdfchecker/src/test/java/TikaPDFToTextTest.java: -------------------------------------------------------------------------------- 1 | import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream; 2 | import org.apache.cxf.jaxrs.client.WebClient; 3 | import org.apache.tika.TikaTest; 4 | import org.apache.tika.metadata.Metadata; 5 | import org.apache.tika.metadata.serialization.JsonMetadataList; 6 | import org.junit.Ignore; 7 | import org.junit.Test; 8 | 9 | import javax.ws.rs.core.Response; 10 | import java.io.InputStream; 11 | import java.io.InputStreamReader; 12 | import java.io.Reader; 13 | import java.util.List; 14 | 15 | import static java.nio.charset.StandardCharsets.UTF_8; 16 | import static org.junit.Assert.assertEquals; 17 | 18 | public class TikaPDFToTextTest extends TikaTest { 19 | private static String END_POINT = "http://localhost:9998"; 20 | private static final String META_PATH = "/rmeta"; 21 | 22 | @Test 23 | @Ignore("once container is running") 24 | public void testBasic() throws Exception { 25 | Response response = WebClient 26 | .create(END_POINT + META_PATH) 27 | 
.accept("application/json") 28 | .acceptEncoding("gzip") 29 | .put(ClassLoader.getSystemResourceAsStream("test-documents/testPDF.pdf")); 30 | 31 | Reader reader = null; 32 | String encoding = response.getHeaderString("content-encoding"); 33 | if ("gzip".equals(encoding)) { 34 | reader = new InputStreamReader(new GzipCompressorInputStream((InputStream) response.getEntity()), UTF_8); 35 | } else { 36 | reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8); 37 | } 38 | List metadataList = JsonMetadataList.fromJson(reader); 39 | assertEquals(1, metadataList.size()); 40 | assertEquals("born-digital", metadataList.get(0).get("pc_summary_info")); 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /tika-containers/tika-pdfchecker/src/test/resources/test-documents/testPDF.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tballison/file-observatory/5677da227b1a70b1220ae05005b6064210858504/tika-containers/tika-pdfchecker/src/test/resources/test-documents/testPDF.pdf -------------------------------------------------------------------------------- /tika-containers/tika-pdfchecker/tika-server-core-2.0.0-SNAPSHOT.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tballison/file-observatory/5677da227b1a70b1220ae05005b6064210858504/tika-containers/tika-pdfchecker/tika-server-core-2.0.0-SNAPSHOT.jar -------------------------------------------------------------------------------- /tika-containers/tika-pdfium/my-args.gn: -------------------------------------------------------------------------------- 1 | # Set build arguments here. See `gn help buildargs`. 2 | 3 | # need this to build pdfium_test 4 | pdf_is_standalone = true 5 | 6 | #other options are commented out below 7 | #use_goma = true # Googlers only. Make sure goma is installed and running first. 
8 | #is_debug = true # Enable debugging features. 9 | 10 | # Set true to enable experimental Skia backend. 11 | #pdf_use_skia = false 12 | # Set true to enable experimental Skia backend (paths only). 13 | #pdf_use_skia_paths = false 14 | 15 | #pdf_enable_xfa = true # Set false to remove XFA support (implies JS support). 16 | #pdf_enable_v8 = true # Set false to remove Javascript support. 17 | #is_component_build = false # Disable component build (Though it should work) -------------------------------------------------------------------------------- /tika-containers/tika-pdfjs-selenium/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | tika-containers 7 | org.tallison 8 | 1.0.0-SNAPSHOT 9 | 10 | 4.0.0 11 | 12 | tika-pdfjs-selenium 13 | 14 | 15 | 11 16 | 11 17 | 3.141.59 18 | 19 | 20 | 23 | 24 | 25 | org.seleniumhq.selenium 26 | selenium-api 27 | ${selenium.version} 28 | 29 | 30 | org.seleniumhq.selenium 31 | selenium-remote-driver 32 | ${selenium.version} 33 | 34 | 35 | org.seleniumhq.selenium 36 | selenium-server 37 | ${selenium.version} 38 | 39 | 40 | 41 | -------------------------------------------------------------------------------- /tika-containers/tika-pdfjs-selenium/src/main/java/FirefoxSeleniumExample.java: -------------------------------------------------------------------------------- 1 | import org.openqa.selenium.firefox.FirefoxBinary; 2 | import org.openqa.selenium.firefox.FirefoxDriver; 3 | import org.openqa.selenium.firefox.FirefoxOptions; 4 | 5 | public class FirefoxSeleniumExample { 6 | public static void main(String[] args) { 7 | FirefoxBinary firefoxBinary = new FirefoxBinary(); 8 | firefoxBinary.addCommandLineOptions("--headless"); 9 | System.setProperty("webdriver.gecko.driver", "/Users/allison/tools/firefox/geckodriver"); 10 | FirefoxOptions firefoxOptions = new FirefoxOptions(); 11 | firefoxOptions.setBinary(firefoxBinary); 12 | FirefoxDriver driver = new FirefoxDriver(firefoxOptions); 13 
| } 14 | } 15 | -------------------------------------------------------------------------------- /tika-containers/tika-pdfjs/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM node:16.13.0 2 | 3 | #make sure you have enough memory to build this --memory=900 4 | RUN npm install -g gulp-cli 5 | 6 | #Option A: grab and build a specific release 7 | #RUN apt-get update && apt-get -y install wget openjdk-11-jre 8 | #RUN mkdir /builddir && cd /builddir && \ 9 | # wget https://github.com/mozilla/pdf.js/archive/refs/tags/v2.11.338.tar.gz && \ 10 | # tar -xzvf v2.11.338.tar.gz && mv pdf.js-2.11.338 pdf.js && \ 11 | # cd pdf.js && npm install && gulp dist-install && \ 12 | # rm /builddir/v2.11.338.tar.gz 13 | 14 | #Option B: build from main 15 | RUN apt-get update && apt-get -y install git openjdk-11-jre 16 | RUN mkdir /builddir && cd /builddir && \ 17 | git clone https://github.com/mozilla/pdf.js && cd pdf.js && \ 18 | npm install && gulp dist-install 19 | 20 | COPY js/my-getinfo.js /builddir/pdf.js/examples/node/my-getinfo.js 21 | 22 | # TODO: figure two stage build and what we can jettison for a smaller container 23 | 24 | RUN mkdir /tika-bin/ 25 | COPY target/tika-pdfjs-1.0.0-SNAPSHOT.jar /tika-bin/ 26 | #find a more elegant way of grabbing this after we release it 27 | COPY tika-server-standard-2.1.1-SNAPSHOT.jar /tika-bin/ 28 | COPY my-tika-config.xml /tika-bin/my-tika-config.xml 29 | ENTRYPOINT ["java","-cp","/tika-bin/*", "org.apache.tika.server.core.TikaServerCli", "-h", "0.0.0.0", "-c", "/tika-bin/my-tika-config.xml"] 30 | -------------------------------------------------------------------------------- /tika-containers/tika-pdfjs/src/test/resources/test-documents/test-basic.txt: -------------------------------------------------------------------------------- 1 | # Document Loaded 2 | Number of Pages: 4 3 | 4 | # Metadata Is Loaded 5 | ## Info 6 | { 7 | "PDFFormatVersion": "1.5", 8 | "Language": "en-US", 9 | 
"EncryptFilterName": null, 10 | "IsLinearized": false, 11 | "IsAcroFormPresent": false, 12 | "IsXFAPresent": false, 13 | "IsCollectionPresent": false, 14 | "IsSignaturesPresent": false, 15 | "Producer": "Microsoft® Word 2016", 16 | "Creator": "Microsoft® Word 2016", 17 | "CreationDate": "D:20210421211209+00'00'", 18 | "ModDate": "D:20210421211209+00'00'" 19 | } 20 | 21 | # Page 1 22 | Size: 612x792 23 | 24 | Warning: TT: undefined function: 32 25 | Warning: fetchStandardFontData: failed to fetch file "FoxitSans.pfb" with "UnknownErrorException: The standard font "baseUrl" parameter must be specified, ensure that the "standardFontDataUrl" API parameter is provided.". 26 | Warning: fetchStandardFontData: failed to fetch file "FoxitSerif.pfb" with "UnknownErrorException: The standard font "baseUrl" parameter must be specified, ensure that the "standardFontDataUrl" API parameter is provided.". 27 | ## Text Content 28 | here is some page 1 content 29 | 30 | # Page 2 31 | Size: 612x792 32 | 33 | ## Text Content 34 | some page 2 content 35 | 36 | # Page 3 37 | Size: 612x792 38 | 39 | ## Text Content 40 | Some page 3 content 41 | 42 | # Page 4 43 | Size: 612x792 44 | 45 | Warning: fetchStandardFontData: failed to fetch file "FoxitSerifBold.pfb" with "UnknownErrorException: The standard font "baseUrl" parameter must be specified, ensure that the "standardFontDataUrl" API parameter is provided.". 
46 | ## Text Content 47 | Some more text 48 | 49 | # End of Document 50 | -------------------------------------------------------------------------------- /tika-containers/tika-pdfjs/src/test/resources/test-documents/test-xmp.txt: -------------------------------------------------------------------------------- 1 | # Random Key: 765668851 2 | # Document Loaded key=765668851 3 | # Number of Pages: 2 key=765668851 4 | 5 | # Metadata Is Loaded key=765668851 6 | ## Info key=765668851 7 | { 8 | "PDFFormatVersion": "1.6", 9 | "Language": null, 10 | "EncryptFilterName": null, 11 | "IsLinearized": true, 12 | "IsAcroFormPresent": false, 13 | "IsXFAPresent": false, 14 | "IsCollectionPresent": false, 15 | "IsSignaturesPresent": false, 16 | "CreationDate": "D:20210402144320-04'00'", 17 | "Creator": "PScript5.dll Version 5.2.2", 18 | "ModDate": "D:20210402154701-04'00'", 19 | "Producer": "Acrobat Distiller 20.0 (Windows)", 20 | "Title": "18-956 Google LLC v. Oracle America, Inc. (04/05/2021)" 21 | } 22 | 23 | ## Metadata key=765668851 24 | { 25 | "xmp:modifydate": "2021-04-02T15:47:01-04:00", 26 | "xmp:createdate": "2021-04-02T14:43:20-04:00", 27 | "xmp:metadatadate": "2021-04-02T15:47:01-04:00", 28 | "xmp:creatortool": "PScript5.dll Version 5.2.2", 29 | "dc:format": "application/pdf", 30 | "dc:creator": [], 31 | "dc:title": "18-956 Google LLC v. Oracle America, Inc. (04/05/2021)", 32 | "xmpmm:documentid": "uuid:1cd7d060-dd8f-463c-bfa8-18072b031ff2", 33 | "xmpmm:instanceid": "uuid:327587b5-f503-4f7a-b4b2-444c4ead47ad", 34 | "pdf:producer": "Acrobat Distiller 20.0 (Windows)" 35 | } 36 | 37 | # Page 1 key=765668851 38 | # Size: 612x792 key=765668851 39 | 40 | Info: TT: CALL empty stack (or invalid entry). 41 | Info: TT: CALL empty stack (or invalid entry). 42 | Info: TT: CALL empty stack (or invalid entry). 43 | Info: TT: CALL empty stack (or invalid entry). 
44 | Warning: fetchStandardFontData: failed to fetch file "FoxitSerifBold.pfb" with "UnknownErrorException: The standard font "baseUrl" parameter must be specified, ensure that the "standardFontDataUrl" API parameter is provided.". 45 | Info: page=1 - getTextContent: time=141ms 46 | ## Text Content key=765668851 47 | page 1 content 48 | 49 | # Page 2 key=765668851 50 | # Size: 612x792 key=765668851 51 | 52 | Info: page=2 - getTextContent: time=33ms 53 | ## Text Content key=765668851 54 | page 2 content 55 | 56 | # End of Document key=765668851 -------------------------------------------------------------------------------- /tika-containers/tika-pdfjs/src/test/resources/test-documents/test-xmp2.txt: -------------------------------------------------------------------------------- 1 | # Random Key: 367480315 2 | # Document Loaded key=367480315 3 | # Number of Pages: 1 key=367480315 4 | 5 | # Metadata Is Loaded key=367480315 6 | 7 | ## Info key=367480315 8 | { 9 | "PDFFormatVersion": "1.5", 10 | "IsLinearized": false, 11 | "IsAcroFormPresent": true, 12 | "IsXFAPresent": false, 13 | "Trapped": { 14 | "name": "False" 15 | }, 16 | "Custom": { 17 | "PTEX.Fullbanner": "This is pdfTeX, Version 3.14159265-2.6-1.40.18 (TeX Live 2017/Debian) kpathsea version 6.2.3" 18 | } 19 | } 20 | 21 | # Page 1 key=367480315 22 | # Size: 595.276x841.89 key=367480315 23 | Info: page=1 - getTextContent: time=40ms 24 | ## Text Content key=367480315 25 | Name Copy Reset 26 | # End of Document key=367480315 27 | -------------------------------------------------------------------------------- /tika-containers/tika-pdfspelunker/src/main/resources/META-INF/services/org.apache.tika.parser.Parser: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 
4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | org.tallison.tika.parsers.pdf.PDFSpelunker -------------------------------------------------------------------------------- /tika-containers/tika-pdfspelunker/src/main/resources/org/apache/tika/mime/custom-mimetypes.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /tika-containers/tika-pdfspelunker/src/test/java/org/tallison/tika/parsers/image/ICCImageParserTest.java: -------------------------------------------------------------------------------- 1 | package org.tallison.tika.parsers.image; 2 | 3 | import java.io.InputStream; 4 | 5 | import org.junit.Test; 6 | 7 | import org.apache.tika.TikaTest; 8 | import org.apache.tika.config.TikaConfig; 9 | import org.apache.tika.parser.AutoDetectParser; 10 | import org.apache.tika.parser.Parser; 11 | 12 | public class ICCImageParserTest extends TikaTest { 13 | 14 | @Test 15 | public void testBasic() throws Exception { 16 | try (InputStream is = this.getClass().getResourceAsStream("/config/my-tika-config.xml")) { 17 | Parser p = new AutoDetectParser(new TikaConfig(is)); 18 | debug(getRecursiveMetadata("baseball.jpg", p)); 19 | } 20 | } 21 | } 22 | 
-------------------------------------------------------------------------------- /tika-containers/tika-pdfspelunker/src/test/resources/test-documents/baseball.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tballison/file-observatory/5677da227b1a70b1220ae05005b6064210858504/tika-containers/tika-pdfspelunker/src/test/resources/test-documents/baseball.jpg -------------------------------------------------------------------------------- /tika-containers/tika-pdfspelunker/src/test/resources/test-documents/icc-reports/non-compliant1.txt: -------------------------------------------------------------------------------- 1 | Profile: 'data/iccs/7c/68/7c68fd34c873bf7db8faa3a1133d176a7c92a88f8a05d482d406857ee212ce98' 2 | Profile ID: e798cc1d9f659a6155ac35ad9ac383bb 3 | Size: 1829077(0x1be8d5) bytes 4 | 5 | Header 6 | ------ 7 | Attributes: Reflective | Glossy 8 | Cmm: Heidelberg 9 | Creation Date: 2/28/2007 08:00:00 10 | Creator: 'HDM ' = 48444D20 11 | Data Color Space: CmykData 12 | Flags EmbeddedProfileFalse | UseAnywhere 13 | PCS Color Space: LabData 14 | Platform: Unknown 15 | Rendering Intent: Relative Colorimetric 16 | Profile Class: OutputClass 17 | Profile SubClass: Not Defined 18 | Version: 2.40 19 | Illuminant: X=0.9642, Y=1.0000, Z=0.8249 20 | Spectral PCS: NoSpectralData 21 | Spectral PCS Range: Not Defined 22 | BiSpectral Range: Not Defined 23 | MCS Color Space: Not Defined 24 | 25 | Profile Tags 26 | ------------ 27 | Tag ID Offset Size Pad 28 | ---- ------ ------ ---- --- 29 | copyrightTag 'cprt' 288 103 1 30 | mediaWhitePointTag 'wtpt' 392 20 0 31 | AToB0Tag 'A2B0' 412 396852 0 32 | BToA0Tag 'B2A0' 397264 291132 0 33 | gamutTag 'gamt' 688396 33840 0 34 | AToB1Tag 'A2B1' 722236 396852 0 35 | BToA1Tag 'B2A1' 1119088 291132 0 36 | AToB2Tag 'A2B2' 412 396852 0 37 | BToA2Tag 'B2A2' 1410220 291132 0 38 | grayTRCTag 'kTRC' 1701352 524 0 39 | Unknown 'hd10' = 68643130 'hd10' 1701876 364 0 40 | 
profileDescriptionTag 'desc' 1702240 152 0 41 | charTargetTag 'targ' 1702392 126685 0 42 | 43 | 44 | Validation Report 45 | ----------------- 46 | Profile violates ICC specification 47 | 48 | Warning! - OutputClassTag exclusion test failed. 49 | Warning! - Unknown 'hd10' = 68643130: - Unknown Tag. 50 | NonCompliant! - File size is not a multiple of 4 bytes (last tag needs padding?). -------------------------------------------------------------------------------- /tika-containers/tika-pdfspelunker/src/test/resources/test-documents/icc-reports/not-icc1.txt: -------------------------------------------------------------------------------- 1 | Unable to parse 'data/blah.tgz' as ICC profile! 2 | 3 | Validation Report 4 | ----------------- 5 | Profile has Critical Error(s) that violate ICC specification. 6 | 7 | Error! - - Unable to read profile!** 8 | Profile has invalid structure! -------------------------------------------------------------------------------- /tika-containers/tika-pdfspelunker/src/test/resources/test-documents/icc-reports/not-icc2.txt: -------------------------------------------------------------------------------- 1 | Unable to parse 'data/iccs/86/20/862090af4442059ff416679acb001ae23acc18852f2dc430d0845c061b937e9c' as ICC profile! 2 | 3 | Validation Report 4 | ----------------- 5 | Profile has Critical Error(s) that violate ICC specification. 6 | 7 | NonCompliant! - Bad Header File Size 8 | Error! - - AToB0Tag - Tag has invalid structure! 9 | Error! - - AToB1Tag - Tag has invalid structure! 10 | Error! - - AToB2Tag - Tag has invalid structure! 11 | Error! - - BToA0Tag - Tag has invalid structure! 12 | Error! - - BToA1Tag - Tag has invalid structure! 13 | Error! - - BToA2Tag - Tag has invalid structure! 14 | Error! - - gamutTag - Tag has invalid structure! 15 | Error! - - Unknown 'AS00' = 41533030 - Tag has invalid structure! 
-------------------------------------------------------------------------------- /tika-containers/tika-pdfspelunker/src/test/resources/test-documents/testPDF.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tballison/file-observatory/5677da227b1a70b1220ae05005b6064210858504/tika-containers/tika-pdfspelunker/src/test/resources/test-documents/testPDF.pdf -------------------------------------------------------------------------------- /tika-containers/tika-pdftotext/Dockerfile: -------------------------------------------------------------------------------- 1 | #slight modification from: 2 | #https://stackoverflow.com/questions/61272431/installing-poppler-utils-of-version-0-82-in-docker 3 | #See also: https://github.com/quarkness/docker-poppler/blob/master/Dockerfile 4 | FROM amd64/openjdk:11.0.8-slim-buster as POPPLER_BUILDER 5 | #poppler/data pairs 6 | #21.02.0/0.4.10 7 | #20.09.0/0.4.9 8 | #0.86.1/0.4.9 9 | 10 | RUN apt-get update && apt-get install bash wget build-essential cmake libfreetype6-dev pkg-config libfontconfig-dev libjpeg-dev libopenjp2-7-dev -y 11 | RUN wget https://poppler.freedesktop.org/poppler-data-0.4.11.tar.gz \ 12 | && tar -xf poppler-data-0.4.11.tar.gz \ 13 | && cd poppler-data-0.4.11 \ 14 | && make install \ 15 | && cd .. \ 16 | && wget https://poppler.freedesktop.org/poppler-21.11.0.tar.xz \ 17 | && tar -xf poppler-21.11.0.tar.xz \ 18 | && cd poppler-21.11.0 \ 19 | && mkdir build \ 20 | && cd build \ 21 | && cmake -DENABLE_BOOST=OFF .. 
\ 22 | && make \ 23 | && make install \ 24 | && ldconfig 25 | #CMD tail -f /dev/null 26 | 27 | FROM amd64/openjdk:11.0.8-slim-buster 28 | COPY --from=POPPLER_BUILDER /usr/lib /usr/lib 29 | COPY --from=POPPLER_BUILDER /usr/local /usr/local 30 | 31 | RUN apt-get update && apt-get install bash ca-certificates \ 32 | libjpeg62-turbo libcairo2 libxml2 \ 33 | fontconfig liblcms2-2 \ 34 | libtiff5 -y 35 | # &&\ 36 | #libopenjpeg5 37 | #libstdc++6 && \ 38 | #addgroup -S appgroup && \ 39 | #adduser -S appuser -G appgroup -h /work && \ 40 | #echo "/usr/local/lib64:/lib:/usr/local/lib:/usr/lib:/usr/lib/x86_64-linux-gnu:/lib64:/lib/x86_64-linux-gnu" > /etc/ld-musl-x86_64.path 41 | 42 | RUN mkdir /tika-bin 43 | 44 | #find a more elegant way of grabbing this after we release it 45 | COPY tika-server-standard-2.1.1-SNAPSHOT.jar /tika-bin/tika-server-standard-2.1.1-SNAPSHOT.jar 46 | COPY my-tika-config.xml /tika-bin/my-tika-config.xml 47 | ENTRYPOINT ["java","-cp","/tika-bin/*", "org.apache.tika.server.core.TikaServerCli", "-h", "0.0.0.0", "-c", "/tika-bin/my-tika-config.xml"] 48 | 49 | #e.g. 
50 | #docker run -d -p 9998:9998 -------------------------------------------------------------------------------- /tika-containers/tika-pipes-pdfinfo/Dockerfile: -------------------------------------------------------------------------------- 1 | #slight modification from: 2 | #https://stackoverflow.com/questions/61272431/installing-poppler-utils-of-version-0-82-in-docker 3 | #See also: https://github.com/quarkness/docker-poppler/blob/master/Dockerfile 4 | FROM debian:bullseye-20230227-slim as POPPLER_BUILDER 5 | ## can't build w 22.01.0 with the current dependencies...need to figure out how to 6 | # migrate to 22.x 7 | ENV POPPLER_VERSION=23.03.0 8 | ENV POPPLER_DATA_VERSION=0.4.12 9 | 10 | RUN apt-get update && apt-get install locales bash wget build-essential cmake libfreetype6-dev pkg-config \ 11 | libfontconfig-dev libjpeg-dev libopenjp2-7-dev \ 12 | #these are for temurin 13 | apt-transport-https gnupg -y 14 | RUN wget https://poppler.freedesktop.org/poppler-data-${POPPLER_DATA_VERSION}.tar.gz \ 15 | && tar -xf poppler-data-${POPPLER_DATA_VERSION}.tar.gz \ 16 | && cd poppler-data-${POPPLER_DATA_VERSION} \ 17 | && make install \ 18 | && cd .. 
\ 19 | && wget https://poppler.freedesktop.org/poppler-${POPPLER_VERSION}.tar.xz \ 20 | && tar -xf poppler-${POPPLER_VERSION}.tar.xz \ 21 | && cd poppler-${POPPLER_VERSION} \ 22 | && mkdir build \ 23 | && cd build \ 24 | && cmake -DENABLE_BOOST=OFF ..\ 25 | && make \ 26 | && make install \ 27 | && ldconfig 28 | 29 | RUN wget -O - https://packages.adoptium.net/artifactory/api/gpg/key/public | apt-key add - \ 30 | && echo "deb https://packages.adoptium.net/artifactory/deb $(awk -F= '/^VERSION_CODENAME/{print$2}' /etc/os-release) main" | tee /etc/apt/sources.list.d/adoptium.list \ 31 | && apt-get update && apt-get install temurin-11-jre -y 32 | 33 | RUN mkdir /tika-bin 34 | COPY target/tika-pipes-pdfinfo-1.0.0-SNAPSHOT.jar /tika-bin 35 | COPY log4j2.xml /tika-bin 36 | COPY pipes-log4j2.xml /tika-bin 37 | 38 | 39 | ENV LANG en_US.UTF-8 40 | ENV LANGUAGE en_US:en 41 | ENV LC_ALL en_US.UTF-8 42 | 43 | ENTRYPOINT ["java","-Dlog4j.configurationFile=/tika-bin/log4j2.xml", "-jar","/tika-bin/tika-pipes-pdfinfo-1.0.0-SNAPSHOT.jar"] 44 | #need to specify tika-config.xml on commandline, e.g.: 45 | #docker run -v /Users/blah/Desktop:/data -v /Users/blah/Desktop/config:/tika-config -p 2345:2345 46 | #--name tika-pipes-container tika-pipes-pdfinfo /tika-config/my-tika-config.xml 47 | 48 | #WORKDIR /work 49 | 50 | -------------------------------------------------------------------------------- /tika-containers/tika-pipes-pdfinfo/log4j2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /tika-containers/tika-pipes-pdfinfo/pipes-log4j2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | -------------------------------------------------------------------------------- 
/tika-containers/tika-pipes-siegfried/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM golang:1.20.2-bullseye 2 | # Image for the tika-pipes-siegfried jar: installs siegfried's sf command via go install, then a Temurin 11 JRE to run the jar 3 | 4 | RUN apt-get update && apt-get install file \ 5 | #these are for temurin 6 | apt-transport-https gnupg -y 7 | RUN go install github.com/richardlehane/siegfried/cmd/sf@latest && sf -update 8 | 9 | RUN wget -O - https://packages.adoptium.net/artifactory/api/gpg/key/public | apt-key add - \ 10 | && echo "deb https://packages.adoptium.net/artifactory/deb $(awk -F= '/^VERSION_CODENAME/{print$2}' /etc/os-release) main" | tee /etc/apt/sources.list.d/adoptium.list \ 11 | && apt-get update && apt-get install temurin-11-jre -y 12 | 13 | RUN mkdir /tika-bin 14 | COPY target/tika-pipes-siegfried-1.0.0-SNAPSHOT.jar /tika-bin 15 | COPY log4j2.xml /tika-bin 16 | COPY pipes-log4j2.xml /tika-bin 17 | 18 | 19 | ENV LANG en_US.UTF-8 20 | ENV LANGUAGE en_US:en 21 | ENV LC_ALL en_US.UTF-8 22 | 23 | ENTRYPOINT ["java","-Dlog4j.configurationFile=/tika-bin/log4j2.xml", "-jar","/tika-bin/tika-pipes-siegfried-1.0.0-SNAPSHOT.jar"] 24 | #need to specify tika-config.xml on commandline, e.g.: 25 | #docker run -v /Users/blah/Desktop:/data -v /Users/blah/Desktop/config:/tika-config -p 2345:2345 26 | #--name tika-pipes-container tika-pipes-siegfried /tika-config/my-tika-config.xml 27 | 28 | #WORKDIR /work 29 | 30 | -------------------------------------------------------------------------------- /tika-containers/tika-pipes-siegfried/log4j2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /tika-containers/tika-pipes-siegfried/pipes-log4j2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 
-------------------------------------------------------------------------------- /tika-containers/tika-pypdf2/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.10.4-slim-buster 2 | 3 | RUN apt-get update && \ 4 | apt-get install -y --no-install-recommends \ 5 | openjdk-11-jre 6 | 7 | #TODO 8 | RUN python -m pip install --upgrade pip && pip install pypdf2==2.1.0 9 | 10 | RUN mkdir /pypdf2cli 11 | COPY scripts/PyPDF2Cli.py /pypdf2cli 12 | RUN chmod a+x /pypdf2cli/PyPDF2Cli.py 13 | 14 | RUN mkdir /tika-bin 15 | 16 | #find a more elegant way of grabbing this after we release it 17 | COPY tika-server-standard-2.1.1-SNAPSHOT.jar /tika-bin/tika-server-standard-2.1.1-SNAPSHOT.jar 18 | COPY my-tika-config.xml /tika-bin/my-tika-config.xml 19 | ENTRYPOINT ["java","-cp","/tika-bin/*", "org.apache.tika.server.core.TikaServerCli", "-h", "0.0.0.0", "-c", "/tika-bin/my-tika-config.xml"] 20 | -------------------------------------------------------------------------------- /tika-containers/tika-pypdf2/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | file-observatory 7 | org.tallison 8 | 1.0.0-SNAPSHOT 9 | 10 | 4.0.0 11 | 12 | tika-pypdf2 13 | 14 | 15 | 11 16 | 11 17 | 18 | 19 | -------------------------------------------------------------------------------- /tika-containers/tika-pypdf2/scripts/PyPDF2Cli.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from PyPDF2 import PdfReader 4 | 5 | reader = PdfReader(sys.argv[1]) 6 | 7 | # reading all the pages content one by one 8 | with open(sys.argv[2], "w", encoding="utf-8") as output: 9 | for page in reader.pages: 10 | output.write(page.extract_text()) 11 | output.write("\n") 12 | -------------------------------------------------------------------------------- /tool-runners/arlington/Dockerfile: 
-------------------------------------------------------------------------------- 1 | FROM amd64/openjdk:11.0.8-slim-buster as GRAMMAR_CHECKER_BUILDER 2 | 3 | RUN apt-get update && apt-get install g++-8 gcc-8 cmake git -y 4 | 5 | RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-8 800 --slave /usr/bin/g++ g++ /usr/bin/g++-8 6 | 7 | RUN git clone https://github.com/pdf-association/arlington-pdf-model /arlington-pdf-model && \ 8 | cd /arlington-pdf-model && git checkout 908a7be 9 | 10 | RUN cd /arlington-pdf-model/TestGrammar && \ 11 | cmake -B cmake-linux/debug -DPDFSDK_PDFIUM=ON -DCMAKE_BUILD_TYPE=Debug . && \ 12 | cmake --build cmake-linux/debug --config Debug 13 | 14 | 15 | FROM amd64/openjdk:11.0.8-slim-buster 16 | 17 | COPY --from=GRAMMAR_CHECKER_BUILDER /arlington-pdf-model/TestGrammar/bin/linux /arlington-pdf-model/bin 18 | 19 | COPY --from=GRAMMAR_CHECKER_BUILDER /arlington-pdf-model/tsv/latest /arlington-pdf-model/tsv/latest 20 | 21 | COPY target/arlington-1.0.0-SNAPSHOT.jar /arlington-1.0.0-SNAPSHOT.jar 22 | 23 | 24 | ENTRYPOINT ["java","-jar","/arlington-1.0.0-SNAPSHOT.jar"] 25 | #WORKDIR /work 26 | # for debugging 27 | # docker run -it --entrypoint /bin/bash --name a2 -v /Users/.../Desktop/tool-runner-work:/data 806db3cdfa81 28 | 29 | -------------------------------------------------------------------------------- /tool-runners/arlington/env.properties: -------------------------------------------------------------------------------- 1 | TIKA_CONFIG=/config/file-obs-tika.xml 2 | #if on windows or mac, use host.docker.internal instead of localhost 3 | #make sure to include the table name after the final : 4 | METADATA_WRITER_STRING=jdbc:postgresql://host.docker.internal:5432/exploratory?user=qwertyuiop&password=password1234 5 | NUM_THREADS=20 6 | IS_DELTA=true -------------------------------------------------------------------------------- /tool-runners/arlington/src/main/resources/log4j.properties: 
-------------------------------------------------------------------------------- 1 | # Root logger option 2 | log4j.rootLogger=info, stdout 3 | 4 | # Direct log messages to stdout 5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 6 | log4j.appender.stdout.Target=System.out 7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n 9 | -------------------------------------------------------------------------------- /tool-runners/caradoc/Dockerfile: -------------------------------------------------------------------------------- 1 | #slight modification from: 2 | FROM debian:stretch as CARADOC_BUILDER 3 | RUN apt-get update &&\ 4 | apt-get install -y\ 5 | ocaml\ 6 | opam\ 7 | zlib1g-dev\ 8 | libgmp-dev\ 9 | pkg-config\ 10 | m4\ 11 | zlib1g-dev\ 12 | ocaml-findlib\ 13 | libcryptokit-ocaml-dev\ 14 | libounit-ocaml-dev\ 15 | libcurses-ocaml-dev\ 16 | menhir &&\ 17 | git clone --depth=1 --single-branch https://github.com/caradoc-org/caradoc.git 18 | WORKDIR /caradoc 19 | RUN make 20 | 21 | 22 | FROM amd64/openjdk:11.0.8-slim-buster 23 | COPY --from=CARADOC_BUILDER /caradoc/_build/src/main.native /usr/local/bin/caradoc 24 | # Install dependencies for caradoc binary 25 | RUN apt-get update &&\ 26 | apt-get install -y\ 27 | libtinfo5\ 28 | libncursesw5 29 | 30 | 31 | COPY target/caradoc-1.0.0-SNAPSHOT.jar /caradoc-1.0.0-SNAPSHOT.jar 32 | ENTRYPOINT ["java","-jar","/caradoc-1.0.0-SNAPSHOT.jar"] 33 | #e.g. 34 | #debug: docker run -it --entrypoint /bin/bash mutooltotext-container 35 | # docker build -t mutool-clean-image . 
36 | # docker run -i -t --name mutool-clean-container -v ~/data/input:/input:ro -v ~/data/output:/output mutool-clean-image /opt/java/openjdk/bin/java -jar /mutoolclean-1.0.0-SNAPSHOT.jar /input /output/table.csv 10 -------------------------------------------------------------------------------- /tool-runners/caradoc/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Root logger option 2 | log4j.rootLogger=INFO, stdout 3 | 4 | # Direct log messages to stdout 5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 6 | log4j.appender.stdout.Target=System.out 7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n 9 | -------------------------------------------------------------------------------- /tool-runners/clamav/Dockerfile: -------------------------------------------------------------------------------- 1 | #slight modification from: 2 | #https://github.com/mko-x/docker-clamav/blob/master/alpine/main/Dockerfile 3 | FROM alpine:3.12 4 | LABEL maintainer="Markus Kosmal " 5 | 6 | RUN apk add --no-cache openjdk11 bash clamav clamav-daemon rsyslog wget clamav-libunrar 7 | 8 | COPY conf /etc/clamav 9 | 10 | RUN mkdir /var/run/clamav && \ 11 | chown clamav:clamav /var/run/clamav && \ 12 | chmod 750 /var/run/clamav 13 | #&& \ 14 | #chown -R clamav:clamav bootstrap.sh check.sh /etc/clamav && \ 15 | #chmod u+x bootstrap.sh check.sh 16 | 17 | RUN /usr/bin/freshclam 18 | #EXPOSE 3310/tcp 19 | 20 | COPY target/clamav-1.0.0-SNAPSHOT.jar /clamav-1.0.0-SNAPSHOT.jar 21 | COPY exec.sh /exec.sh 22 | RUN ["chmod", "+x", "/exec.sh"] 23 | CMD ["/exec.sh"] 24 | -------------------------------------------------------------------------------- /tool-runners/clamav/conf/clam.conf: -------------------------------------------------------------------------------- 1 | ############### 2 | # General 3 | 
############### 4 | 5 | DatabaseDirectory /var/lib/clamav 6 | TemporaryDirectory /tmp 7 | LogTime yes 8 | PidFile /run/clamav/clamd.pid 9 | LocalSocket /run/clamav/clamd.sock 10 | TCPSocket 3310 11 | Foreground no 12 | 13 | ############### 14 | # Results 15 | ############### 16 | 17 | DetectPUA yes 18 | ExcludePUA NetTool 19 | ExcludePUA PWTool 20 | AlgorithmicDetection yes 21 | Bytecode yes 22 | 23 | ############### 24 | # Scan 25 | ############### 26 | 27 | ScanPE yes 28 | DisableCertCheck yes 29 | ScanELF yes 30 | AlertBrokenExecutables yes 31 | ScanOLE2 yes 32 | ScanPDF yes 33 | ScanSWF yes 34 | ScanMail yes 35 | PhishingSignatures yes 36 | PhishingScanURLs yes 37 | ScanHTML yes 38 | ScanArchive yes 39 | 40 | ############### 41 | # Scan 42 | ############### 43 | 44 | MaxScanSize 300M 45 | MaxFileSize 100M 46 | MaxRecursion 30 47 | MaxFiles 50000 48 | MaxEmbeddedPE 40M 49 | MaxHTMLNormalize 40M 50 | MaxHTMLNoTags 2M 51 | MaxScriptNormalize 5M 52 | MaxZipTypeRcg 1M 53 | MaxPartitions 128 54 | MaxIconsPE 200 55 | PCREMatchLimit 10000 56 | PCRERecMatchLimit 10000 -------------------------------------------------------------------------------- /tool-runners/clamav/conf/freshclam.conf: -------------------------------------------------------------------------------- 1 | ############### 2 | # General 3 | ############### 4 | 5 | DatabaseDirectory /var/lib/clamav 6 | LogSyslog yes 7 | LogTime yes 8 | PidFile /run/clamav/freshclam.pid 9 | 10 | ############### 11 | # Updates 12 | ############### 13 | 14 | DatabaseMirror database.clamav.net 15 | ScriptedUpdates yes 16 | NotifyClamd /etc/clamav/clamd.conf 17 | SafeBrowsing yes 18 | Bytecode yes -------------------------------------------------------------------------------- /tool-runners/clamav/exec.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #!/bin/bash 3 | # copied from: https://github.com/mko-x/docker-clamav/blob/master/alpine/main/bootstrap.sh 4 | set -e 5 | 6 
| if [[ -n "${FRESHCLAM_CONF_FILE}" ]]; then 7 | echo "[bootstrap] FRESHCLAM_CONF_FILE set, copy to /etc/clamav/freshclam.conf" 8 | mv /etc/clamav/freshclam.conf /etc/clamav/freshclam.conf.bak 9 | cp -f "${FRESHCLAM_CONF_FILE}" /etc/clamav/freshclam.conf 10 | fi 11 | # clamd is started below with -c /etc/clamav/clam.conf, so a CLAMD_CONF_FILE override has to replace that file 12 | if [[ -n "${CLAMD_CONF_FILE}" ]]; then 13 | echo "[bootstrap] CLAMD_CONF_FILE set, copy to /etc/clamav/clam.conf" 14 | mv /etc/clamav/clam.conf /etc/clamav/clam.conf.bak 15 | cp -f "${CLAMD_CONF_FILE}" /etc/clamav/clam.conf 16 | fi 17 | 18 | MAIN_FILE="/var/lib/clamav/main.cvd" 19 | 20 | #if [ ! -f ${MAIN_FILE} ]; then 21 | # echo "[bootstrap] Initial clam DB download." 22 | # /usr/bin/freshclam 23 | #fi 24 | 25 | #echo "[bootstrap] Schedule freshclam DB updater." 26 | #/usr/bin/freshclam -d -c 6 27 | 28 | echo "[bootstrap] Run clamav daemon..." 29 | /usr/sbin/clamd -c /etc/clamav/clam.conf 30 | echo "[bootstrap] process the files!" 31 | java -jar /clamav-1.0.0-SNAPSHOT.jar -------------------------------------------------------------------------------- /tool-runners/clamav/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Root logger option 2 | log4j.rootLogger=INFO, stdout 3 | 4 | # Direct log messages to stdout 5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 6 | log4j.appender.stdout.Target=System.out 7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n 9 | -------------------------------------------------------------------------------- /tool-runners/env.properties: -------------------------------------------------------------------------------- 1 | TIKA_CONFIG=/config/file-obs-tika.xml 2 | #if on windows or mac, use host.docker.internal instead of localhost 3 | #make sure to include the table name after the final : 4 | 
METADATA_WRITER_STRING=jdbc:postgresql://host.docker.internal:5432/exploratory?user=qwertyuiop&password=password1234 5 | NUM_THREADS=20 6 | IS_DELTA=true -------------------------------------------------------------------------------- /tool-runners/fileprofiler/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM amd64/openjdk:11.0.8-slim-buster 2 | 3 | COPY target/fileprofiler-1.0.0-SNAPSHOT.jar /fileprofiler-1.0.0-SNAPSHOT.jar 4 | 5 | ENTRYPOINT ["java","-jar","/fileprofiler-1.0.0-SNAPSHOT.jar"] 6 | -------------------------------------------------------------------------------- /tool-runners/fileprofiler/README.txt: -------------------------------------------------------------------------------- 1 | Load basic provenance information -- file size, shasum, collection -------------------------------------------------------------------------------- /tool-runners/fileprofiler/src/main/resources/log4j2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /tool-runners/gstotext/Dockerfile: -------------------------------------------------------------------------------- 1 | 2 | FROM amd64/openjdk:11.0.8-slim-buster 3 | RUN apt-get update && apt-get install wget -y 4 | # &&\ 5 | #libopenjpeg5 6 | #libstdc++6 && \ 7 | #addgroup -S appgroup && \ 8 | #adduser -S appuser -G appgroup -h /work && \ 9 | #echo "/usr/local/lib64:/lib:/usr/local/lib:/usr/lib:/usr/lib/x86_64-linux-gnu:/lib64:/lib/x86_64-linux-gnu" > /etc/ld-musl-x86_64.path 10 | RUN wget https://github.com/ArtifexSoftware/ghostpdl-downloads/releases/download/gs9550/ghostscript-9.55.0-linux-x86_64.tgz \ 11 | && tar -xf ghostscript-9.55.0-linux-x86_64.tgz 12 | 13 | COPY target/gstotext-1.0.0-SNAPSHOT.jar /gstotext-1.0.0-SNAPSHOT.jar 14 | 15 | ENTRYPOINT 
["java","-jar","/gstotext-1.0.0-SNAPSHOT.jar"] 16 | #WORKDIR /work 17 | 18 | -------------------------------------------------------------------------------- /tool-runners/gstotext/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Root logger option 2 | log4j.rootLogger=INFO, stdout 3 | 4 | # Direct log messages to stdout 5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 6 | log4j.appender.stdout.Target=System.out 7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n 9 | -------------------------------------------------------------------------------- /tool-runners/itext/README.md: -------------------------------------------------------------------------------- 1 | This wrapper of iText's parser requires a commercial license key. 2 | 3 | This code was not written nor used with the AGPL license. 4 | 5 | Many thanks to iText for granting a custom evaluation license for this project. -------------------------------------------------------------------------------- /tool-runners/itext/src/main/resources/META-INF/services/org.apache.tika.parser.Parser: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. 
You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | org.tallison.tika.parser.itext.ITextParser -------------------------------------------------------------------------------- /tool-runners/itext/src/test/resources/test-documents/testPDF.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tballison/file-observatory/5677da227b1a70b1220ae05005b6064210858504/tool-runners/itext/src/test/resources/test-documents/testPDF.pdf -------------------------------------------------------------------------------- /tool-runners/mutoolclean/Dockerfile: -------------------------------------------------------------------------------- 1 | #slight modification from: 2 | #https://github.com/jay-eff/mutool/blob/master/Dockerfile 3 | FROM alpine:3 as MUTOOL_BUILDER 4 | MAINTAINER Jens Fischer 5 | 6 | # install necessary packages and compile MuPDF, clean up afterwards 7 | # include bash for debugging the build only 8 | 9 | #get tags from here: http://git.ghostscript.com/?p=mupdf.git;a=summary 10 | #versions 1.18.0 1.17.0, 1.16.1, 1.16.0, 1.15.0, 1.14.0, 1.13.0, 1.12.0, 1.11.1 11 | ENV MUTOOL_VERSION 1.19.0 12 | RUN apk add --no-cache \ 13 | git \ 14 | make \ 15 | pkgconfig \ 16 | build-base \ 17 | bash \ 18 | && git clone -b ${MUTOOL_VERSION} https://github.com/ArtifexSoftware/mupdf \ 19 | && cd mupdf \ 20 | && git submodule update --init \ 21 | && make HAVE_X11=no HAVE_GLUT=no prefix=/usr/local install \ 22 | && cd / \ 23 | && rm -r mupdf \ 24 | && apk del \ 25 | git \ 26 | make \ 27 | pkgconfig \ 28 | build-base 29 | 30 | FROM 
adoptopenjdk/openjdk11:alpine-slim 31 | COPY --from=MUTOOL_BUILDER /usr/local/bin /usr/local/bin 32 | COPY --from=MUTOOL_BUILDER /lib /lib 33 | 34 | COPY target/mutoolclean-1.0.0-SNAPSHOT.jar /mutoolclean-1.0.0-SNAPSHOT.jar 35 | ENTRYPOINT ["java","-jar","/mutoolclean-1.0.0-SNAPSHOT.jar"] 36 | 37 | #e.g. 38 | #debug: docker run -it --entrypoint /bin/bash mutool-clean-image 39 | # docker build -t mutool-clean-image . 40 | # docker run -i -t --name mutool-clean-container -v ~/data/input:/input:ro -v ~/data/output:/output mutool-clean-image /opt/java/openjdk/bin/java -jar /mutoolclean-1.0.0-SNAPSHOT.jar /input /output/table.csv 10 -------------------------------------------------------------------------------- /tool-runners/mutoolclean/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Root logger option 2 | log4j.rootLogger=INFO, stdout 3 | 4 | # Direct log messages to stdout 5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 6 | log4j.appender.stdout.Target=System.out 7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n 9 | -------------------------------------------------------------------------------- /tool-runners/mutooltext/Dockerfile: -------------------------------------------------------------------------------- 1 | #slight modification from: 2 | #https://github.com/jay-eff/mutool/blob/master/Dockerfile 3 | FROM alpine:3 as MUPDF_BUILDER 4 | MAINTAINER Jens Fischer 5 | 6 | # install necessary packages and compile MuPDF, clean up afterwards 7 | # include bash for debugging the build only 8 | 9 | #get tags from here: http://git.ghostscript.com/?p=mupdf.git;a=summary 10 | #versions 1.19.0 1.18.0 1.17.0, 1.16.1, 1.16.0, 1.15.0, 1.14.0, 1.13.0, 1.12.0, 1.11.1 11 | ENV MUTOOL_VERSION 1.19.0 12 | RUN apk add --no-cache \ 13 | git \ 14 | make \ 15 | pkgconfig \ 16 | build-base \ 
17 | bash \ 18 | && git clone -b ${MUTOOL_VERSION} https://github.com/ArtifexSoftware/mupdf \ 19 | && cd mupdf \ 20 | && git submodule update --init \ 21 | && make HAVE_X11=no HAVE_GLUT=no prefix=/usr/local install \ 22 | && cd / \ 23 | && rm -r mupdf \ 24 | && apk del \ 25 | git \ 26 | make \ 27 | pkgconfig \ 28 | build-base 29 | 30 | FROM adoptopenjdk/openjdk11:alpine-slim 31 | COPY --from=MUPDF_BUILDER /usr/local/bin /usr/local/bin 32 | COPY --from=MUPDF_BUILDER /lib /lib 33 | 34 | COPY target/mutooltext-1.0.0-SNAPSHOT.jar /mutooltext-1.0.0-SNAPSHOT.jar 35 | ENTRYPOINT ["java","-jar","/mutooltext-1.0.0-SNAPSHOT.jar"] 36 | #RUN apk update && apk add bash 37 | # e.g. 38 | # docker build -t mutool-text-image . 39 | # docker run -i -t --name mutool-text-container -v ~/data/input:/input:ro -v ~/data/output:/output mutool-text-image /opt/java/openjdk/bin/java -jar /mutooltext-1.0.0-SNAPSHOT.jar /input /output/txt /output/table.csv 10 40 | -------------------------------------------------------------------------------- /tool-runners/mutooltext/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Root logger option 2 | log4j.rootLogger=INFO, stdout 3 | 4 | # Direct log messages to stdout 5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 6 | log4j.appender.stdout.Target=System.out 7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n 9 | -------------------------------------------------------------------------------- /tool-runners/pdfbytes/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM amd64/openjdk:11.0.8-slim-buster 2 | 3 | COPY target/pdfbytes-1.0.0-SNAPSHOT.jar /pdfbytes-1.0.0-SNAPSHOT.jar 4 | 5 | ENTRYPOINT ["java","-jar","/pdfbytes-1.0.0-SNAPSHOT.jar"] 6 | 
-------------------------------------------------------------------------------- /tool-runners/pdfbytes/src/test/java/org/tallison/pdfutils/TestVersionUnpacker.java: -------------------------------------------------------------------------------- 1 | package org.tallison.pdfutils; 2 | 3 | 4 | import org.apache.tika.io.TikaInputStream; 5 | import org.junit.Test; 6 | 7 | import java.io.ByteArrayInputStream; 8 | import java.io.InputStream; 9 | import java.nio.charset.StandardCharsets; 10 | import java.nio.file.Path; 11 | import java.nio.file.Paths; 12 | import java.util.ArrayList; 13 | import java.util.List; 14 | 15 | public class TestVersionUnpacker { 16 | 17 | @Test 18 | public void testVersions() throws Exception { 19 | Path p = Paths.get(TestVersionUnpacker.class.getResource("/pdf-puzzle.pdf").toURI()); 20 | System.out.println(PDFByteSniffer.getJson(p)); 21 | } 22 | 23 | @Test 24 | public void testBackTracking() throws Exception { 25 | byte[] string = "%%%EO%%EOF%%EOF".getBytes(StandardCharsets.UTF_8); 26 | byte[] pattern = "%%EOF".getBytes(StandardCharsets.UTF_8); 27 | StreamSearcher streamSearcher = new StreamSearcher(pattern); 28 | InputStream is = new ByteArrayInputStream(string); 29 | System.out.println(streamSearcher.search(is)); 30 | System.out.println(streamSearcher.search(is)); 31 | } 32 | 33 | } 34 | -------------------------------------------------------------------------------- /tool-runners/pdfbytes/src/test/resources/pdf-puzzle.pdf: -------------------------------------------------------------------------------- 1 | %PDF-1.1 2 | 3 | 1 0 obj 4 | << 5 | /Type /Catalog 6 | /Outlines 2 0 R 7 | /Pages 3 0 R 8 | >> 9 | endobj 10 | 11 | 2 0 obj 12 | << 13 | /Type /Outlines 14 | /Count 0 15 | >> 16 | endobj 17 | 18 | 3 0 obj 19 | << 20 | /Type /Pages 21 | /Kids [4 0 R] 22 | /Count 1 23 | >> 24 | endobj 25 | 26 | 4 0 obj 27 | << 28 | /Type /Page 29 | /Parent 3 0 R 30 | /MediaBox [0 0 612 792] 31 | /Contents 5 0 R 32 | /Resources << 33 | /ProcSet [/PDF /Text] 
34 | /Font << /F1 6 0 R >> 35 | >> 36 | >> 37 | endobj 38 | 39 | 5 0 obj 40 | << 41 | /Length 89 42 | /Filter /ASCII85Decode 43 | >> 44 | stream 45 | 6<#'\7PQ#@1a#b0+>GQ(+?(u.+B2ko-rakk+E1b1F)Yf5@<6!&BlbCgDI[]uD.RU,@;I&dE+EC!ATK:C<,*OE;u~> 46 | endstream 47 | endobj 48 | 49 | 6 0 obj 50 | << 51 | /Type /Font 52 | /Subtype /Type1 53 | /Name /F1 54 | /BaseFont /Helvetica 55 | /Encoding /MacRomanEncoding 56 | >> 57 | endobj 58 | 59 | xref 60 | 0 7 61 | 0000000000 65535 f 62 | 0000000012 00000 n 63 | 0000000089 00000 n 64 | 0000000145 00000 n 65 | 0000000214 00000 n 66 | 0000000419 00000 n 67 | 0000000594 00000 n 68 | trailer 69 | << 70 | /Size 7 71 | /Root 1 0 R 72 | >> 73 | startxref 74 | 718 75 | %%EOF 76 | 77 | 5 0 obj 78 | << 79 | /Length 89 80 | /Filter /ASCII85Decode 81 | >> 82 | stream 83 | 6<#'\7PQ#@1a#b0+>GQ(+?(u.+B2ko-rakk+E1b1F)Yf5@<6!&BlbD!=BJ[-=BJ[-=BJ[-=BJ[-=BI!p<,*OE;u~> 84 | endstream 85 | endobj 86 | 87 | xref 88 | 0 1 89 | 0000000000 65535 f 90 | 5 1 91 | 0000000935 00000 n 92 | trailer 93 | << 94 | /Size 7 95 | /Root 1 0 R 96 | /Prev 718 97 | >> 98 | startxref 99 | 1110 100 | %%EOF 101 | -------------------------------------------------------------------------------- /tool-runners/pdfchecker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM amd64/openjdk:11.0.8-slim-buster 2 | 3 | #wrapper around: https://www.datalogics.com/products/pdf-tools/pdf-checker/ 4 | #need to accept license, install it on linux and then tgz the binary 5 | #directory that is installed 6 | 7 | #I'm not including pdf-checker.tgz in my repo because of license 8 | #requirements 9 | 10 | RUN mkdir /pdfchecker-bin 11 | 12 | COPY pdf-checker.tgz /pdfchecker-bin/pdf-checker.tgz 13 | RUN cd /pdfchecker-bin && tar -xzvf pdf-checker.tgz 14 | 15 | COPY target/pdfchecker-1.0.0-SNAPSHOT.jar /pdfchecker-1.0.0-SNAPSHOT.jar 16 | # to run against a single file: 17 | #/pdfchecker-bin/PDF_Checker/pdfchecker -j 
/pdfchecker-bin/PDF_Checker/CheckerProfiles/everything.json -i -s 18 | ENTRYPOINT ["java","-jar","/pdfchecker-1.0.0-SNAPSHOT.jar"] -------------------------------------------------------------------------------- /tool-runners/pdfchecker/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Root logger option 2 | log4j.rootLogger=INFO, stdout 3 | 4 | # Direct log messages to stdout 5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 6 | log4j.appender.stdout.Target=System.out 7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n 9 | -------------------------------------------------------------------------------- /tool-runners/pdfcpu/Dockerfile: -------------------------------------------------------------------------------- 1 | # Dockerfile References: https://docs.docker.com/engine/reference/builder/ 2 | 3 | # Start from a golang base image 4 | FROM golang:1.16.6 as builder 5 | 6 | # install 7 | 8 | #RUN go get github.com/pdfcpu/pdfcpu/cmd/... 9 | RUN git clone -b v0.3.12 --depth 1 https://github.com/pdfcpu/pdfcpu /pdfcpu 10 | RUN cd /pdfcpu && git checkout tags/v0.3.12 -b v0.3.12-tag 11 | #WORKDIR $GOPATH/src/github.com/pdfcpu/pdfcpu/cmd/pdfcpu 12 | RUN cd /pdfcpu/cmd/pdfcpu && CGO_ENABLED=0 GOOS=linux go build -a -o pdfcpu . 13 | 14 | ######## Start a new stage from scratch ####### 15 | 16 | FROM alpine:latest 17 | 18 | RUN apk --no-cache add ca-certificates openjdk11 19 | 20 | WORKDIR /root/ 21 | 22 | # Copy the Pre-built binary file from the previous stage 23 | COPY --from=builder /pdfcpu/cmd/pdfcpu . 
24 | 25 | # Command to run the executable 26 | #CMD ["./pdfcpu"] 27 | 28 | COPY target/pdfcpu-1.0.0-SNAPSHOT.jar /pdfcpu-1.0.0-SNAPSHOT.jar 29 | 30 | ENTRYPOINT ["java","-jar","/pdfcpu-1.0.0-SNAPSHOT.jar"] 31 | -------------------------------------------------------------------------------- /tool-runners/pdfcpu/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Root logger option 2 | log4j.rootLogger=INFO, stdout 3 | 4 | # Direct log messages to stdout 5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 6 | log4j.appender.stdout.Target=System.out 7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n 9 | -------------------------------------------------------------------------------- /tool-runners/pdffonts/Dockerfile: -------------------------------------------------------------------------------- 1 | #slight modification from: 2 | #https://stackoverflow.com/questions/61272431/installing-poppler-utils-of-version-0-82-in-docker 3 | #See also: https://github.com/quarkness/docker-poppler/blob/master/Dockerfile 4 | FROM debian:buster-20211220-slim as POPPLER_BUILDER 5 | ## can't build w 22.01.0 with the current dependencies...need to figure out how to 6 | # migrate to 22.x 7 | ENV POPPLER_VERSION=21.12.0 8 | ENV POPPLER_DATA_VERSION=0.4.11 9 | RUN apt-get update && apt-get install bash wget build-essential cmake libfreetype6-dev pkg-config libfontconfig-dev libjpeg-dev libopenjp2-7-dev -y 10 | RUN wget https://poppler.freedesktop.org/poppler-data-${POPPLER_DATA_VERSION}.tar.gz \ 11 | && tar -xf poppler-data-${POPPLER_DATA_VERSION}.tar.gz \ 12 | && cd poppler-data-${POPPLER_DATA_VERSION} \ 13 | && make install \ 14 | && cd .. 
\ 15 | && wget https://poppler.freedesktop.org/poppler-${POPPLER_VERSION}.tar.xz \ 16 | && tar -xf poppler-${POPPLER_VERSION}.tar.xz \ 17 | && cd poppler-${POPPLER_VERSION} \ 18 | && mkdir build \ 19 | && cd build \ 20 | && cmake -DENABLE_BOOST=OFF ..\ 21 | && make \ 22 | && make install \ 23 | && ldconfig 24 | 25 | FROM amd64/openjdk:11.0.8-slim-buster 26 | COPY --from=POPPLER_BUILDER /usr/lib /usr/lib 27 | COPY --from=POPPLER_BUILDER /usr/local /usr/local 28 | 29 | RUN apt-get update && apt-get install bash ca-certificates \ 30 | libjpeg62-turbo libcairo2 libxml2 \ 31 | fontconfig liblcms2-2 \ 32 | libtiff5 -y 33 | # &&\ 34 | #libopenjpeg5 35 | #libstdc++6 && \ 36 | #addgroup -S appgroup && \ 37 | #adduser -S appuser -G appgroup -h /work && \ 38 | #echo "/usr/local/lib64:/lib:/usr/local/lib:/usr/lib:/usr/lib/x86_64-linux-gnu:/lib64:/lib/x86_64-linux-gnu" > /etc/ld-musl-x86_64.path 39 | 40 | COPY target/pdffonts-1.0.0-SNAPSHOT.jar /pdffonts-1.0.0-SNAPSHOT.jar 41 | 42 | 43 | ENTRYPOINT ["java","-jar","/pdffonts-1.0.0-SNAPSHOT.jar"] 44 | #WORKDIR /work 45 | 46 | -------------------------------------------------------------------------------- /tool-runners/pdffonts/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Root logger option 2 | log4j.rootLogger=INFO, stdout 3 | 4 | # Direct log messages to stdout 5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 6 | log4j.appender.stdout.Target=System.out 7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n 9 | -------------------------------------------------------------------------------- /tool-runners/pdfid/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9.1-slim-buster 2 | 3 | #TODO make more efficient by factoring out a build w git, etc 4 | RUN mkdir -p 
/usr/share/man/man1 /usr/share/man/man2 /pdfid 5 | 6 | RUN apt-get update && \ 7 | apt-get install -y --no-install-recommends \ 8 | openjdk-11-jre git 9 | 10 | RUN cd /pdfid && \ 11 | git clone https://github.com/DidierStevens/DidierStevensSuite.git didierstevens && \ 12 | cd /pdfid/didierstevens && \ 13 | git checkout 5f81a8f7a8aac15b580413f6f3a2ec3d72c5d10c 14 | 15 | COPY target/pdfid-1.0.0-SNAPSHOT.jar /pdfid-1.0.0-SNAPSHOT.jar 16 | 17 | ENTRYPOINT ["java","-jar","/pdfid-1.0.0-SNAPSHOT.jar"] 18 | 19 | #for debugging 20 | #docker run -it --entrypoint /bin/bash 21 | 22 | -------------------------------------------------------------------------------- /tool-runners/pdfid/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Root logger option 2 | log4j.rootLogger=INFO, stdout 3 | 4 | # Direct log messages to stdout 5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 6 | log4j.appender.stdout.Target=System.out 7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n 9 | -------------------------------------------------------------------------------- /tool-runners/pdfimages/Dockerfile: -------------------------------------------------------------------------------- 1 | #slight modification from: 2 | #https://stackoverflow.com/questions/61272431/installing-poppler-utils-of-version-0-82-in-docker 3 | #See also: https://github.com/quarkness/docker-poppler/blob/master/Dockerfile 4 | FROM debian:buster-20211220-slim as POPPLER_BUILDER 5 | ## can't build w 22.01.0 with the current dependencies...need to figure out how to 6 | # migrate to 22.x 7 | ENV POPPLER_VERSION=21.12.0 8 | ENV POPPLER_DATA_VERSION=0.4.11 9 | RUN apt-get update && apt-get install bash wget build-essential cmake libfreetype6-dev pkg-config libfontconfig-dev libjpeg-dev libopenjp2-7-dev -y 10 | RUN wget 
https://poppler.freedesktop.org/poppler-data-${POPPLER_DATA_VERSION}.tar.gz \ 11 | && tar -xf poppler-data-${POPPLER_DATA_VERSION}.tar.gz \ 12 | && cd poppler-data-${POPPLER_DATA_VERSION} \ 13 | && make install \ 14 | && cd .. \ 15 | && wget https://poppler.freedesktop.org/poppler-${POPPLER_VERSION}.tar.xz \ 16 | && tar -xf poppler-${POPPLER_VERSION}.tar.xz \ 17 | && cd poppler-${POPPLER_VERSION} \ 18 | && mkdir build \ 19 | && cd build \ 20 | && cmake -DENABLE_BOOST=OFF ..\ 21 | && make \ 22 | && make install \ 23 | && ldconfig 24 | 25 | FROM amd64/openjdk:11.0.8-slim-buster 26 | COPY --from=POPPLER_BUILDER /usr/lib /usr/lib 27 | COPY --from=POPPLER_BUILDER /usr/local /usr/local 28 | 29 | RUN apt-get update && apt-get install bash ca-certificates \ 30 | libjpeg62-turbo libcairo2 libxml2 \ 31 | fontconfig liblcms2-2 \ 32 | libtiff5 -y 33 | # &&\ 34 | #libopenjpeg5 35 | #libstdc++6 && \ 36 | #addgroup -S appgroup && \ 37 | #adduser -S appuser -G appgroup -h /work && \ 38 | #echo "/usr/local/lib64:/lib:/usr/local/lib:/usr/lib:/usr/lib/x86_64-linux-gnu:/lib64:/lib/x86_64-linux-gnu" > /etc/ld-musl-x86_64.path 39 | 40 | COPY target/pdfimages-1.0.0-SNAPSHOT.jar /pdfimages-1.0.0-SNAPSHOT.jar 41 | 42 | 43 | ENTRYPOINT ["java","-jar","/pdfimages-1.0.0-SNAPSHOT.jar"] 44 | #WORKDIR /work 45 | 46 | -------------------------------------------------------------------------------- /tool-runners/pdfimages/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Root logger option 2 | log4j.rootLogger=DEBUG, stdout 3 | 4 | # Direct log messages to stdout 5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 6 | log4j.appender.stdout.Target=System.out 7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n 9 | -------------------------------------------------------------------------------- 
/tool-runners/pdfinfo/Dockerfile: -------------------------------------------------------------------------------- 1 | #slight modification from: 2 | #https://stackoverflow.com/questions/61272431/installing-poppler-utils-of-version-0-82-in-docker 3 | #See also: https://github.com/quarkness/docker-poppler/blob/master/Dockerfile 4 | FROM debian:bullseye-20230227-slim as POPPLER_BUILDER 5 | ## can't build w 22.01.0 with the current dependencies...need to figure out how to 6 | # migrate to 22.x 7 | ENV POPPLER_VERSION=23.03.0 8 | ENV POPPLER_DATA_VERSION=0.4.12 9 | RUN apt-get update && apt-get install bash wget build-essential cmake libfreetype6-dev pkg-config \ 10 | libfontconfig-dev libjpeg-dev libopenjp2-7-dev \ 11 | #these are for temurin 12 | apt-transport-https gnupg -y 13 | RUN wget https://poppler.freedesktop.org/poppler-data-${POPPLER_DATA_VERSION}.tar.gz \ 14 | && tar -xf poppler-data-${POPPLER_DATA_VERSION}.tar.gz \ 15 | && cd poppler-data-${POPPLER_DATA_VERSION} \ 16 | && make install \ 17 | && cd .. 
\ 18 | && wget https://poppler.freedesktop.org/poppler-${POPPLER_VERSION}.tar.xz \ 19 | && tar -xf poppler-${POPPLER_VERSION}.tar.xz \ 20 | && cd poppler-${POPPLER_VERSION} \ 21 | && mkdir build \ 22 | && cd build \ 23 | && cmake -DENABLE_BOOST=OFF ..\ 24 | && make \ 25 | && make install \ 26 | && ldconfig 27 | 28 | RUN wget -O - https://packages.adoptium.net/artifactory/api/gpg/key/public | apt-key add - \ 29 | && echo "deb https://packages.adoptium.net/artifactory/deb $(awk -F= '/^VERSION_CODENAME/{print$2}' /etc/os-release) main" | tee /etc/apt/sources.list.d/adoptium.list \ 30 | && apt-get update && apt-get install temurin-11-jre -y 31 | 32 | COPY target/pdfinfo-1.0.0-SNAPSHOT.jar /pdfinfo-1.0.0-SNAPSHOT.jar 33 | 34 | 35 | ENTRYPOINT ["java","-jar","/pdfinfo-1.0.0-SNAPSHOT.jar"] 36 | #WORKDIR /work 37 | 38 | -------------------------------------------------------------------------------- /tool-runners/pdfinfo/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Root logger option 2 | log4j.rootLogger=INFO, stdout 3 | 4 | # Direct log messages to stdout 5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 6 | log4j.appender.stdout.Target=System.out 7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n 9 | -------------------------------------------------------------------------------- /tool-runners/pdfminerdump/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9.1-slim-buster 2 | 3 | RUN pip install pdfminer.six==20201018 4 | 5 | RUN mkdir -p /usr/share/man/man1 /usr/share/man/man2 6 | 7 | RUN apt-get update && \ 8 | apt-get install -y --no-install-recommends \ 9 | openjdk-11-jre 10 | 11 | COPY target/pdfminerdump-1.0.0-SNAPSHOT.jar /pdfminerdump-1.0.0-SNAPSHOT.jar 12 | 13 | ENTRYPOINT 
["java","-jar","/pdfminerdump-1.0.0-SNAPSHOT.jar"] 14 | 15 | -------------------------------------------------------------------------------- /tool-runners/pdfminerdump/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Root logger option 2 | log4j.rootLogger=INFO, stdout 3 | 4 | # Direct log messages to stdout 5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 6 | log4j.appender.stdout.Target=System.out 7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n 9 | -------------------------------------------------------------------------------- /tool-runners/pdfminertext/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9.1-slim-buster 2 | 3 | RUN pip install pdfminer.six==20201018 4 | 5 | RUN mkdir -p /usr/share/man/man1 /usr/share/man/man2 6 | 7 | RUN apt-get update && \ 8 | apt-get install -y --no-install-recommends \ 9 | openjdk-11-jre 10 | 11 | COPY target/pdfminertext-1.0.0-SNAPSHOT.jar /pdfminertext-1.0.0-SNAPSHOT.jar 12 | 13 | ENTRYPOINT ["java","-jar","/pdfminertext-1.0.0-SNAPSHOT.jar"] 14 | 15 | -------------------------------------------------------------------------------- /tool-runners/pdfminertext/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Root logger option 2 | log4j.rootLogger=INFO, stdout 3 | 4 | # Direct log messages to stdout 5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 6 | log4j.appender.stdout.Target=System.out 7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n 9 | -------------------------------------------------------------------------------- /tool-runners/pdfresurrect/Dockerfile: 
-------------------------------------------------------------------------------- 1 | FROM amd64/openjdk:11.0.8-slim-buster 2 | 3 | RUN apt-get update && \ 4 | apt-get install -y --no-install-recommends \ 5 | pdfresurrect 6 | 7 | COPY target/pdfresurrect-1.0.0-SNAPSHOT.jar /pdfresurrect-1.0.0-SNAPSHOT.jar 8 | 9 | ENTRYPOINT ["java","-jar","/pdfresurrect-1.0.0-SNAPSHOT.jar"] 10 | 11 | -------------------------------------------------------------------------------- /tool-runners/pdfresurrect/env.properties: -------------------------------------------------------------------------------- 1 | TIKA_CONFIG=/config/tika-tika-config.xml 2 | #if on windows or mac, use host.docker.internal instead of localhost 3 | #make sure to include the table name after the final : 4 | METADATA_WRITER_STRING=jdbc:postgresql://host.docker.internal:2345/somedb?user=qwertyuiop&password=qwertyuiop 5 | NUM_THREADS=20 6 | IS_DELTA=false -------------------------------------------------------------------------------- /tool-runners/pdfresurrect/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Root logger option 2 | log4j.rootLogger=INFO, stdout 3 | 4 | # Direct log messages to stdout 5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 6 | log4j.appender.stdout.Target=System.out 7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n 9 | -------------------------------------------------------------------------------- /tool-runners/pdftoppm/Dockerfile: -------------------------------------------------------------------------------- 1 | #slight modification from: 2 | #https://stackoverflow.com/questions/61272431/installing-poppler-utils-of-version-0-82-in-docker 3 | #See also: https://github.com/quarkness/docker-poppler/blob/master/Dockerfile 4 | FROM debian:buster-20211220-slim as POPPLER_BUILDER 5 | ## can't build w 
22.01.0 with the current dependencies...need to figure out how to 6 | # migrate to 22.x 7 | ENV POPPLER_VERSION=21.12.0 8 | ENV POPPLER_DATA_VERSION=0.4.11 9 | RUN apt-get update && apt-get install bash wget build-essential cmake libfreetype6-dev pkg-config libfontconfig-dev libjpeg-dev libopenjp2-7-dev -y 10 | RUN wget https://poppler.freedesktop.org/poppler-data-${POPPLER_DATA_VERSION}.tar.gz \ 11 | && tar -xf poppler-data-${POPPLER_DATA_VERSION}.tar.gz \ 12 | && cd poppler-data-${POPPLER_DATA_VERSION} \ 13 | && make install \ 14 | && cd .. \ 15 | && wget https://poppler.freedesktop.org/poppler-${POPPLER_VERSION}.tar.xz \ 16 | && tar -xf poppler-${POPPLER_VERSION}.tar.xz \ 17 | && cd poppler-${POPPLER_VERSION} \ 18 | && mkdir build \ 19 | && cd build \ 20 | && cmake -DENABLE_BOOST=OFF ..\ 21 | && make \ 22 | && make install \ 23 | && ldconfig 24 | 25 | FROM amd64/openjdk:11.0.8-slim-buster 26 | COPY --from=POPPLER_BUILDER /usr/lib /usr/lib 27 | COPY --from=POPPLER_BUILDER /usr/local /usr/local 28 | 29 | RUN apt-get update && apt-get install bash ca-certificates \ 30 | libjpeg62-turbo libcairo2 libxml2 \ 31 | fontconfig liblcms2-2 \ 32 | libtiff5 -y 33 | # &&\ 34 | #libopenjpeg5 35 | #libstdc++6 && \ 36 | #addgroup -S appgroup && \ 37 | #adduser -S appuser -G appgroup -h /work && \ 38 | #echo "/usr/local/lib64:/lib:/usr/local/lib:/usr/lib:/usr/lib/x86_64-linux-gnu:/lib64:/lib/x86_64-linux-gnu" > /etc/ld-musl-x86_64.path 39 | 40 | COPY target/pdftoppm-1.0.0-SNAPSHOT.jar /pdftoppm-1.0.0-SNAPSHOT.jar 41 | 42 | 43 | ENTRYPOINT ["java","-jar","/pdftoppm-1.0.0-SNAPSHOT.jar"] 44 | #WORKDIR /work 45 | -------------------------------------------------------------------------------- /tool-runners/pdftoppm/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Root logger option 2 | log4j.rootLogger=DEBUG, stdout 3 | 4 | # Direct log messages to stdout 5 | 
log4j.appender.stdout=org.apache.log4j.ConsoleAppender 6 | log4j.appender.stdout.Target=System.out 7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n 9 | -------------------------------------------------------------------------------- /tool-runners/pdftops/Dockerfile: -------------------------------------------------------------------------------- 1 | #slight modification from: 2 | #https://stackoverflow.com/questions/61272431/installing-poppler-utils-of-version-0-82-in-docker 3 | #See also: https://github.com/quarkness/docker-poppler/blob/master/Dockerfile 4 | FROM debian:buster-20211220-slim as POPPLER_BUILDER 5 | ## can't build w 22.01.0 with the current dependencies...need to figure out how to 6 | # migrate to 22.x 7 | ENV POPPLER_VERSION=21.12.0 8 | ENV POPPLER_DATA_VERSION=0.4.11 9 | RUN apt-get update && apt-get install bash wget build-essential cmake libfreetype6-dev pkg-config libfontconfig-dev libjpeg-dev libopenjp2-7-dev -y 10 | RUN wget https://poppler.freedesktop.org/poppler-data-${POPPLER_DATA_VERSION}.tar.gz \ 11 | && tar -xf poppler-data-${POPPLER_DATA_VERSION}.tar.gz \ 12 | && cd poppler-data-${POPPLER_DATA_VERSION} \ 13 | && make install \ 14 | && cd .. 
\ 15 | && wget https://poppler.freedesktop.org/poppler-${POPPLER_VERSION}.tar.xz \ 16 | && tar -xf poppler-${POPPLER_VERSION}.tar.xz \ 17 | && cd poppler-${POPPLER_VERSION} \ 18 | && mkdir build \ 19 | && cd build \ 20 | && cmake -DENABLE_BOOST=OFF ..\ 21 | && make \ 22 | && make install \ 23 | && ldconfig 24 | #CMD tail -f /dev/null 25 | 26 | FROM amd64/openjdk:11.0.8-slim-buster 27 | COPY --from=POPPLER_BUILDER /usr/lib /usr/lib 28 | COPY --from=POPPLER_BUILDER /usr/local /usr/local 29 | 30 | RUN apt-get update && apt-get install bash ca-certificates \ 31 | libjpeg62-turbo libcairo2 libxml2 \ 32 | fontconfig liblcms2-2 \ 33 | libtiff5 -y 34 | # &&\ 35 | #libopenjpeg5 36 | #libstdc++6 && \ 37 | #addgroup -S appgroup && \ 38 | #adduser -S appuser -G appgroup -h /work && \ 39 | #echo "/usr/local/lib64:/lib:/usr/local/lib:/usr/lib:/usr/lib/x86_64-linux-gnu:/lib64:/lib/x86_64-linux-gnu" > /etc/ld-musl-x86_64.path 40 | 41 | COPY target/pdftops-1.0.0-SNAPSHOT.jar /pdftops-1.0.0-SNAPSHOT.jar 42 | 43 | 44 | ENTRYPOINT ["java","-jar","/pdftops-1.0.0-SNAPSHOT.jar"] 45 | #WORKDIR /work 46 | 47 | -------------------------------------------------------------------------------- /tool-runners/pdftops/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Root logger option 2 | log4j.rootLogger=DEBUG, stdout 3 | 4 | # Direct log messages to stdout 5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 6 | log4j.appender.stdout.Target=System.out 7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n 9 | -------------------------------------------------------------------------------- /tool-runners/pdftotext/Dockerfile: -------------------------------------------------------------------------------- 1 | #slight modification from: 2 | 
#https://stackoverflow.com/questions/61272431/installing-poppler-utils-of-version-0-82-in-docker 3 | #See also: https://github.com/quarkness/docker-poppler/blob/master/Dockerfile 4 | FROM debian:buster-20211220-slim as POPPLER_BUILDER 5 | ## can't build w 22.01.0 with the current dependencies...need to figure out how to 6 | # migrate to 22.x 7 | ENV POPPLER_VERSION=21.12.0 8 | ENV POPPLER_DATA_VERSION=0.4.11 9 | RUN apt-get update && apt-get install bash wget build-essential cmake libfreetype6-dev pkg-config libfontconfig-dev libjpeg-dev libopenjp2-7-dev -y 10 | RUN wget https://poppler.freedesktop.org/poppler-data-${POPPLER_DATA_VERSION}.tar.gz \ 11 | && tar -xf poppler-data-${POPPLER_DATA_VERSION}.tar.gz \ 12 | && cd poppler-data-${POPPLER_DATA_VERSION} \ 13 | && make install \ 14 | && cd .. \ 15 | && wget https://poppler.freedesktop.org/poppler-${POPPLER_VERSION}.tar.xz \ 16 | && tar -xf poppler-${POPPLER_VERSION}.tar.xz \ 17 | && cd poppler-${POPPLER_VERSION} \ 18 | && mkdir build \ 19 | && cd build \ 20 | && cmake -DENABLE_BOOST=OFF ..\ 21 | && make \ 22 | && make install \ 23 | && ldconfig 24 | #CMD tail -f /dev/null 25 | 26 | FROM amd64/openjdk:11.0.8-slim-buster 27 | COPY --from=POPPLER_BUILDER /usr/lib /usr/lib 28 | COPY --from=POPPLER_BUILDER /usr/local /usr/local 29 | 30 | RUN apt-get update && apt-get install bash ca-certificates \ 31 | libjpeg62-turbo libcairo2 libxml2 \ 32 | fontconfig liblcms2-2 \ 33 | libtiff5 -y 34 | # &&\ 35 | #libopenjpeg5 36 | #libstdc++6 && \ 37 | #addgroup -S appgroup && \ 38 | #adduser -S appuser -G appgroup -h /work && \ 39 | #echo "/usr/local/lib64:/lib:/usr/local/lib:/usr/lib:/usr/lib/x86_64-linux-gnu:/lib64:/lib/x86_64-linux-gnu" > /etc/ld-musl-x86_64.path 40 | 41 | COPY target/pdftotext-1.0.0-SNAPSHOT.jar /pdftotext-1.0.0-SNAPSHOT.jar 42 | 43 | 44 | ENTRYPOINT ["java","-jar","/pdftotext-1.0.0-SNAPSHOT.jar"] 45 | #WORKDIR /work 46 | 47 | -------------------------------------------------------------------------------- 
/tool-runners/pdftotext/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Root logger option 2 | log4j.rootLogger=INFO, stdout 3 | 4 | # Direct log messages to stdout 5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 6 | log4j.appender.stdout.Target=System.out 7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n 9 | -------------------------------------------------------------------------------- /tool-runners/polyfile/Dockerfile: -------------------------------------------------------------------------------- 1 | # this should be cleaned up dramatically 2 | # I tried to build polyfile in a base container and then 3 | # copy the right bits into the final image, but I couldn't figure 4 | # out how to get all the dependencies...so this is backwards 5 | # from the other docker files: build the java first, then 6 | # copy that jar into the build container for polyfile. 7 | 8 | FROM python:3.10.4-alpine3.15 9 | RUN apk add --no-cache \ 10 | # git \ 11 | bash \ 12 | libffi-dev \ 13 | zlib \ 14 | build-base py-pip jpeg-dev zlib-dev \ 15 | openjdk11-jre 16 | # && git clone -b v0.1.6 https://github.com/trailofbits/polyfile.git 17 | 18 | 19 | ENV LIBRARY_PATH=/lib:/usr/lib 20 | 21 | #RUN cd polyfile && pip3 install -e . 
22 | 23 | RUN pip3 install polyfile==0.4.2 24 | 25 | COPY target/polyfile-1.0.0-SNAPSHOT.jar /polyfile-1.0.0-SNAPSHOT.jar 26 | 27 | ENTRYPOINT ["java","-jar","/polyfile-1.0.0-SNAPSHOT.jar"] 28 | -------------------------------------------------------------------------------- /tool-runners/polyfile/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Root logger option 2 | log4j.rootLogger=INFO, stdout 3 | 4 | # Direct log messages to stdout 5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 6 | log4j.appender.stdout.Target=System.out 7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n 9 | -------------------------------------------------------------------------------- /tool-runners/qpdf/Dockerfile: -------------------------------------------------------------------------------- 1 | #fortunately the latest is available prebuilt (for now) 2 | # for future reference, start with something like this 3 | #curl g++ \ 4 | ## && curl -o qpdf-10.0.1.tgz https://gigenet.dl.sourceforge.net/project/qpdf/qpdf/10.0.1/qpdf-10.0.1.tar.gz \ 5 | # # && tar -xzvf qpdf-10.0.1.tgz 6 | # 7 | ##RUN cd qpdf-10.0.1 && \ 8 | # # ./configure 9 | # 10 | ##RUN make install 11 | 12 | #alpine version dictates which qpdf version is available. 13 | #see e.g. https://pkgs.alpinelinux.org/packages?name=qpdf&branch=v3.13 14 | #to search for a match 15 | FROM alpine:edge 16 | RUN apk add --no-cache \ 17 | qpdf=11.1.1-r0 \ 18 | openjdk11-jre 19 | 20 | 21 | COPY target/qpdf-1.0.0-SNAPSHOT.jar /qpdf-1.0.0-SNAPSHOT.jar 22 | 23 | ENTRYPOINT ["java","-jar","/qpdf-1.0.0-SNAPSHOT.jar"] 24 | 25 | 26 | # e.g. 27 | # docker build -t qpdf-image . 
28 | 29 | # docker run --name qpdf-container --network host --env-file env.properties -v /data/docs:/input -v /data/meta/qpdf/json:/output qpdf-image -------------------------------------------------------------------------------- /tool-runners/qpdf/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Root logger option 2 | log4j.rootLogger=INFO, stdout 3 | 4 | # Direct log messages to stdout 5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 6 | log4j.appender.stdout.Target=System.out 7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n 9 | -------------------------------------------------------------------------------- /tool-runners/tika-client/src/main/resources/log4j2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /tool-runners/tika/src/main/resources/log4j2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /tool-runners/xpdffonts/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM amd64/openjdk:11.0.8-slim-buster 2 | 3 | 4 | RUN apt-get update && apt-get install -y wget ghostscript 5 | 6 | RUN mkdir /pkg && cd /pkg && \ 7 | wget https://dl.xpdfreader.com/xpdf-tools-linux-4.03.tar.gz && \ 8 | tar -xzvf xpdf-tools-linux-4.03.tar.gz && \ 9 | mv xpdf-tools-linux-4.03 /opt/xpdf-tools-linux-4.03 10 | 11 | RUN mkdir /usr/local/share/ghostscript && \ 12 | mkdir /usr/local/share/ghostscript/fonts 13 | 14 | COPY tgzs/xpdf-t1fonts/*.pfb /usr/local/share/ghostscript/fonts/ 15 | 16 | COPY 
xpdfrc /usr/local/etc/xpdfrc 17 | 18 | COPY xpdf /usr/local/share/xpdf 19 | 20 | 21 | ENV PATH "${PATH}:/opt/xpdf-tools-linux-4.03/bin64" 22 | 23 | 24 | COPY target/xpdffonts-1.0.0-SNAPSHOT.jar /xpdffonts-1.0.0-SNAPSHOT.jar 25 | 26 | 27 | ENTRYPOINT ["java","-jar","/xpdffonts-1.0.0-SNAPSHOT.jar"] 28 | #WORKDIR /work 29 | 30 | -------------------------------------------------------------------------------- /tool-runners/xpdffonts/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Root logger option 2 | log4j.rootLogger=INFO, stdout 3 | 4 | # Direct log messages to stdout 5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 6 | log4j.appender.stdout.Target=System.out 7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n 9 | -------------------------------------------------------------------------------- /tool-runners/xpdffonts/tgzs/xpdf-arabic.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tballison/file-observatory/5677da227b1a70b1220ae05005b6064210858504/tool-runners/xpdffonts/tgzs/xpdf-arabic.tar.gz -------------------------------------------------------------------------------- /tool-runners/xpdffonts/tgzs/xpdf-arabic/README: -------------------------------------------------------------------------------- 1 | Xpdf: Arabic support package 2 | ============================ 3 | 4 | Xpdf project: http://www.foolabs.com/xpdf/ 5 | 2011-aug-15 6 | 7 | If this package includes CMap files, they contain their own copyright 8 | notices and distribution conditions. All other files in the package 9 | are Copyright 2002 Glyph & Cog, LLC, and are licensed under the GNU 10 | General Public License (GPL), version 2 or 3. 11 | 12 | This package provides support files needed to use the Xpdf tools with 13 | Arabic PDF files. 
14 | 15 | Contents: 16 | - ISO-8859-6 encoding 17 | 18 | Place all of these files in a directory, typically: 19 | 20 | Unix - /usr/local/share/xpdf/arabic 21 | Win32 - C:\Program Files\xpdf\arabic 22 | 23 | Add the contents of the "add-to-xpdfrc" file to your system-wide 24 | xpdfrc config file, which is typically: 25 | 26 | Unix - /usr/local/etc/xpdfrc 27 | Win32 - C:\Program Files\xpdf\xpdfrc 28 | 29 | Alternatively, on Unix systems you can add these lines to your 30 | personal xpdfrc file in $HOME/.xpdfrc. 31 | 32 | Make sure to edit the added lines to use the actual directory where 33 | the files were installed. 34 | -------------------------------------------------------------------------------- /tool-runners/xpdffonts/tgzs/xpdf-arabic/add-to-xpdfrc: -------------------------------------------------------------------------------- 1 | #----- begin Arabic support package (2011-aug-15) 2 | unicodeMap ISO-8859-6 /usr/local/share/xpdf/arabic/ISO-8859-6.unicodeMap 3 | #----- end Arabic support package 4 | -------------------------------------------------------------------------------- /tool-runners/xpdffonts/tgzs/xpdf-chinese-simplified.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tballison/file-observatory/5677da227b1a70b1220ae05005b6064210858504/tool-runners/xpdffonts/tgzs/xpdf-chinese-simplified.tar.gz -------------------------------------------------------------------------------- /tool-runners/xpdffonts/tgzs/xpdf-chinese-simplified/README: -------------------------------------------------------------------------------- 1 | Xpdf: Chinese Simplified support package 2 | ======================================== 3 | 4 | Xpdf project: http://www.foolabs.com/xpdf/ 5 | 2020-dec-22 6 | 7 | If this package includes CMap files, they contain their own copyright 8 | notices and distribution conditions. 
All other files in the package 9 | are Copyright 2002-2004 Glyph & Cog, LLC, and are licensed under the 10 | GNU General Public License (GPL), version 2 or 3. 11 | 12 | This package provides support files needed to use the Xpdf tools with 13 | Chinese (Simplified) PDF files. 14 | 15 | Contents: 16 | - Adobe-GB1 character collection support 17 | - ISO-2022-CN encoding 18 | - EUC-CN encoding 19 | - GBK encoding 20 | 21 | Place all of these files in a directory, typically: 22 | 23 | Unix - /usr/local/share/xpdf/chinese-simplified 24 | Win32 - C:\Program Files\xpdf\chinese-simplified 25 | 26 | Add the contents of the "add-to-xpdfrc" file to your system-wide 27 | xpdfrc config file, which is typically: 28 | 29 | Unix - /usr/local/etc/xpdfrc 30 | Win32 - C:\Program Files\xpdf\xpdfrc 31 | 32 | Alternatively, on Unix systems you can add these lines to your 33 | personal xpdfrc file in $HOME/.xpdfrc. 34 | 35 | Make sure to edit the added lines to use the actual directory where 36 | the files were installed. 37 | 38 | To display PDF files that refer to non-embedded Chinese fonts, you 39 | will need to install a Chinese font. Free TrueType/OpenType fonts are 40 | available: 41 | 42 | http://ftp.gnu.org/gnu/non-gnu/chinese-fonts-truetype/gkai00mp.ttf.gz 43 | http://ftp.gnu.org/gnu/non-gnu/chinese-fonts-truetype/gbsn00lp.ttf.gz 44 | https://www.google.com/get/noto/ 45 | 46 | After installing a Chinese font, add an appropriate "fontFileCC" line 47 | to your xpdfrc file (see the sample in "add-to-xpdfrc"). 
48 | -------------------------------------------------------------------------------- /tool-runners/xpdffonts/tgzs/xpdf-chinese-simplified/add-to-xpdfrc: -------------------------------------------------------------------------------- 1 | #----- begin Chinese Simplified support package (2011-sep-02) 2 | cidToUnicode Adobe-GB1 /usr/local/share/xpdf/chinese-simplified/Adobe-GB1.cidToUnicode 3 | unicodeMap ISO-2022-CN /usr/local/share/xpdf/chinese-simplified/ISO-2022-CN.unicodeMap 4 | unicodeMap EUC-CN /usr/local/share/xpdf/chinese-simplified/EUC-CN.unicodeMap 5 | unicodeMap GBK /usr/local/share/xpdf/chinese-simplified/GBK.unicodeMap 6 | cMapDir Adobe-GB1 /usr/local/share/xpdf/chinese-simplified/CMap 7 | toUnicodeDir /usr/local/share/xpdf/chinese-simplified/CMap 8 | #fontFileCC Adobe-GB1 /usr/..../NotoSansCJKsc-Regular.otf 9 | #----- end Chinese Simplified support package 10 | -------------------------------------------------------------------------------- /tool-runners/xpdffonts/tgzs/xpdf-chinese-traditional.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tballison/file-observatory/5677da227b1a70b1220ae05005b6064210858504/tool-runners/xpdffonts/tgzs/xpdf-chinese-traditional.tar.gz -------------------------------------------------------------------------------- /tool-runners/xpdffonts/tgzs/xpdf-chinese-traditional/README: -------------------------------------------------------------------------------- 1 | Xpdf: Chinese Traditional support package 2 | ========================================= 3 | 4 | Xpdf project: http://www.foolabs.com/xpdf/ 5 | 2020-dec-22 6 | 7 | If this package includes CMap files, they contain their own copyright 8 | notices and distribution conditions. All other files in the package 9 | are Copyright 2002-2004 Glyph & Cog, LLC, and are licensed under the 10 | GNU General Public License (GPL), version 2 or 3. 
11 | 12 | This package provides support files needed to use the Xpdf tools with 13 | Chinese (Traditional) PDF files. 14 | 15 | Contents: 16 | - Adobe-CNS1 character collection support 17 | - Big5 encoding 18 | - Big5ascii encoding (same as Big5, but includes 7-bit ASCII) 19 | 20 | Place all of these files in a directory, typically: 21 | 22 | Unix - /usr/local/share/xpdf/chinese-traditional 23 | Win32 - C:\Program Files\xpdf\chinese-traditional 24 | 25 | Add the contents of the "add-to-xpdfrc" file to your system-wide 26 | xpdfrc config file, which is typically: 27 | 28 | Unix - /usr/local/etc/xpdfrc 29 | Win32 - C:\Program Files\xpdf\xpdfrc 30 | 31 | Alternatively, on Unix systems you can add these lines to your 32 | personal xpdfrc file in $HOME/.xpdfrc. 33 | 34 | Make sure to edit the added lines to use the actual directory where 35 | the files were installed. 36 | 37 | To display PDF files that refer to non-embedded Chinese fonts, you 38 | will need to install a Chinese font. Free TrueType/OpenType fonts are 39 | available: 40 | 41 | http://ftp.gnu.org/gnu/non-gnu/chinese-fonts-truetype/bkai00mp.ttf.gz 42 | http://ftp.gnu.org/gnu/non-gnu/chinese-fonts-truetype/bsmi00lp.ttf.gz 43 | https://www.google.com/get/noto/ 44 | 45 | After installing a Chinese font, add an appropriate "fontFileCC" line 46 | to your xpdfrc file (see the sample in "add-to-xpdfrc"). 
47 | -------------------------------------------------------------------------------- /tool-runners/xpdffonts/tgzs/xpdf-chinese-traditional/add-to-xpdfrc: -------------------------------------------------------------------------------- 1 | #----- begin Chinese Traditional support package (2011-sep-02) 2 | cidToUnicode Adobe-CNS1 /usr/local/share/xpdf/chinese-traditional/Adobe-CNS1.cidToUnicode 3 | unicodeMap Big5 /usr/local/share/xpdf/chinese-traditional/Big5.unicodeMap 4 | unicodeMap Big5ascii /usr/local/share/xpdf/chinese-traditional/Big5ascii.unicodeMap 5 | cMapDir Adobe-CNS1 /usr/local/share/xpdf/chinese-traditional/CMap 6 | toUnicodeDir /usr/local/share/xpdf/chinese-traditional/CMap 7 | #fontFileCC Adobe-CNS1 /usr/..../NotoSansCJKtc-Regular.otf 8 | #----- end Chinese Traditional support package 9 | -------------------------------------------------------------------------------- /tool-runners/xpdffonts/tgzs/xpdf-cyrillic.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tballison/file-observatory/5677da227b1a70b1220ae05005b6064210858504/tool-runners/xpdffonts/tgzs/xpdf-cyrillic.tar.gz -------------------------------------------------------------------------------- /tool-runners/xpdffonts/tgzs/xpdf-cyrillic/README: -------------------------------------------------------------------------------- 1 | Xpdf: Cyrillic support package 2 | ============================== 3 | 4 | Xpdf project: http://www.foolabs.com/xpdf/ 5 | 2011-aug-15 6 | 7 | If this package includes CMap files, they contain their own copyright 8 | notices and distribution conditions. All other files in the package 9 | are Copyright 2002 Glyph & Cog, LLC, and are licensed under the GNU 10 | General Public License (GPL), version 2 or 3. 11 | 12 | This package provides support files needed to use the Xpdf tools with 13 | Cyrillic PDF files. 
14 | 15 | Contents: 16 | - Bulgarian character names 17 | - KOI8-R encoding 18 | 19 | Place all of these files in a directory, typically: 20 | 21 | Unix - /usr/local/share/xpdf/cyrillic 22 | Win32 - C:\Program Files\xpdf\cyrillic 23 | 24 | Add the contents of the "add-to-xpdfrc" file to your system-wide 25 | xpdfrc config file, which is typically: 26 | 27 | Unix - /usr/local/etc/xpdfrc 28 | Win32 - C:\Program Files\xpdf\xpdfrc 29 | 30 | Alternatively, on Unix systems you can add these lines to your 31 | personal xpdfrc file in $HOME/.xpdfrc. 32 | 33 | Make sure to edit the added lines to use the actual directory where 34 | the files were installed. 35 | -------------------------------------------------------------------------------- /tool-runners/xpdffonts/tgzs/xpdf-cyrillic/add-to-xpdfrc: -------------------------------------------------------------------------------- 1 | #----- begin Cyrillic support package (2011-aug-15) 2 | nameToUnicode /usr/local/share/xpdf/cyrillic/Bulgarian.nameToUnicode 3 | unicodeMap KOI8-R /usr/local/share/xpdf/cyrillic/KOI8-R.unicodeMap 4 | #----- end Cyrillic support package 5 | -------------------------------------------------------------------------------- /tool-runners/xpdffonts/tgzs/xpdf-greek.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tballison/file-observatory/5677da227b1a70b1220ae05005b6064210858504/tool-runners/xpdffonts/tgzs/xpdf-greek.tar.gz -------------------------------------------------------------------------------- /tool-runners/xpdffonts/tgzs/xpdf-greek/README: -------------------------------------------------------------------------------- 1 | Xpdf: Greek support package 2 | =========================== 3 | 4 | Xpdf project: http://www.foolabs.com/xpdf/ 5 | 2011-aug-15 6 | 7 | If this package includes CMap files, they contain their own copyright 8 | notices and distribution conditions. 
All other files in the package 9 | are Copyright 2002 Glyph & Cog, LLC, and are licensed under the GNU 10 | General Public License (GPL), version 2 or 3. 11 | 12 | This package provides support files needed to use the Xpdf tools with 13 | Greek PDF files. 14 | 15 | Contents: 16 | - Greek character names (alternates) 17 | - ISO-8859-7 encoding 18 | 19 | Place all of these files in a directory, typically: 20 | 21 | Unix - /usr/local/share/xpdf/greek 22 | Win32 - C:\Program Files\xpdf\greek 23 | 24 | Add the contents of the "add-to-xpdfrc" file to your system-wide 25 | xpdfrc config file, which is typically: 26 | 27 | Unix - /usr/local/etc/xpdfrc 28 | Win32 - C:\Program Files\xpdf\xpdfrc 29 | 30 | Alternatively, on Unix systems you can add these lines to your 31 | personal xpdfrc file in $HOME/.xpdfrc. 32 | 33 | Make sure to edit the added lines to use the actual directory where 34 | the files were installed. 35 | -------------------------------------------------------------------------------- /tool-runners/xpdffonts/tgzs/xpdf-greek/add-to-xpdfrc: -------------------------------------------------------------------------------- 1 | #----- begin Greek support package (2011-aug-15) 2 | nameToUnicode /usr/local/share/xpdf/greek/Greek.nameToUnicode 3 | unicodeMap ISO-8859-7 /usr/local/share/xpdf/greek/ISO-8859-7.unicodeMap 4 | #----- end Greek support package 5 | -------------------------------------------------------------------------------- /tool-runners/xpdffonts/tgzs/xpdf-hebrew.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tballison/file-observatory/5677da227b1a70b1220ae05005b6064210858504/tool-runners/xpdffonts/tgzs/xpdf-hebrew.tar.gz -------------------------------------------------------------------------------- /tool-runners/xpdffonts/tgzs/xpdf-hebrew/README: -------------------------------------------------------------------------------- 1 | Xpdf: Hebrew support package 2 | 
============================ 3 | 4 | Xpdf project: http://www.foolabs.com/xpdf/ 5 | 2011-aug-15 6 | 7 | If this package includes CMap files, they contain their own copyright 8 | notices and distribution conditions. All other files in the package 9 | are Copyright 2002 Glyph & Cog, LLC, and are licensed under the GNU 10 | General Public License (GPL), version 2 or 3. 11 | 12 | This package provides support files needed to use the Xpdf tools with 13 | Hebrew PDF files. 14 | 15 | Contents: 16 | - ISO-8859-8 encoding 17 | - Windows-1255 encoding 18 | 19 | Place all of these files in a directory, typically: 20 | 21 | Unix - /usr/local/share/xpdf/hebrew 22 | Win32 - C:\Program Files\xpdf\hebrew 23 | 24 | Add the contents of the "add-to-xpdfrc" file to your system-wide 25 | xpdfrc config file, which is typically: 26 | 27 | Unix - /usr/local/etc/xpdfrc 28 | Win32 - C:\Program Files\xpdf\xpdfrc 29 | 30 | Alternatively, on Unix systems you can add these lines to your 31 | personal xpdfrc file in $HOME/.xpdfrc. 32 | 33 | Make sure to edit the added lines to use the actual directory where 34 | the files were installed. 
35 | -------------------------------------------------------------------------------- /tool-runners/xpdffonts/tgzs/xpdf-hebrew/add-to-xpdfrc: -------------------------------------------------------------------------------- 1 | #----- begin Hebrew support package (2011-aug-15) 2 | unicodeMap ISO-8859-8 /usr/local/share/xpdf/hebrew/ISO-8859-8.unicodeMap 3 | unicodeMap Windows-1255 /usr/local/share/xpdf/hebrew/Windows-1255.unicodeMap 4 | #----- end Hebrew support package 5 | -------------------------------------------------------------------------------- /tool-runners/xpdffonts/tgzs/xpdf-japanese.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tballison/file-observatory/5677da227b1a70b1220ae05005b6064210858504/tool-runners/xpdffonts/tgzs/xpdf-japanese.tar.gz -------------------------------------------------------------------------------- /tool-runners/xpdffonts/tgzs/xpdf-japanese/README: -------------------------------------------------------------------------------- 1 | Xpdf: Japanese support package 2 | ============================== 3 | 4 | Xpdf project: http://www.foolabs.com/xpdf/ 5 | 2020-dec-22 6 | 7 | If this package includes CMap files, they contain their own copyright 8 | notices and distribution conditions. All other files in the package 9 | are Copyright 2002-2004 Glyph & Cog, LLC, and are licensed under the 10 | GNU General Public License (GPL), version 2 or 3. 11 | 12 | This package provides support files needed to use the Xpdf tools with 13 | Japanese PDF files. 
14 | 15 | Contents: 16 | - Adobe-Japan1 character collection support 17 | - ISO-2022-JP encoding 18 | - EUC-JP encoding 19 | - Shift-JIS encoding 20 | 21 | Place all of these files in a directory, typically: 22 | 23 | Unix - /usr/local/share/xpdf/japanese 24 | Win32 - C:\Program Files\xpdf\japanese 25 | 26 | Add the contents of the "add-to-xpdfrc" file to your system-wide 27 | xpdfrc config file, which is typically: 28 | 29 | Unix - /usr/local/etc/xpdfrc 30 | Win32 - C:\Program Files\xpdf\xpdfrc 31 | 32 | Alternatively, on Unix systems you can add these lines to your 33 | personal xpdfrc file in $HOME/.xpdfrc. 34 | 35 | Make sure to edit the added lines to use the actual directory where 36 | the files were installed. 37 | 38 | To display PDF files that refer to non-embedded Japanese fonts, you 39 | will need to install a Japanese font. Free TrueType/OpenType fonts 40 | are available: 41 | 42 | http://packages.debian.org/stable/x11/ttf-kochi-mincho 43 | http://packages.debian.org/stable/x11/ttf-kochi-gothic 44 | https://www.google.com/get/noto/ 45 | 46 | After installing a Japanese font, add an appropriate "fontFileCC" line 47 | to your xpdfrc file (see the sample in "add-to-xpdfrc"). 
48 | -------------------------------------------------------------------------------- /tool-runners/xpdffonts/tgzs/xpdf-japanese/add-to-xpdfrc: -------------------------------------------------------------------------------- 1 | #----- begin Japanese support package (2011-sep-02) 2 | cidToUnicode Adobe-Japan1 /usr/local/share/xpdf/japanese/Adobe-Japan1.cidToUnicode 3 | unicodeMap ISO-2022-JP /usr/local/share/xpdf/japanese/ISO-2022-JP.unicodeMap 4 | unicodeMap EUC-JP /usr/local/share/xpdf/japanese/EUC-JP.unicodeMap 5 | unicodeMap Shift-JIS /usr/local/share/xpdf/japanese/Shift-JIS.unicodeMap 6 | cMapDir Adobe-Japan1 /usr/local/share/xpdf/japanese/CMap 7 | toUnicodeDir /usr/local/share/xpdf/japanese/CMap 8 | #fontFileCC Adobe-Japan1 /usr/..../NotoSansCJKjp-Regular.otf 9 | #----- end Japanese support package 10 | -------------------------------------------------------------------------------- /tool-runners/xpdffonts/tgzs/xpdf-korean.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tballison/file-observatory/5677da227b1a70b1220ae05005b6064210858504/tool-runners/xpdffonts/tgzs/xpdf-korean.tar.gz -------------------------------------------------------------------------------- /tool-runners/xpdffonts/tgzs/xpdf-korean/README: -------------------------------------------------------------------------------- 1 | Xpdf: Korean support package 2 | ============================ 3 | 4 | Xpdf project: http://www.foolabs.com/xpdf/ 5 | 2020-dec-22 6 | 7 | If this package includes CMap files, they contain their own copyright 8 | notices and distribution conditions. All other files in the package 9 | are Copyright 2002-2005 Glyph & Cog, LLC, and are licensed under the 10 | GNU General Public License (GPL), version 2 or 3. 11 | 12 | This package provides support files needed to use the Xpdf tools with 13 | Korean PDF files. 
14 | 15 | Contents: 16 | - Adobe-Korea1 character collection support 17 | - Adobe-KR character collection support 18 | - ISO-2022-KR encoding 19 | 20 | Place all of these files in a directory, typically: 21 | 22 | Unix - /usr/local/share/xpdf/korean 23 | Win32 - C:\Program Files\xpdf\korean 24 | 25 | Add the contents of the "add-to-xpdfrc" file to your system-wide 26 | xpdfrc config file, which is typically: 27 | 28 | Unix - /usr/local/etc/xpdfrc 29 | Win32 - C:\Program Files\xpdf\xpdfrc 30 | 31 | Alternatively, on Unix systems you can add these lines to your 32 | personal xpdfrc file in $HOME/.xpdfrc. 33 | 34 | Make sure to edit the added lines to use the actual directory where 35 | the files were installed. 36 | 37 | To display PDF files that refer to non-embedded Korean fonts, you will 38 | need to install a Korean font. Free TrueType/OpenType fonts are 39 | available: 40 | 41 | ftp://ftp.mizi.com/pub/baekmuk/baekmuk-ttf-2.1.tar.gz 42 | https://www.google.com/get/noto/ 43 | 44 | After installing a Korean font, add appropriate "fontFileCC" 45 | lines to your xpdfrc file (see the sample in "add-to-xpdfrc"). 
46 | -------------------------------------------------------------------------------- /tool-runners/xpdffonts/tgzs/xpdf-korean/add-to-xpdfrc: -------------------------------------------------------------------------------- 1 | #----- begin Korean support package (2011-sep-02) 2 | cidToUnicode Adobe-Korea1 /usr/local/share/xpdf/korean/Adobe-Korea1.cidToUnicode 3 | cidToUnicode Adobe-KR /usr/local/share/xpdf/korean/Adobe-KR.cidToUnicode 4 | unicodeMap ISO-2022-KR /usr/local/share/xpdf/korean/ISO-2022-KR.unicodeMap 5 | cMapDir Adobe-Korea1 /usr/local/share/xpdf/korean/CMap 6 | cMapDir Adobe-KR /usr/local/share/xpdf/korean/CMap 7 | toUnicodeDir /usr/local/share/xpdf/korean/CMap 8 | #fontFileCC Adobe-Korea1 /usr/..../NotoSansCJKkr-Regular.otf 9 | #fontFileCC Adobe-KR /usr/..../NotoSansCJKkr-Regular.otf 10 | #----- end Korean support package 11 | -------------------------------------------------------------------------------- /tool-runners/xpdffonts/tgzs/xpdf-latin2.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tballison/file-observatory/5677da227b1a70b1220ae05005b6064210858504/tool-runners/xpdffonts/tgzs/xpdf-latin2.tar.gz -------------------------------------------------------------------------------- /tool-runners/xpdffonts/tgzs/xpdf-latin2/README: -------------------------------------------------------------------------------- 1 | Xpdf: Latin2 support package 2 | ============================ 3 | 4 | Xpdf project: http://www.foolabs.com/xpdf/ 5 | 2011-aug-15 6 | 7 | If this package includes CMap files, they contain their own copyright 8 | notices and distribution conditions. All other files in the package 9 | are Copyright 2002 Glyph & Cog, LLC, and are licensed under the GNU 10 | General Public License (GPL), version 2 or 3. 11 | 12 | This package provides support files needed to use the Xpdf tools with 13 | Latin2 PDF files. 
14 | 15 | Contents: 16 | - Latin2 encoding 17 | 18 | Place all of these files in a directory, typically: 19 | 20 | Unix - /usr/local/share/xpdf/latin2 21 | Win32 - C:\Program Files\xpdf\latin2 22 | 23 | Add the contents of the "add-to-xpdfrc" file to your system-wide 24 | xpdfrc config file, which is typically: 25 | 26 | Unix - /usr/local/etc/xpdfrc 27 | Win32 - C:\Program Files\xpdf\xpdfrc 28 | 29 | Alternatively, on Unix systems you can add these lines to your 30 | personal xpdfrc file in $HOME/.xpdfrc. 31 | 32 | Make sure to edit the added lines to use the actual directory where 33 | the files were installed. 34 | -------------------------------------------------------------------------------- /tool-runners/xpdffonts/tgzs/xpdf-latin2/add-to-xpdfrc: -------------------------------------------------------------------------------- 1 | #----- begin Latin2 support package (2011-aug-15) 2 | unicodeMap Latin2 /usr/local/share/xpdf/latin2/Latin2.unicodeMap 3 | #----- end Latin2 support package 4 | -------------------------------------------------------------------------------- /tool-runners/xpdffonts/tgzs/xpdf-t1fonts.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tballison/file-observatory/5677da227b1a70b1220ae05005b6064210858504/tool-runners/xpdffonts/tgzs/xpdf-t1fonts.tar.gz -------------------------------------------------------------------------------- /tool-runners/xpdffonts/tgzs/xpdf-t1fonts/README: -------------------------------------------------------------------------------- 1 | This package contains two fonts: 2 | 3 | s050000l.pfb -- Symbol 4 | d050000l.pfb -- Zapf Dingbats 5 | 6 | These fonts are substitutes for the corresponding Base-14 fonts. They 7 | are part of the font set contributed to the ghostscript project by 8 | URW++ Design and Development Incorporated of Hamburg, Germany 9 | (http://www.urwpp.de/). 
They have been released under the GNU General 10 | Public License (GPL) v2 -- see the "COPYING" file. 11 | -------------------------------------------------------------------------------- /tool-runners/xpdffonts/tgzs/xpdf-t1fonts/d050000l.pfb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tballison/file-observatory/5677da227b1a70b1220ae05005b6064210858504/tool-runners/xpdffonts/tgzs/xpdf-t1fonts/d050000l.pfb -------------------------------------------------------------------------------- /tool-runners/xpdffonts/tgzs/xpdf-t1fonts/s050000l.pfb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tballison/file-observatory/5677da227b1a70b1220ae05005b6064210858504/tool-runners/xpdffonts/tgzs/xpdf-t1fonts/s050000l.pfb -------------------------------------------------------------------------------- /tool-runners/xpdffonts/tgzs/xpdf-thai.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tballison/file-observatory/5677da227b1a70b1220ae05005b6064210858504/tool-runners/xpdffonts/tgzs/xpdf-thai.tar.gz -------------------------------------------------------------------------------- /tool-runners/xpdffonts/tgzs/xpdf-thai/README: -------------------------------------------------------------------------------- 1 | Xpdf: Thai support package 2 | ========================== 3 | 4 | Xpdf project: http://www.foolabs.com/xpdf/ 5 | 2011-aug-15 6 | 7 | If this package includes CMap files, they contain their own copyright 8 | notices and distribution conditions. All other files in the package 9 | are Copyright 2002 Glyph & Cog, LLC, and are licensed under the GNU 10 | General Public License (GPL), version 2 or 3. 11 | 12 | This package provides support files needed to use the Xpdf tools with 13 | Thai PDF files. 
14 | 15 | Contents: 16 | - Thai character names 17 | - TIS-620 encoding 18 | 19 | Place all of these files in a directory, typically: 20 | 21 | Unix - /usr/local/share/xpdf/thai 22 | Win32 - C:\Program Files\xpdf\thai 23 | 24 | Add the contents of the "add-to-xpdfrc" file to your system-wide 25 | xpdfrc config file, which is typically: 26 | 27 | Unix - /usr/local/etc/xpdfrc 28 | Win32 - C:\Program Files\xpdf\xpdfrc 29 | 30 | Alternatively, on Unix systems you can add these lines to your 31 | personal xpdfrc file in $HOME/.xpdfrc. 32 | 33 | Make sure to edit the added lines to use the actual directory where 34 | the files were installed. 35 | -------------------------------------------------------------------------------- /tool-runners/xpdffonts/tgzs/xpdf-thai/add-to-xpdfrc: -------------------------------------------------------------------------------- 1 | #----- begin Thai support package (2011-aug-15) 2 | nameToUnicode /usr/local/share/xpdf/thai/Thai.nameToUnicode 3 | unicodeMap TIS-620 /usr/local/share/xpdf/thai/TIS-620.unicodeMap 4 | #----- end Thai support package 5 | -------------------------------------------------------------------------------- /tool-runners/xpdffonts/tgzs/xpdf-turkish.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tballison/file-observatory/5677da227b1a70b1220ae05005b6064210858504/tool-runners/xpdffonts/tgzs/xpdf-turkish.tar.gz -------------------------------------------------------------------------------- /tool-runners/xpdffonts/tgzs/xpdf-turkish/README: -------------------------------------------------------------------------------- 1 | Xpdf: Turkish support package 2 | ============================= 3 | 4 | Xpdf project: http://www.foolabs.com/xpdf/ 5 | 2011-aug-15 6 | 7 | If this package includes CMap files, they contain their own copyright 8 | notices and distribution conditions. 
All other files in the package 9 | are Copyright 2002 Glyph & Cog, LLC, and are licensed under the GNU 10 | General Public License (GPL), version 2 or 3. 11 | 12 | This package provides support files needed to use the Xpdf tools with 13 | Turkish PDF files. 14 | 15 | Contents: 16 | - ISO-8859-9 encoding 17 | 18 | Place all of these files in a directory, typically: 19 | 20 | Unix - /usr/local/share/xpdf/turkish 21 | Win32 - C:\Program Files\xpdf\turkish 22 | 23 | Add the contents of the "add-to-xpdfrc" file to your system-wide 24 | xpdfrc config file, which is typically: 25 | 26 | Unix - /usr/local/etc/xpdfrc 27 | Win32 - C:\Program Files\xpdf\xpdfrc 28 | 29 | Alternatively, on Unix systems you can add these lines to your 30 | personal xpdfrc file in $HOME/.xpdfrc. 31 | 32 | Make sure to edit the added lines to use the actual directory where 33 | the files were installed. 34 | -------------------------------------------------------------------------------- /tool-runners/xpdffonts/tgzs/xpdf-turkish/add-to-xpdfrc: -------------------------------------------------------------------------------- 1 | #----- begin Turkish support package (2011-aug-15) 2 | unicodeMap ISO-8859-9 /usr/local/share/xpdf/turkish/ISO-8859-9.unicodeMap 3 | #----- end Turkish support package 4 | -------------------------------------------------------------------------------- /tool-runners/xpdffonts/xpdf/arabic/ISO-8859-6.unicodeMap: -------------------------------------------------------------------------------- 1 | 000a 000a 0a 2 | 000c 000d 0c 3 | 0020 007e 20 4 | 00a0 00a0 20 5 | 00a4 a4 6 | 00ad ad 7 | 02c6 5e 8 | 02dc 7e 9 | 060c 060c ac 10 | 061b 061b bb 11 | 061f 061f bf 12 | 0621 063a c1 13 | 0640 0652 e0 14 | 2013 2013 ad 15 | 2014 2014 2d2d 16 | 2018 2018 60 17 | 2019 2019 27 18 | 201a 201a 2c 19 | 201c 201c 22 20 | 201d 201d 22 21 | 201e 201e 2c2c 22 | 2026 2026 2e2e2e 23 | 2039 2039 3c 24 | 203a 203a 3e 25 | 2044 2044 2f 26 | 2122 2122 544d 27 | 2212 2212 2d 28 | f6f9 f6f9 4c 29 | f6fe 
f6fe 7e 30 | f721 f721 21 31 | f724 f724 24 32 | f726 f726 26 33 | f730 f739 30 34 | f73f f73f 3f 35 | f761 f77a 41 36 | fb00 fb00 6666 37 | fb01 fb01 6669 38 | fb02 fb02 666c 39 | fb03 fb03 666669 40 | fb04 fb04 66666c 41 | -------------------------------------------------------------------------------- /tool-runners/xpdffonts/xpdf/chinese-simplified/CMap/GBpc-EUC-UCS2: -------------------------------------------------------------------------------- 1 | %!PS-Adobe-3.0 Resource-CMap %%DocumentNeededResources: ProcSet (CIDInit) %%DocumentNeededResources: CMap (GBpc-EUC-UCS2C) %%IncludeResource: ProcSet (CIDInit) %%IncludeResource: CMap (GBpc-EUC-UCS2C) %%BeginResource: CMap (GBpc-EUC-UCS2) %%Title: (GBpc-EUC-UCS2) %%Version: 4.002 %%Copyright: ----------------------------------------------------------- %%Copyright: Copyright 1990-1997 Adobe Systems Incorporated. %%Copyright: All Rights Reserved. %%Copyright: %%Copyright: Patents Pending %%Copyright: %%Copyright: NOTICE: All information contained herein is the property %%Copyright: of Adobe Systems Incorporated. %%Copyright: %%Copyright: Permission is granted for redistribution of this file %%Copyright: provided this copyright notice is maintained intact and %%Copyright: that the contents of this file are not altered in any %%Copyright: way from its original form. %%Copyright: %%Copyright: PostScript and Display PostScript are trademarks of %%Copyright: Adobe Systems Incorporated which may be registered in %%Copyright: certain jurisdictions. 
%%Copyright: ----------------------------------------------------------- %%EndComments /CIDInit /ProcSet findresource begin 12 dict begin begincmap /GBpc-EUC-UCS2C usecmap /CIDSystemInfo 3 dict dup begin /Registry (Adobe) def /Ordering (GBpc_EUC_UCS2) def /Supplement 2 def end def /CMapName /GBpc-EUC-UCS2 def /CMapVersion 4.002 def /CMapType 1 def /WMode 0 def 1 beginbfrange <006e0300> endbfrange endcmap CMapName currentdict /CMap defineresource pop end end %%EndResource %%EOF -------------------------------------------------------------------------------- /tool-runners/xpdffonts/xpdf/chinese-simplified/CMap/LICENSE.md: -------------------------------------------------------------------------------- 1 | Copyright 1990-2019 Adobe. All rights reserved. 2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions are 5 | met: 6 | 7 | Redistributions of source code must retain the above copyright notice, 8 | this list of conditions and the following disclaimer. 9 | 10 | Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 13 | 14 | Neither the name of Adobe nor the names of its contributors may be 15 | used to endorse or promote products derived from this software without 16 | specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 22 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | -------------------------------------------------------------------------------- /tool-runners/xpdffonts/xpdf/chinese-traditional/CMap/B5pc-UCS2: -------------------------------------------------------------------------------- 1 | %!PS-Adobe-3.0 Resource-CMap %%DocumentNeededResources: ProcSet (CIDInit) %%DocumentNeededResources: CMap (B5pc-UCS2) %%IncludeResource: ProcSet (CIDInit) %%IncludeResource: CMap (B5pc-UCS2C) %%BeginResource: CMap (B5pc-UCS2) %%Title: (B5pc-UCS2) %%Version: 4.002 %%Copyright: ----------------------------------------------------------- %%Copyright: Copyright 1990-1997 Adobe Systems Incorporated. %%Copyright: All Rights Reserved. %%Copyright: %%Copyright: Patents Pending %%Copyright: %%Copyright: NOTICE: All information contained herein is the property %%Copyright: of Adobe Systems Incorporated. %%Copyright: %%Copyright: Permission is granted for redistribution of this file %%Copyright: provided this copyright notice is maintained intact and %%Copyright: that the contents of this file are not altered in any %%Copyright: way from its original form. %%Copyright: %%Copyright: PostScript and Display PostScript are trademarks of %%Copyright: Adobe Systems Incorporated which may be registered in %%Copyright: certain jurisdictions. 
%%Copyright: ----------------------------------------------------------- %%EndComments /CIDInit /ProcSet findresource begin 12 dict begin begincmap /B5pc-UCS2C usecmap /CIDSystemInfo 3 dict dup begin /Registry (Adobe) def /Ordering (B5pc_UCS2) def /Supplement 0 def end def /CMapName /B5pc-UCS2 def /CMapVersion 4.002 def /CMapType 1 def /WMode 0 def endcmap CMapName currentdict /CMap defineresource pop end end %%EndResource %%EOF -------------------------------------------------------------------------------- /tool-runners/xpdffonts/xpdf/chinese-traditional/CMap/LICENSE.md: -------------------------------------------------------------------------------- 1 | Copyright 1990-2019 Adobe. All rights reserved. 2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions are 5 | met: 6 | 7 | Redistributions of source code must retain the above copyright notice, 8 | this list of conditions and the following disclaimer. 9 | 10 | Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 13 | 14 | Neither the name of Adobe nor the names of its contributors may be 15 | used to endorse or promote products derived from this software without 16 | specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 22 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | -------------------------------------------------------------------------------- /tool-runners/xpdffonts/xpdf/cyrillic/Bulgarian.nameToUnicode: -------------------------------------------------------------------------------- 1 | 0410 As 2 | 0411 Buki 3 | 0412 Wjedi 4 | 0413 Glagol 5 | 0414 Dobro 6 | 0415 Jest 7 | 0416 Schiwete 8 | 0417 Selmja 9 | 0418 Ische 10 | 0419 Ischebreve 11 | 041a Kako 12 | 041b Ljudi 13 | 041c Muislete 14 | 041d Nasche 15 | 041e On 16 | 041f Pakoj 17 | 0420 Rzui 18 | 0421 Slovo 19 | 0422 Twerdo 20 | 0423 Uk 21 | 0424 Fert 22 | 0425 Cherr 23 | 0426 Zui 24 | 0427 Tscherw 25 | 0428 Scha 26 | 0429 Schtscha 27 | 042a Jerr 28 | 042e Ju 29 | 042f Ja 30 | 0430 as 31 | 0431 buki 32 | 0432 wjedi 33 | 0433 glagol 34 | 0434 dobro 35 | 0435 jest 36 | 0436 schiwete 37 | 0437 selmja 38 | 0438 ische 39 | 0439 ischebreve 40 | 043a kako 41 | 043b ljudi 42 | 043c muislete 43 | 043d nasche 44 | 043e on 45 | 043f pakoj 46 | 0440 rzui 47 | 0441 slovo 48 | 0442 twerdo 49 | 0443 uk 50 | 0444 fert 51 | 0445 cherr 52 | 0446 zui 53 | 0447 tscherw 54 | 0448 scha 55 | 0449 schtscha 56 | 044a jerr 57 | 044e ju 58 | 044f ja 59 | -------------------------------------------------------------------------------- /tool-runners/xpdffonts/xpdf/cyrillic/KOI8-R.unicodeMap: -------------------------------------------------------------------------------- 1 | 000a 0a 2 | 000c 000d 0c 3 | 0020 007e 20 4 | 00a0 9a 5 | 00a9 bf 6 | 00b0 9c 7 
| 00b2 9d 8 | 00b7 9e 9 | 00f7 9f 10 | 02c6 5e 11 | 02da 9c 12 | 02dc 7e 13 | 0401 b3 14 | 0410 0411 e1 15 | 0412 f7 16 | 0413 e7 17 | 0414 0415 e4 18 | 0416 f6 19 | 0417 fa 20 | 0418 041f e9 21 | 0420 0423 f2 22 | 0424 e6 23 | 0425 e8 24 | 0426 e3 25 | 0427 fe 26 | 0428 fb 27 | 0429 fd 28 | 042a ff 29 | 042b f9 30 | 042c f8 31 | 042d fc 32 | 042e e0 33 | 042f f1 34 | 0430 0431 c1 35 | 0432 d7 36 | 0433 c7 37 | 0434 0435 c4 38 | 0436 d6 39 | 0437 da 40 | 0438 c9 41 | 0439 043f ca 42 | 0440 0443 d2 43 | 0444 c6 44 | 0445 c8 45 | 0446 c3 46 | 0447 de 47 | 0448 db 48 | 0449 dd 49 | 044a df 50 | 044b d9 51 | 044c d8 52 | 044d dc 53 | 044e c0 54 | 044f d1 55 | 0451 a3 56 | 2013 2d 57 | 2014 2d2d 58 | 2018 60 59 | 2019 27 60 | 201a 2c 61 | 201c 22 62 | 201d 22 63 | 201e 2c2c 64 | 2022 9e 65 | 2026 2e2e2e 66 | 2039 3c 67 | 203a 3e 68 | 2044 2f 69 | 2122 544d 70 | 2212 2d 71 | 2219 221a 95 72 | 2248 97 73 | 2264 2265 98 74 | 2320 93 75 | 2321 9b 76 | 2500 80 77 | 2502 81 78 | 250c 82 79 | 2510 83 80 | 2514 84 81 | 2518 85 82 | 251c 86 83 | 2524 87 84 | 252c 88 85 | 2534 89 86 | 253c 8a 87 | 2550 2552 a0 88 | 2553 2561 a4 89 | 2562 256c b4 90 | 2580 8b 91 | 2584 8c 92 | 2588 8d 93 | 258c 8e 94 | 2590 2593 8f 95 | 25a0 94 96 | fb00 6666 97 | fb01 6669 98 | fb02 666c 99 | fb03 666669 100 | fb04 66666c 101 | -------------------------------------------------------------------------------- /tool-runners/xpdffonts/xpdf/greek/Greek.nameToUnicode: -------------------------------------------------------------------------------- 1 | 0396 Dzeta 2 | 039e Ksi 3 | 039f Omikron 4 | 03a7 Khi 5 | 03b2 betatwo 6 | 03b6 dzeta 7 | 03be ksi 8 | 03bf omikron 9 | 03c3 sigmafinal 10 | 03c6 phitwo 11 | 03c7 khi 12 | -------------------------------------------------------------------------------- /tool-runners/xpdffonts/xpdf/greek/ISO-8859-7.unicodeMap: -------------------------------------------------------------------------------- 1 | 000a 0a 2 | 000c 000d 0c 3 | 0020 007e 20 4 | 00a0 a0 5 | 00a3 
a3 6 | 00a6 00a9 a6 7 | 00ab 00ad ab 8 | 00b0 00b4 b0 9 | 00b5 ec 10 | 00b7 b7 11 | 00bb bb 12 | 00bd bd 13 | 02c6 5e 14 | 02da b0 15 | 02dc 7e 16 | 0374 b4 17 | 037e 3b 18 | 0384 038a b4 19 | 038c bc 20 | 038e 03a1 be 21 | 03a3 03ce d3 22 | 03d0 e2 23 | 03d1 e8 24 | 03d2 d5 25 | 03d3 be 26 | 03d4 db 27 | 03d5 f6 28 | 03d6 f0 29 | 03d7 eae1e9 30 | 03da d3d4 31 | 03db f3f4 32 | 03f0 ea 33 | 03f1 f1 34 | 03f2 63 35 | 03f3 6a 36 | 03f4 c8 37 | 03f5 e5 38 | 2013 ad 39 | 2014 af 40 | 2018 60 41 | 2019 a2 42 | 201a 2c 43 | 201b a1 44 | 201c 22 45 | 201d 22 46 | 201e 2c2c 47 | 2022 b7 48 | 2026 2e2e2e 49 | 2039 3c 50 | 203a 3e 51 | 2044 2f 52 | 20ac c5f5f1fe 53 | 20af c4f1f7 54 | 2122 544d 55 | 2126 d9 56 | 2206 c4 57 | 2212 2d 58 | 2219 b7 59 | fb00 6666 60 | fb01 6669 61 | fb02 666c 62 | fb03 666669 63 | fb04 66666c 64 | -------------------------------------------------------------------------------- /tool-runners/xpdffonts/xpdf/hebrew/ISO-8859-8.unicodeMap: -------------------------------------------------------------------------------- 1 | 000a 000a 0a 2 | 000c 000d 0c 3 | 0020 007e 20 4 | 00a0 00a0 20 5 | 00a2 00a9 a2 6 | 00ab 00b9 ab 7 | 00bb 00be bb 8 | 010c 43 9 | 010d 63 10 | 0131 69 11 | 0141 4c 12 | 0142 6c 13 | 0152 4f45 14 | 0153 6f65 15 | 0160 53 16 | 0161 73 17 | 0178 59 18 | 017d 5a 19 | 017e 7a 20 | 02c6 5e 21 | 02da b0 22 | 02dc 7e 23 | 05d0 05ea e0 24 | 05f0 e5e5 25 | 05f1 e5e9 26 | 05f2 e9e9 27 | 2013 ad 28 | 2014 2d2d 29 | 2018 60 30 | 2019 27 31 | 201a 2c 32 | 201c 22 33 | 201d 22 34 | 201e 2c2c 35 | 2022 b7 36 | 2026 2e2e2e 37 | 2039 3c 38 | 203a 3e 39 | 2044 2f 40 | 2122 544d 41 | 2212 2d 42 | f6f9 4c 43 | f6fa 4f45 44 | f6fc b0 45 | f6fd 53 46 | f6fe 7e 47 | f6ff 5a 48 | f721 21 49 | f724 24 50 | f726 26 51 | f730 f739 30 52 | f73f 3f 53 | f761 f77a 41 54 | f7a1 f7a2 a1 55 | f7bf bf 56 | f7e0 f7f6 c0 57 | f7f8 f7fe d8 58 | f7ff 59 59 | fb00 6666 60 | fb01 6669 61 | fb02 666c 62 | fb03 666669 63 | fb04 66666c 64 | 
-------------------------------------------------------------------------------- /tool-runners/xpdffonts/xpdf/hebrew/Windows-1255.unicodeMap: -------------------------------------------------------------------------------- 1 | 000a 000a 0a 2 | 000c 000d 0c 3 | 0020 007e 20 4 | 00a0 00a3 a0 5 | 00a5 00a9 a5 6 | 00ab 00b9 ab 7 | 00bb 00bf bb 8 | 00d7 aa 9 | 00f7 ba 10 | 010c 43 11 | 010d 63 12 | 0131 69 13 | 0141 4c 14 | 0142 6c 15 | 0152 4f45 16 | 0153 6f65 17 | 0160 53 18 | 0161 73 19 | 0178 59 20 | 017d 5a 21 | 017e 7a 22 | 0192 83 23 | 02c6 88 24 | 02da b0 25 | 02dc 98 26 | 05b0 05b9 c0 27 | 05bb 05c3 cb 28 | 05f0 05f4 d4 29 | 05d0 05ea e0 30 | 200e 200f fd 31 | 2013 2014 96 32 | 2018 2019 91 33 | 201a 82 34 | 201c 201d 93 35 | 201e 84 36 | 2020 86 37 | 2021 87 38 | 2022 95 39 | 2026 85 40 | 2030 89 41 | 2039 8b 42 | 203a 9b 43 | 2044 2f 44 | 20aa a4 45 | 20ac 80 46 | 2122 99 47 | 2212 2d 48 | f6f9 4c 49 | f6fa 4f45 50 | f6fc b0 51 | f6fd 53 52 | f6fe 7e 53 | f6ff 5a 54 | f721 21 55 | f724 24 56 | f726 26 57 | f730 f739 30 58 | f73f 3f 59 | f761 f77a 41 60 | f7a1 f7a2 a1 61 | f7bf bf 62 | fb00 6666 63 | fb01 6669 64 | fb02 666c 65 | fb03 666669 66 | fb04 66666c 67 | -------------------------------------------------------------------------------- /tool-runners/xpdffonts/xpdf/japanese/CMap/90pv-RKSJ-UCS2: -------------------------------------------------------------------------------- 1 | %!PS-Adobe-3.0 Resource-CMap %%DocumentNeededResources: ProcSet (CIDInit) %%DocumentNeededResources: CMap (90pv-RKSJ-UCS2C) %%IncludeResource: ProcSet (CIDInit) %%IncludeResource: CMap (90pv-RKSJ-UCS2C) %%BeginResource: CMap (90pv-RKSJ-UCS2) %%Title: (90pv-RKSJ-UCS2) %%Version: 4.002 %%Copyright: ----------------------------------------------------------- %%Copyright: Copyright 1990-1997 Adobe Systems Incorporated. %%Copyright: All Rights Reserved. 
%%Copyright: %%Copyright: Patents Pending %%Copyright: %%Copyright: NOTICE: All information contained herein is the property %%Copyright: of Adobe Systems Incorporated. %%Copyright: %%Copyright: Permission is granted for redistribution of this file %%Copyright: provided this copyright notice is maintained intact and %%Copyright: that the contents of this file are not altered in any %%Copyright: way from its original form. %%Copyright: %%Copyright: PostScript and Display PostScript are trademarks of %%Copyright: Adobe Systems Incorporated which may be registered in %%Copyright: certain jurisdictions. %%Copyright: ----------------------------------------------------------- %%EndComments /CIDInit /ProcSet findresource begin 12 dict begin begincmap /90pv-RKSJ-UCS2C usecmap /CIDSystemInfo 3 dict dup begin /Registry (Adobe) def /Ordering (90pv_RKSJ_UCS2) def /Supplement 2 def end def /CMapName /90pv-RKSJ-UCS2 def /CMapVersion 4.002 def /CMapType 1 def /WMode 0 def 18 beginbfrange <8591> <8591> <85ab> <85ab> <85ac> <85ac> <85ad> <85ad> <85bf> <85bf> <85c0> <85c0> <85c1> <85c1> <865d> <865d> <869e> <869e> <86d4> <86d4> <21e6f87a> <86d5> <86d5> <21e7f87a> <86d6> <86d6> <21e9f87a> <86ce> <86ce> <8791> <8791> <592720dd> <8792> <8792> <5c0f20dd> <879d> <879d> <63a720dd> <87fb> <87fb> <87fc> <87fc> endbfrange endcmap CMapName currentdict /CMap defineresource pop end end %%EndResource %%EOF -------------------------------------------------------------------------------- /tool-runners/xpdffonts/xpdf/japanese/CMap/LICENSE.md: -------------------------------------------------------------------------------- 1 | Copyright 1990-2019 Adobe. All rights reserved. 2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions are 5 | met: 6 | 7 | Redistributions of source code must retain the above copyright notice, 8 | this list of conditions and the following disclaimer. 
9 | 10 | Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 13 | 14 | Neither the name of Adobe nor the names of its contributors may be 15 | used to endorse or promote products derived from this software without 16 | specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | -------------------------------------------------------------------------------- /tool-runners/xpdffonts/xpdf/korean/CMap/LICENSE.md: -------------------------------------------------------------------------------- 1 | Copyright 1990-2019 Adobe. All rights reserved. 2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions are 5 | met: 6 | 7 | Redistributions of source code must retain the above copyright notice, 8 | this list of conditions and the following disclaimer. 
9 | 10 | Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 13 | 14 | Neither the name of Adobe nor the names of its contributors may be 15 | used to endorse or promote products derived from this software without 16 | specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
29 | -------------------------------------------------------------------------------- /tool-runners/xpdffonts/xpdf/latin2/Latin2.unicodeMap: -------------------------------------------------------------------------------- 1 | 000a 000a 0a 2 | 000c 000d 0c 3 | 0020 007e 20 4 | 00a0 00a0 20 5 | 00a4 a4 6 | 00a7 00a8 a7 7 | 00ad ad 8 | 00b0 b0 9 | 00b4 b4 10 | 00b8 b8 11 | 00c1 00c2 c1 12 | 00c4 c4 13 | 00c7 c7 14 | 00c9 c9 15 | 00cb cb 16 | 00cd 00ce cd 17 | 00d3 00d4 d3 18 | 00d6 00d7 d6 19 | 00da da 20 | 00dc 00dd dc 21 | 00df df 22 | 00e1 00e2 e1 23 | 00e4 e4 24 | 00e7 e7 25 | 00e9 e9 26 | 00eb eb 27 | 00ed 00ee ed 28 | 00f3 00f4 f3 29 | 00f6 00f7 f6 30 | 00fa fa 31 | 00fc 00fd fc 32 | 0102 c3 33 | 0103 e3 34 | 0104 a1 35 | 0105 b1 36 | 0106 c6 37 | 0107 e6 38 | 010c c8 39 | 010d e8 40 | 010e cf 41 | 010f ef 42 | 0110 d0 43 | 0111 f0 44 | 0118 ca 45 | 0119 ea 46 | 011a cc 47 | 011b ec 48 | 0131 69 49 | 0139 c5 50 | 013a e5 51 | 013d a5 52 | 013e b5 53 | 0141 a3 54 | 0142 b3 55 | 0143 d1 56 | 0144 f1 57 | 0147 d2 58 | 0148 f2 59 | 0150 d5 60 | 0151 f5 61 | 0152 4f45 62 | 0153 6f65 63 | 0154 c0 64 | 0155 e0 65 | 0158 d8 66 | 0159 f8 67 | 015a a6 68 | 015b b6 69 | 015e aa 70 | 015f ba 71 | 0160 a9 72 | 0161 b9 73 | 0162 de 74 | 0163 fe 75 | 0164 ab 76 | 0165 bb 77 | 016e d9 78 | 016f f9 79 | 0170 db 80 | 0171 fb 81 | 0178 59 82 | 0179 ac 83 | 017a bc 84 | 017b af 85 | 017c bf 86 | 017d ae 87 | 017e be 88 | 02c6 5e 89 | 02c7 b7 90 | 02d8 a2 91 | 02d9 ff 92 | 02da b0 93 | 02db b2 94 | 02dc 7e 95 | 02dd bd 96 | 2013 2013 ad 97 | 2014 2014 2d2d 98 | 2018 2018 60 99 | 2019 2019 27 100 | 201a 201a 2c 101 | 201c 201c 22 102 | 201d 201d 22 103 | 201e 201e 2c2c 104 | 2022 2022 b7 105 | 2026 2026 2e2e2e 106 | 2039 2039 3c 107 | 203a 203a 3e 108 | 2044 2044 2f 109 | 2122 2122 544d 110 | 2212 2212 2d 111 | f6f9 f6f9 4c 112 | f6fa f6fa 4f45 113 | f6fc f6fc b0 114 | f6fd f6fd 53 115 | f6fe f6fe 7e 116 | f6ff f6ff 5a 117 | f721 f721 21 118 | f724 f724 24 119 | f726 f726 26 120 | 
f730 f739 30 121 | f73f f73f 3f 122 | f761 f77a 41 123 | f7a1 f7a2 a1 124 | f7bf f7bf bf 125 | f7e0 f7f6 c0 126 | f7f8 f7fe d8 127 | f7ff f7ff 59 128 | fb00 fb00 6666 129 | fb01 fb01 6669 130 | fb02 fb02 666c 131 | fb03 fb03 666669 132 | fb04 fb04 66666c 133 | -------------------------------------------------------------------------------- /tool-runners/xpdffonts/xpdf/thai/TIS-620.unicodeMap: -------------------------------------------------------------------------------- 1 | 000a 0a 2 | 000c 000d 0c 3 | 0020 007e 20 4 | 00a0 20 5 | 0131 69 6 | 0141 4c 7 | 0142 6c 8 | 0152 4f45 9 | 0153 6f65 10 | 0160 53 11 | 0161 73 12 | 0178 59 13 | 017d 5a 14 | 017e 7a 15 | 02c6 5e 16 | 02dc 7e 17 | 0e01 0e3a a1 18 | 0e3f 0e5b df 19 | 2013 2d2d 20 | 2014 2d2d 21 | 2018 60 22 | 2019 27 23 | 201a 2c 24 | 201c 22 25 | 201d 22 26 | 201e 2c2c 27 | 2022 2a 28 | 2026 2e2e2e 29 | 2039 3c 30 | 203a 3e 31 | 2044 2f 32 | 2122 544d 33 | 2212 2d 34 | f700 b0 35 | f701 f704 d4 36 | f705 f709 e8 37 | f70a f70e e8 38 | f70f ad 39 | f710 d1 40 | f711 ed 41 | f712 f717 e7 42 | f718 f71a d8 43 | fb00 6666 44 | fb01 6669 45 | fb02 666c 46 | fb03 666669 47 | fb04 66666c 48 | -------------------------------------------------------------------------------- /tool-runners/xpdffonts/xpdf/turkish/ISO-8859-9.unicodeMap: -------------------------------------------------------------------------------- 1 | 000a 0a 2 | 000c 000d 0c 3 | 0020 007e 20 4 | 00a0 20 5 | 00a1 00ac a1 6 | 00ae 00cf ae 7 | 00d1 00dc d1 8 | 00df 00ef df 9 | 00f1 00fc f1 10 | 00ff ff 11 | 010c 43 12 | 010d 63 13 | 011e d0 14 | 011f f0 15 | 0130 dd 16 | 0131 fd 17 | 0141 4c 18 | 0142 6c 19 | 0152 4f45 20 | 0153 6f65 21 | 015e de 22 | 015f fe 23 | 0160 53 24 | 0161 73 25 | 0178 59 26 | 017d 5a 27 | 017e 7a 28 | 02c6 5e 29 | 02da b0 30 | 02dc 7e 31 | 2013 ad 32 | 2014 2d2d 33 | 2018 60 34 | 2019 27 35 | 201a 2c 36 | 201c 22 37 | 201d 22 38 | 201e 2c2c 39 | 2022 b7 40 | 2026 2e2e2e 41 | 2039 3c 42 | 203a 3e 43 | 2044 2f 44 | 2122 544d 45 | 
2212 2d 46 | f6f9 4c 47 | f6fa 4f45 48 | f6fc b0 49 | f6fd 53 50 | f6fe 7e 51 | f6ff 5a 52 | f721 21 53 | f724 24 54 | f726 26 55 | f730 f739 30 56 | f73f 3f 57 | f761 f77a 41 58 | f7a1 f7a2 a1 59 | f7bf bf 60 | f7e0 f7f6 c0 61 | f7f8 f7fe d8 62 | f7ff 59 63 | fb00 6666 64 | fb01 6669 65 | fb02 666c 66 | fb03 666669 67 | fb04 66666c 68 | -------------------------------------------------------------------------------- /utils-general/src/main/java/org/tallison/db/ExtractsToDB.java: --------------------------------------------------------------------------------
package org.tallison.db;

import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;

import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.pipes.pipesiterator.PipesIterator;

/**
 * Command-line entry point that builds a Tika {@link PipesIterator} from a
 * tika-config file.
 *
 * <p>The iterator is only constructed, never consumed: as written this class is
 * a stub that does no more than build the iterator from the given file.
 */
public class ExtractsToDB {

    /**
     * Builds the pipes iterator described by the given tika-config file.
     *
     * @param args {@code args[0]} is the path to the tika-config file
     * @throws IllegalArgumentException if no config file is given
     * @throws Exception if {@code PipesIterator.build} fails
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 1) {
            throw new IllegalArgumentException("usage: ExtractsToDB <tika-config-file>");
        }
        Path tikaConfigFile = Paths.get(args[0]);

        // Built but not iterated yet -- see the class comment.
        PipesIterator it = PipesIterator.build(tikaConfigFile);

    }
}
-------------------------------------------------------------------------------- /utils-general/src/main/java/org/tallison/db/FetchFilesFromDBPaths.java: --------------------------------------------------------------------------------
package org.tallison.db;

import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.pipes.FetchEmitTuple;
import org.apache.tika.pipes.fetcher.Fetcher;
import org.apache.tika.pipes.fetcher.FetcherManager;
import org.apache.tika.pipes.pipesiterator.PipesIterator;

/**
 * Copies every file yielded by a Tika pipes iterator into a local directory
 * tree laid out as {@code <outputRoot>/<clamav_detect value>/<hex digest>}.
 *
 * <p>The digest is the first run of ten or more lowercase hex characters found
 * in the fetch key (presumably a sha256 -- the code only checks the character
 * class and minimum length).
 */
public class FetchFilesFromDBPaths {

    private static final String DEFAULT_TIKA_CONFIG = "/Users/allison/Desktop/tika-config.xml";
    private static final String DEFAULT_OUTPUT_ROOT = "/Users/allison/Desktop/clam-pdfs";

    /** Compiled once; the original recompiled this for every tuple. */
    private static final Pattern HEX_DIGEST = Pattern.compile("([0-9a-f]{10,})");

    /**
     * Fetches each tuple's bytes through the {@code s3f} fetcher and stores them
     * under the output root. Targets that already exist as regular files are
     * skipped, as are tuples that have no {@code clamav_detect} value or no hex
     * digest in their fetch key.
     *
     * @param args optional: {@code args[0]} overrides the tika-config path,
     *             {@code args[1]} overrides the output root; with no arguments
     *             the original hard-coded locations are used
     * @throws Exception if the config cannot be loaded or a fetch/copy fails
     */
    public static void main(String[] args) throws Exception {
        Path tikaConfigFile = Paths.get(args.length > 0 ? args[0] : DEFAULT_TIKA_CONFIG);
        Path outputRoot = Paths.get(args.length > 1 ? args[1] : DEFAULT_OUTPUT_ROOT);
        PipesIterator pipesIterator = PipesIterator.build(tikaConfigFile);
        Fetcher fetcher = FetcherManager.load(tikaConfigFile).getFetcher("s3f");

        for (FetchEmitTuple t : pipesIterator) {
            String fetchKey = t.getFetchKey().getFetchKey();
            String clamav = t.getMetadata().get("clamav_detect");
            if (clamav == null) {
                // resolve(null) would throw a NullPointerException and end the whole run
                System.err.println("no clamav_detect value, skipping: " + fetchKey);
                continue;
            }
            Matcher m = HEX_DIGEST.matcher(fetchKey);
            if (!m.find()) {
                // An empty name would resolve to the category directory itself,
                // which must never be the target of a REPLACE_EXISTING copy.
                System.err.println("no hex digest in fetch key, skipping: " + fetchKey);
                continue;
            }
            Path targ = outputRoot.resolve(clamav).resolve(m.group(1));
            if (Files.isRegularFile(targ)) {
                continue;
            }
            Files.createDirectories(targ.getParent());
            try (InputStream is = fetcher.fetch(fetchKey, new Metadata())) {
                Files.copy(is, targ, StandardCopyOption.REPLACE_EXISTING);
            }
        }
    }
}
-------------------------------------------------------------------------------- /utils-general/src/main/java/org/tallison/digest/CSVLineCounter.java: --------------------------------------------------------------------------------
package org.tallison.digest;

import java.nio.charset.StandardCharsets;
import java.nio.file.Path;
import java.nio.file.Paths;

import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;

/**
 * Counts the records (not the physical lines) of a CSV file parsed as UTF-8
 * with {@link CSVFormat#EXCEL}, and prints the count to stdout.
 */
public class CSVLineCounter {

    private static final String DEFAULT_CSV = "/Users/allison/Desktop/size-pages-full.csv";

    /**
     * @param args optional: {@code args[0]} overrides the CSV path; with no
     *             arguments the original hard-coded file is used
     * @throws Exception if the file cannot be opened or parsed
     */
    public static void main(String[] args) throws Exception {
        Path path = Paths.get(args.length > 0 ? args[0] : DEFAULT_CSV);
        long c = 0;
        // The parser owns the underlying reader, so it has to be closed.
        try (CSVParser parser = CSVParser.parse(path, StandardCharsets.UTF_8, CSVFormat.EXCEL)) {
            for (CSVRecord r : parser) {
                c++;
            }
        }
        System.out.println(c);

    }
}
-------------------------------------------------------------------------------- /utils-general/src/main/java/org/tallison/digest/DigestChecker.java:
--------------------------------------------------------------------------------
package org.tallison.digest;

import java.io.BufferedWriter;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.concurrent.atomic.AtomicInteger;

import org.apache.commons.codec.digest.DigestUtils;

/**
 * Walks a directory tree and reports every file whose name differs from the
 * SHA-256 hex digest of its content.
 *
 * <p>Each mismatch is written to the report as {@code <file name>\t<digest>}.
 */
public class DigestChecker {

    /** Number of files processed so far; drives the progress message. */
    AtomicInteger totalChecked = new AtomicInteger(0);

    /**
     * @param args {@code args[0]} root directory to check, {@code args[1]} report file to write
     * @throws IllegalArgumentException if fewer than two arguments are given
     * @throws Exception if the report cannot be opened or written
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            throw new IllegalArgumentException("usage: DigestChecker <root-dir> <report-file>");
        }
        Path dir = Paths.get(args[0]);
        try (BufferedWriter writer =
                     Files.newBufferedWriter(Paths.get(args[1]), StandardCharsets.UTF_8)) {
            DigestChecker digestChecker = new DigestChecker();
            digestChecker.execute(dir, writer);
        }
    }

    /**
     * Checks the whole tree and announces completion on stderr.
     *
     * @throws IOException if a report row cannot be written, so that a truncated
     *                     report is never followed by the success message
     */
    private void execute(Path rootDir, BufferedWriter writer) throws IOException {
        processDir(rootDir, writer);
        System.err.println("completed successfully");
    }

    /**
     * Recurses through {@code path}: regular files are checked, everything else is
     * treated as a directory.
     */
    private void processDir(Path path, BufferedWriter writer) throws IOException {
        File[] children = path.toFile().listFiles();
        if (children == null) {
            // listFiles() returns null for non-directories and unreadable directories.
            System.err.println("cannot list, skipping: " + path);
            return;
        }
        for (File f : children) {
            if (f.isFile()) {
                processFile(f, writer);
            } else {
                processDir(f.toPath(), writer);
            }
        }
    }

    /**
     * Digests one file and writes a report row when the name and digest differ.
     *
     * <p>A file that cannot be read is reported on stderr and still gets a row, with
     * the literal text {@code null} in the digest column, because it could not be verified.
     */
    private void processFile(File f, BufferedWriter writer) throws IOException {
        String name = f.getName();
        String digest = null;
        try (InputStream is = Files.newInputStream(f.toPath())) {
            digest = DigestUtils.sha256Hex(is);
        } catch (IOException e) {
            System.err.println("could not digest " + f + ": " + e);
        }
        if (!name.equals(digest)) {
            writer.write(name + "\t" + digest + "\n");
        }
        int checked = totalChecked.incrementAndGet();
        if (checked % 1000 == 0) {
            System.err.println(checked + " files processed");
        }
    }
}
--------------------------------------------------------------------------------
/utils-general/src/main/java/org/tallison/digest/FileListNormalizer.java:
--------------------------------------------------------------------------------
package org.tallison.digest;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Reduces file listings to bare keys of the form {@code xx/yy/<hex>}.
 *
 * <p>For every {@code *.txt} file in the directory a sibling {@code *-normed.txt}
 * is written that contains, per input line, the first match of the key pattern.
 * Lines without a match are reported on stderr.
 */
public class FileListNormalizer {

    /** Two hex chars, slash, two hex chars, slash, one or more hex chars. */
    private static final Pattern KEY_PATTERN =
            Pattern.compile("([a-f0-9]{2,2}/[a-f0-9]{2,2}/[a-f0-9]+)");

    private static final String INPUT_SUFFIX = ".txt";
    private static final String NORMED_SUFFIX = "-normed.txt";

    /**
     * @param args optional: {@code args[0]} is the directory to process; defaults to
     *             the placeholder {@code "PATH"}
     * @throws IllegalArgumentException if the directory cannot be listed
     * @throws Exception on read or write failure
     */
    public static void main(String[] args) throws Exception {
        Path dir = Paths.get(args.length > 0 ? args[0] : "PATH");
        File[] files = dir.toFile().listFiles();
        if (files == null) {
            throw new IllegalArgumentException(
                    "not a readable directory: " + dir.toAbsolutePath());
        }
        for (File f : files) {
            String name = f.getName();
            // Only plain *.txt inputs are safe: for any other name the derived output
            // path would equal the input path, and opening the writer would truncate
            // the very file that is about to be read.
            if (!f.isFile() || !name.endsWith(INPUT_SUFFIX) || name.endsWith(NORMED_SUFFIX)) {
                continue;
            }
            String base = name.substring(0, name.length() - INPUT_SUFFIX.length());
            normalize(f.toPath(), dir.resolve(base + NORMED_SUFFIX));
        }
    }

    /**
     * Copies the first key found on each line of {@code input} to {@code output},
     * one key per line, echoing each key to stdout.
     */
    private static void normalize(Path input, Path output) throws IOException {
        // Open the reader first so an unreadable input does not leave an empty output behind.
        try (BufferedReader r = Files.newBufferedReader(input, StandardCharsets.UTF_8);
             BufferedWriter w = Files.newBufferedWriter(output, StandardCharsets.UTF_8)) {
            Matcher m = KEY_PATTERN.matcher("");
            String line;
            while ((line = r.readLine()) != null) {
                m.reset(line);
                if (m.find()) {
                    System.out.println(m.group(1));
                    w.write(m.group(1) + "\n");
                } else {
                    System.err.println("wtf: "+line);
                }
            }
        }
    }
}
--------------------------------------------------------------------------------
/utils-general/src/main/java/org/tallison/digest/S3ListCompare.java:
--------------------------------------------------------------------------------
package org.tallison.digest;

import java.io.BufferedReader;
import java.io.Writer;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.HashSet;
import java.util.Set;

import com.amazonaws.auth.AWSCredentialsProvider;
import com.amazonaws.auth.profile.ProfileCredentialsProvider;
import com.amazonaws.services.s3.AmazonS3;
import com.amazonaws.services.s3.AmazonS3ClientBuilder;
import com.amazonaws.services.s3.iterable.S3Objects;
import com.amazonaws.services.s3.model.S3ObjectSummary;

/**
 * Compares two key listings and prints every key of the first listing that is
 * absent from the second, followed by the number of missing keys.
 */
public class S3ListCompare {

    /**
     * @param args optional: {@code args[0]} listing of expected keys, {@code args[1]}
     *             listing of keys present in S3; when absent, the paths resolved
     *             against the working directory ({@code ""} and {@code "s3-files.txt"})
     *             are used
     * @throws Exception if either listing cannot be read
     */
    public static void main(String[] args) throws Exception {
        Path pwd = Paths.get("");
        Path oneMillion = args.length > 0 ? Paths.get(args[0]) : pwd.resolve("");
        Path s3 = args.length > 1 ? Paths.get(args[1]) : pwd.resolve("s3-files.txt");
        Set<String> eval = load(oneMillion);
        Set<String> s3list = load(s3);
        System.out.println(eval.size());
        System.out.println(s3list.size());
        int missing = 0;
        for (String k : eval) {
            if (!s3list.contains(k)) {
                System.out.println("file missing in s3: "+ k);
                missing++;
            }
        }

        System.out.println("missing: " + missing);
    }

    /**
     * Reads a listing and returns the first whitespace-delimited token of every
     * non-blank line.
     *
     * @param p listing file
     * @return the set of keys; never {@code null}
     * @throws Exception if the file cannot be read
     */
    private static Set<String> load(Path p) throws Exception {
        Set<String> keys = new HashSet<>();
        try (BufferedReader r = Files.newBufferedReader(p)) {
            String line;
            while ((line = r.readLine()) != null) {
                // Trim before splitting: with leading whitespace, split() would yield an
                // empty first token and the real key would be lost.
                String trimmed = line.trim();
                if (trimmed.isEmpty()) {
                    continue;
                }
                // NOTE(review): any key-prefix stripping belongs here; none is applied.
                keys.add(trimmed.split("\\s+")[0]);
            }
        }
        return keys;
    }
}
--------------------------------------------------------------------------------
/utils-general/src/main/java/org/tallison/filter/CopyByMime.java:
--------------------------------------------------------------------------------
package org.tallison.filter;

import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;

import org.apache.tika.Tika;

/**
 * Recursively copies every file whose Tika-detected media type contains a given
 * substring from a source tree into a target tree, preserving relative paths.
 */
public class CopyByMime {

    /** Media-type substring matched when no third argument is given. */
    private static final String DEFAULT_MIME_PART = "nitf";

    /**
     * @param args {@code args[0]} source root, {@code args[1]} target root, optional
     *             {@code args[2]} media-type substring (default {@code "nitf"})
     * @throws IllegalArgumentException if fewer than two arguments are given
     */
    public static void main(String[] args) {
        if (args.length < 2) {
            throw new IllegalArgumentException(
                    "usage: CopyByMime <src-dir> <target-dir> [mime-substring]");
        }
        Path src = Paths.get(args[0]);
        Path target = Paths.get(args[1]);
        String mimePart = args.length > 2 ? args[2] : DEFAULT_MIME_PART;
        Tika tika = new Tika();
        processDirectory(mimePart, src, src, target, tika);

    }

    /**
     * Visits every entry below {@code path}, descending into directories and handing
     * everything else to {@link #processFile}.
     */
    private static void processDirectory(String mimePart, Path root, Path path, Path targetRoot,
                                         Tika tika) {
        File[] children = path.toFile().listFiles();
        if (children == null) {
            // listFiles() returns null for non-directories and unreadable directories.
            System.err.println("cannot list, skipping: " + path);
            return;
        }
        for (File f : children) {
            if (f.isDirectory()) {
                processDirectory(mimePart, root, f.toPath(), targetRoot, tika);
            } else {
                processFile(mimePart, root, f.toPath(), targetRoot, tika);
            }
        }
    }

    /**
     * Copies {@code path} to the same relative location under {@code targetRoot} when
     * its detected type contains {@code mimePart}. I/O failures (including an already
     * existing target) are reported on stderr and do not stop the run.
     */
    private static void processFile(String mimePart, Path root, Path path, Path targetRoot,
                                    Tika tika) {

        try {
            String type = tika.detect(path);
            if (!type.contains(mimePart)) {
                return;
            }
            Path rel = root.relativize(path);
            Path target = targetRoot.resolve(rel);
            System.out.println(type + " : " + path);
            System.out.println(path + "-> " + target);
            // getParent() is null for a single-element relative target.
            Path parent = target.getParent();
            if (parent != null) {
                Files.createDirectories(parent);
            }
            Files.copy(path, target);
        } catch (IOException e) {
            System.err.println("failed to process " + path + ": " + e);
        }

    }

}
--------------------------------------------------------------------------------
/utils-general/src/test/java/org/tallison/pdf/utils/TestPDFSplitter.java:
--------------------------------------------------------------------------------
package org.tallison.pdf.utils;


import java.nio.file.Path;
import java.nio.file.Paths;

import org.junit.Ignore;
import org.junit.Test;


/**
 * Manual smoke test that drives {@code PDFSplitter.main} with fixed arguments.
 * It is ignored by default.
 */
public class TestPDFSplitter {

    /**
     * Invokes the splitter's command-line entry point.
     * NOTE(review): the arguments look like input dir, output dir and a numeric
     * setting; confirm against {@code PDFSplitter}.
     */
    @Test
    @Ignore("depends on local directories; run manually")
    public void testSimple() throws Exception {

        PDFSplitter.main(new String[]{
                "/docs",
                "/single-pages",
                "10"});
    }
}
--------------------------------------------------------------------------------