├── .circleci └── config.yml ├── .gitignore ├── .scalafmt.conf ├── LICENSE.txt ├── README.md ├── evaluation ├── .gitignore ├── README.md ├── __init__.py ├── build_evaluation.py ├── build_section_eval.py ├── compare_evaluation.py ├── datasets │ ├── README.md │ ├── __init__.py │ ├── build_dataset_images.py │ ├── conference │ │ ├── annotations.json │ │ └── doc_ids.txt │ ├── datasets.py │ ├── non_standard_documents.txt │ ├── s2 │ │ ├── annotations.json │ │ ├── doc_ids.txt │ │ ├── non_standard_documents.txt │ │ └── pages_annotated.json │ ├── section-annotations.json │ ├── test_datasets.py │ └── visualize_annotations.py ├── download_from_urls.py ├── extractors.py ├── parse_evaluation.py ├── pdffigures_utils.py ├── print_dataset_stats.py ├── section_extractors.py └── time_extractor.py ├── project ├── build.properties └── plugins.sbt └── src ├── main ├── java │ └── org │ │ └── allenai │ │ └── pdffigures2 │ │ └── RegexWithTimeout.java ├── resources │ └── logback.xml └── scala │ └── org │ └── allenai │ └── pdffigures2 │ ├── Box.scala │ ├── CaptionBuilder.scala │ ├── CaptionDetector.scala │ ├── DocumentLayout.scala │ ├── Figure.scala │ ├── FigureDetector.scala │ ├── FigureExtractor.scala │ ├── FigureExtractorBatchCli.scala │ ├── FigureExtractorVisualizationCli.scala │ ├── FigureRenderer.scala │ ├── FindGraphicsRaster.scala │ ├── FormattingTextExtractor.scala │ ├── GraphicBBDetector.scala │ ├── GraphicsExtractor.scala │ ├── InterruptiblePDFRenderer.scala │ ├── JsonProtocol.scala │ ├── Logging.scala │ ├── PageStructure.scala │ ├── Paragraph.scala │ ├── ParagraphRebuilder.scala │ ├── RegionClassifier.scala │ ├── SectionTitleExtractor.scala │ ├── SectionedTextBuilder.scala │ ├── TextExtractor.scala │ └── VisualLogger.scala └── test ├── resources └── test-pdfs │ ├── 3a9202f9f176d3377516e3da0866cc19148c033b.pdf │ ├── 498bb0efad6ec15dd09d941fb309aa18d6df9f5f.pdf │ └── f63cb20759fab2514802c3ef2a743c76bf9dc9f1.pdf └── scala └── org └── allenai └── pdffigures2 └── TestExtractionFilters.scala /.circleci/config.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/pdffigures2/HEAD/.circleci/config.yml -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/pdffigures2/HEAD/.gitignore -------------------------------------------------------------------------------- /.scalafmt.conf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/pdffigures2/HEAD/.scalafmt.conf -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/pdffigures2/HEAD/LICENSE.txt -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/pdffigures2/HEAD/README.md -------------------------------------------------------------------------------- /evaluation/.gitignore: -------------------------------------------------------------------------------- 1 | grobid* 2 | *.pyc 3 | -------------------------------------------------------------------------------- /evaluation/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/pdffigures2/HEAD/evaluation/README.md -------------------------------------------------------------------------------- /evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /evaluation/build_evaluation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/pdffigures2/HEAD/evaluation/build_evaluation.py -------------------------------------------------------------------------------- /evaluation/build_section_eval.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/pdffigures2/HEAD/evaluation/build_section_eval.py -------------------------------------------------------------------------------- /evaluation/compare_evaluation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/pdffigures2/HEAD/evaluation/compare_evaluation.py -------------------------------------------------------------------------------- /evaluation/datasets/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/pdffigures2/HEAD/evaluation/datasets/README.md -------------------------------------------------------------------------------- /evaluation/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /evaluation/datasets/build_dataset_images.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/pdffigures2/HEAD/evaluation/datasets/build_dataset_images.py -------------------------------------------------------------------------------- /evaluation/datasets/conference/annotations.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/pdffigures2/HEAD/evaluation/datasets/conference/annotations.json -------------------------------------------------------------------------------- /evaluation/datasets/conference/doc_ids.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/pdffigures2/HEAD/evaluation/datasets/conference/doc_ids.txt -------------------------------------------------------------------------------- /evaluation/datasets/datasets.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/pdffigures2/HEAD/evaluation/datasets/datasets.py -------------------------------------------------------------------------------- /evaluation/datasets/non_standard_documents.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /evaluation/datasets/s2/annotations.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/pdffigures2/HEAD/evaluation/datasets/s2/annotations.json -------------------------------------------------------------------------------- /evaluation/datasets/s2/doc_ids.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/pdffigures2/HEAD/evaluation/datasets/s2/doc_ids.txt -------------------------------------------------------------------------------- /evaluation/datasets/s2/non_standard_documents.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/pdffigures2/HEAD/evaluation/datasets/s2/non_standard_documents.txt -------------------------------------------------------------------------------- /evaluation/datasets/s2/pages_annotated.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/pdffigures2/HEAD/evaluation/datasets/s2/pages_annotated.json -------------------------------------------------------------------------------- /evaluation/datasets/section-annotations.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/pdffigures2/HEAD/evaluation/datasets/section-annotations.json -------------------------------------------------------------------------------- /evaluation/datasets/test_datasets.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/pdffigures2/HEAD/evaluation/datasets/test_datasets.py -------------------------------------------------------------------------------- /evaluation/datasets/visualize_annotations.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/pdffigures2/HEAD/evaluation/datasets/visualize_annotations.py -------------------------------------------------------------------------------- /evaluation/download_from_urls.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/pdffigures2/HEAD/evaluation/download_from_urls.py -------------------------------------------------------------------------------- /evaluation/extractors.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/pdffigures2/HEAD/evaluation/extractors.py -------------------------------------------------------------------------------- /evaluation/parse_evaluation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/pdffigures2/HEAD/evaluation/parse_evaluation.py -------------------------------------------------------------------------------- /evaluation/pdffigures_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/pdffigures2/HEAD/evaluation/pdffigures_utils.py -------------------------------------------------------------------------------- /evaluation/print_dataset_stats.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/pdffigures2/HEAD/evaluation/print_dataset_stats.py -------------------------------------------------------------------------------- /evaluation/section_extractors.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/pdffigures2/HEAD/evaluation/section_extractors.py -------------------------------------------------------------------------------- /evaluation/time_extractor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/pdffigures2/HEAD/evaluation/time_extractor.py -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=1.7.1 2 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/pdffigures2/HEAD/project/plugins.sbt -------------------------------------------------------------------------------- /src/main/java/org/allenai/pdffigures2/RegexWithTimeout.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/pdffigures2/HEAD/src/main/java/org/allenai/pdffigures2/RegexWithTimeout.java -------------------------------------------------------------------------------- /src/main/resources/logback.xml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/pdffigures2/HEAD/src/main/resources/logback.xml -------------------------------------------------------------------------------- /src/main/scala/org/allenai/pdffigures2/Box.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/pdffigures2/HEAD/src/main/scala/org/allenai/pdffigures2/Box.scala -------------------------------------------------------------------------------- /src/main/scala/org/allenai/pdffigures2/CaptionBuilder.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/pdffigures2/HEAD/src/main/scala/org/allenai/pdffigures2/CaptionBuilder.scala -------------------------------------------------------------------------------- /src/main/scala/org/allenai/pdffigures2/CaptionDetector.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/pdffigures2/HEAD/src/main/scala/org/allenai/pdffigures2/CaptionDetector.scala -------------------------------------------------------------------------------- /src/main/scala/org/allenai/pdffigures2/DocumentLayout.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/pdffigures2/HEAD/src/main/scala/org/allenai/pdffigures2/DocumentLayout.scala -------------------------------------------------------------------------------- /src/main/scala/org/allenai/pdffigures2/Figure.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/pdffigures2/HEAD/src/main/scala/org/allenai/pdffigures2/Figure.scala -------------------------------------------------------------------------------- /src/main/scala/org/allenai/pdffigures2/FigureDetector.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/pdffigures2/HEAD/src/main/scala/org/allenai/pdffigures2/FigureDetector.scala -------------------------------------------------------------------------------- /src/main/scala/org/allenai/pdffigures2/FigureExtractor.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/pdffigures2/HEAD/src/main/scala/org/allenai/pdffigures2/FigureExtractor.scala -------------------------------------------------------------------------------- /src/main/scala/org/allenai/pdffigures2/FigureExtractorBatchCli.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/pdffigures2/HEAD/src/main/scala/org/allenai/pdffigures2/FigureExtractorBatchCli.scala -------------------------------------------------------------------------------- /src/main/scala/org/allenai/pdffigures2/FigureExtractorVisualizationCli.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/pdffigures2/HEAD/src/main/scala/org/allenai/pdffigures2/FigureExtractorVisualizationCli.scala -------------------------------------------------------------------------------- /src/main/scala/org/allenai/pdffigures2/FigureRenderer.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/pdffigures2/HEAD/src/main/scala/org/allenai/pdffigures2/FigureRenderer.scala -------------------------------------------------------------------------------- /src/main/scala/org/allenai/pdffigures2/FindGraphicsRaster.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/pdffigures2/HEAD/src/main/scala/org/allenai/pdffigures2/FindGraphicsRaster.scala -------------------------------------------------------------------------------- /src/main/scala/org/allenai/pdffigures2/FormattingTextExtractor.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/pdffigures2/HEAD/src/main/scala/org/allenai/pdffigures2/FormattingTextExtractor.scala -------------------------------------------------------------------------------- /src/main/scala/org/allenai/pdffigures2/GraphicBBDetector.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/pdffigures2/HEAD/src/main/scala/org/allenai/pdffigures2/GraphicBBDetector.scala -------------------------------------------------------------------------------- /src/main/scala/org/allenai/pdffigures2/GraphicsExtractor.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/pdffigures2/HEAD/src/main/scala/org/allenai/pdffigures2/GraphicsExtractor.scala -------------------------------------------------------------------------------- /src/main/scala/org/allenai/pdffigures2/InterruptiblePDFRenderer.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/pdffigures2/HEAD/src/main/scala/org/allenai/pdffigures2/InterruptiblePDFRenderer.scala -------------------------------------------------------------------------------- /src/main/scala/org/allenai/pdffigures2/JsonProtocol.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/pdffigures2/HEAD/src/main/scala/org/allenai/pdffigures2/JsonProtocol.scala -------------------------------------------------------------------------------- /src/main/scala/org/allenai/pdffigures2/Logging.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/pdffigures2/HEAD/src/main/scala/org/allenai/pdffigures2/Logging.scala -------------------------------------------------------------------------------- /src/main/scala/org/allenai/pdffigures2/PageStructure.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/pdffigures2/HEAD/src/main/scala/org/allenai/pdffigures2/PageStructure.scala -------------------------------------------------------------------------------- /src/main/scala/org/allenai/pdffigures2/Paragraph.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/pdffigures2/HEAD/src/main/scala/org/allenai/pdffigures2/Paragraph.scala -------------------------------------------------------------------------------- /src/main/scala/org/allenai/pdffigures2/ParagraphRebuilder.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/pdffigures2/HEAD/src/main/scala/org/allenai/pdffigures2/ParagraphRebuilder.scala -------------------------------------------------------------------------------- /src/main/scala/org/allenai/pdffigures2/RegionClassifier.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/pdffigures2/HEAD/src/main/scala/org/allenai/pdffigures2/RegionClassifier.scala -------------------------------------------------------------------------------- /src/main/scala/org/allenai/pdffigures2/SectionTitleExtractor.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/pdffigures2/HEAD/src/main/scala/org/allenai/pdffigures2/SectionTitleExtractor.scala -------------------------------------------------------------------------------- /src/main/scala/org/allenai/pdffigures2/SectionedTextBuilder.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/pdffigures2/HEAD/src/main/scala/org/allenai/pdffigures2/SectionedTextBuilder.scala -------------------------------------------------------------------------------- /src/main/scala/org/allenai/pdffigures2/TextExtractor.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/pdffigures2/HEAD/src/main/scala/org/allenai/pdffigures2/TextExtractor.scala -------------------------------------------------------------------------------- /src/main/scala/org/allenai/pdffigures2/VisualLogger.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/pdffigures2/HEAD/src/main/scala/org/allenai/pdffigures2/VisualLogger.scala -------------------------------------------------------------------------------- /src/test/resources/test-pdfs/3a9202f9f176d3377516e3da0866cc19148c033b.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/pdffigures2/HEAD/src/test/resources/test-pdfs/3a9202f9f176d3377516e3da0866cc19148c033b.pdf -------------------------------------------------------------------------------- /src/test/resources/test-pdfs/498bb0efad6ec15dd09d941fb309aa18d6df9f5f.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/pdffigures2/HEAD/src/test/resources/test-pdfs/498bb0efad6ec15dd09d941fb309aa18d6df9f5f.pdf -------------------------------------------------------------------------------- /src/test/resources/test-pdfs/f63cb20759fab2514802c3ef2a743c76bf9dc9f1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/pdffigures2/HEAD/src/test/resources/test-pdfs/f63cb20759fab2514802c3ef2a743c76bf9dc9f1.pdf -------------------------------------------------------------------------------- /src/test/scala/org/allenai/pdffigures2/TestExtractionFilters.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/pdffigures2/HEAD/src/test/scala/org/allenai/pdffigures2/TestExtractionFilters.scala --------------------------------------------------------------------------------