├── .gitignore ├── LICENSE ├── MANIFEST.in ├── README.md ├── evaluate_indent.py ├── evaluate_numbering.py ├── evaluate_pdfminer.py ├── feature_importance.py ├── pdf_struct ├── __init__.py ├── _version.py ├── cli.py ├── core │ ├── __init__.py │ ├── clustering.py │ ├── data_statistics.py │ ├── document.py │ ├── download.py │ ├── evaluation.py │ ├── export.py │ ├── feature_extractor.py │ ├── predictor.py │ ├── preprocessing.py │ ├── structure_evaluation.py │ ├── transition_labels.py │ └── utils.py ├── export │ ├── __init__.py │ └── hocr.py ├── feature_extractor │ ├── __init__.py │ ├── hocr_balance_sheet_ja.py │ ├── pdf_contract.py │ ├── pdf_contract_ja.py │ └── text_contract.py ├── features │ ├── __init__.py │ ├── lexical.py │ ├── listing │ │ ├── __init__.py │ │ ├── base.py │ │ ├── en.py │ │ └── ja.py │ └── lm.py └── loader │ ├── __init__.py │ ├── hocr.py │ ├── pdf.py │ └── text.py ├── requirements.txt └── setup.py /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/pdf-struct/HEAD/.gitignore -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/pdf-struct/HEAD/LICENSE -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/pdf-struct/HEAD/MANIFEST.in -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/pdf-struct/HEAD/README.md -------------------------------------------------------------------------------- /evaluate_indent.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/pdf-struct/HEAD/evaluate_indent.py -------------------------------------------------------------------------------- /evaluate_numbering.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/pdf-struct/HEAD/evaluate_numbering.py -------------------------------------------------------------------------------- /evaluate_pdfminer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/pdf-struct/HEAD/evaluate_pdfminer.py -------------------------------------------------------------------------------- /feature_importance.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/pdf-struct/HEAD/feature_importance.py -------------------------------------------------------------------------------- /pdf_struct/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/pdf-struct/HEAD/pdf_struct/__init__.py -------------------------------------------------------------------------------- /pdf_struct/_version.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.3.4" 2 | -------------------------------------------------------------------------------- /pdf_struct/cli.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/pdf-struct/HEAD/pdf_struct/cli.py -------------------------------------------------------------------------------- /pdf_struct/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/pdf-struct/HEAD/pdf_struct/core/__init__.py -------------------------------------------------------------------------------- /pdf_struct/core/clustering.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/pdf-struct/HEAD/pdf_struct/core/clustering.py -------------------------------------------------------------------------------- /pdf_struct/core/data_statistics.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/pdf-struct/HEAD/pdf_struct/core/data_statistics.py -------------------------------------------------------------------------------- /pdf_struct/core/document.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/pdf-struct/HEAD/pdf_struct/core/document.py -------------------------------------------------------------------------------- /pdf_struct/core/download.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/pdf-struct/HEAD/pdf_struct/core/download.py -------------------------------------------------------------------------------- /pdf_struct/core/evaluation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/pdf-struct/HEAD/pdf_struct/core/evaluation.py -------------------------------------------------------------------------------- /pdf_struct/core/export.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/pdf-struct/HEAD/pdf_struct/core/export.py -------------------------------------------------------------------------------- /pdf_struct/core/feature_extractor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/pdf-struct/HEAD/pdf_struct/core/feature_extractor.py -------------------------------------------------------------------------------- /pdf_struct/core/predictor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/pdf-struct/HEAD/pdf_struct/core/predictor.py -------------------------------------------------------------------------------- /pdf_struct/core/preprocessing.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/pdf-struct/HEAD/pdf_struct/core/preprocessing.py -------------------------------------------------------------------------------- /pdf_struct/core/structure_evaluation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/pdf-struct/HEAD/pdf_struct/core/structure_evaluation.py -------------------------------------------------------------------------------- /pdf_struct/core/transition_labels.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/pdf-struct/HEAD/pdf_struct/core/transition_labels.py -------------------------------------------------------------------------------- /pdf_struct/core/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/pdf-struct/HEAD/pdf_struct/core/utils.py -------------------------------------------------------------------------------- /pdf_struct/export/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/pdf-struct/HEAD/pdf_struct/export/__init__.py -------------------------------------------------------------------------------- /pdf_struct/export/hocr.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/pdf-struct/HEAD/pdf_struct/export/hocr.py -------------------------------------------------------------------------------- /pdf_struct/feature_extractor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/pdf-struct/HEAD/pdf_struct/feature_extractor/__init__.py -------------------------------------------------------------------------------- /pdf_struct/feature_extractor/hocr_balance_sheet_ja.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/pdf-struct/HEAD/pdf_struct/feature_extractor/hocr_balance_sheet_ja.py -------------------------------------------------------------------------------- /pdf_struct/feature_extractor/pdf_contract.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/pdf-struct/HEAD/pdf_struct/feature_extractor/pdf_contract.py -------------------------------------------------------------------------------- /pdf_struct/feature_extractor/pdf_contract_ja.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/pdf-struct/HEAD/pdf_struct/feature_extractor/pdf_contract_ja.py -------------------------------------------------------------------------------- /pdf_struct/feature_extractor/text_contract.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/pdf-struct/HEAD/pdf_struct/feature_extractor/text_contract.py -------------------------------------------------------------------------------- /pdf_struct/features/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/pdf-struct/HEAD/pdf_struct/features/__init__.py -------------------------------------------------------------------------------- /pdf_struct/features/lexical.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/pdf-struct/HEAD/pdf_struct/features/lexical.py -------------------------------------------------------------------------------- /pdf_struct/features/listing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/pdf-struct/HEAD/pdf_struct/features/listing/__init__.py -------------------------------------------------------------------------------- /pdf_struct/features/listing/base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/pdf-struct/HEAD/pdf_struct/features/listing/base.py -------------------------------------------------------------------------------- /pdf_struct/features/listing/en.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/pdf-struct/HEAD/pdf_struct/features/listing/en.py -------------------------------------------------------------------------------- /pdf_struct/features/listing/ja.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/pdf-struct/HEAD/pdf_struct/features/listing/ja.py -------------------------------------------------------------------------------- /pdf_struct/features/lm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/pdf-struct/HEAD/pdf_struct/features/lm.py -------------------------------------------------------------------------------- /pdf_struct/loader/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/pdf-struct/HEAD/pdf_struct/loader/__init__.py -------------------------------------------------------------------------------- /pdf_struct/loader/hocr.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/pdf-struct/HEAD/pdf_struct/loader/hocr.py -------------------------------------------------------------------------------- /pdf_struct/loader/pdf.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/pdf-struct/HEAD/pdf_struct/loader/pdf.py -------------------------------------------------------------------------------- /pdf_struct/loader/text.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/pdf-struct/HEAD/pdf_struct/loader/text.py -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/pdf-struct/HEAD/requirements.txt -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/pdf-struct/HEAD/setup.py --------------------------------------------------------------------------------