├── data └── paleo │ ├── documents │ ├── 1-s2.0-S0009254102001183-main.pdf │ ├── 1-s2.0-S0012821X12005717-main.pdf │ ├── 1-s2.0-S0016699515000601-main.pdf │ ├── 1-s2.0-S0016703711007290-main.pdf │ ├── 1-s2.0-S0031018210006152-main.pdf │ ├── 1-s2.0-S1464343X10000865-main.pdf │ └── Zapata-Rios_et_al-2015-Water_Resources_Research.pdf │ └── ml │ ├── gt.test │ ├── gt.train │ ├── ml.bboxes │ ├── model.pkl │ ├── old.gt.test │ ├── old.pdf.test │ ├── test.pdf.list.paleo.not.scanned │ ├── test.pdf.list.paleo.not.scanned.bbox │ ├── test.pdf.list.paleo.not.scanned.candidates.pkl │ ├── test.pdf.list.paleo.not.scanned.features.pkl │ ├── test.pdf.list.paleo.not.scanned.labels.pkl │ ├── train.pdf.list.paleo.not.scanned │ ├── train.pdf.list.paleo.not.scanned.candidates.pkl │ ├── train.pdf.list.paleo.not.scanned.features.pkl │ ├── train.pdf.list.paleo.not.scanned.labels.pkl │ └── y_pred.pkl ├── readme.md ├── requirements.txt ├── set_env.sh └── table-extraction ├── LICENSE ├── __init__.py ├── evaluation ├── __init__.py └── char_level_evaluation.py ├── experiment.py ├── img_utils.py ├── ml ├── TableExtractML.py ├── __init__.py ├── extract_tables.py └── features.py ├── pdf ├── __init__.py ├── grid.py ├── layout_utils.py ├── node.py ├── pdf_parsers.py ├── pdf_utils.py ├── render.py └── vector_utils.py ├── pdfminer ├── __init__.py ├── arcfour.py ├── ascii85.py ├── ccitt.py ├── cmapdb.py ├── converter.py ├── encodingdb.py ├── fontmetrics.py ├── glyphlist.py ├── image.py ├── latin_enc.py ├── layout.py ├── lzw.py ├── pdfcolor.py ├── pdfdevice.py ├── pdfdocument.py ├── pdffont.py ├── pdfinterp.py ├── pdfpage.py ├── pdfparser.py ├── pdftypes.py ├── psparser.py ├── rijndael.py ├── runlength.py └── utils.py ├── tutorials └── table-extraction-demo.ipynb └── utils ├── __init__.py ├── bbox_utils.py ├── display_utils.py └── lines_utils.py /data/paleo/documents/1-s2.0-S0009254102001183-main.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/TreeStructure/HEAD/data/paleo/documents/1-s2.0-S0009254102001183-main.pdf -------------------------------------------------------------------------------- /data/paleo/documents/1-s2.0-S0012821X12005717-main.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/TreeStructure/HEAD/data/paleo/documents/1-s2.0-S0012821X12005717-main.pdf -------------------------------------------------------------------------------- /data/paleo/documents/1-s2.0-S0016699515000601-main.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/TreeStructure/HEAD/data/paleo/documents/1-s2.0-S0016699515000601-main.pdf -------------------------------------------------------------------------------- /data/paleo/documents/1-s2.0-S0016703711007290-main.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/TreeStructure/HEAD/data/paleo/documents/1-s2.0-S0016703711007290-main.pdf -------------------------------------------------------------------------------- /data/paleo/documents/1-s2.0-S0031018210006152-main.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/TreeStructure/HEAD/data/paleo/documents/1-s2.0-S0031018210006152-main.pdf -------------------------------------------------------------------------------- /data/paleo/documents/1-s2.0-S1464343X10000865-main.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/TreeStructure/HEAD/data/paleo/documents/1-s2.0-S1464343X10000865-main.pdf -------------------------------------------------------------------------------- /data/paleo/documents/Zapata-Rios_et_al-2015-Water_Resources_Research.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/TreeStructure/HEAD/data/paleo/documents/Zapata-Rios_et_al-2015-Water_Resources_Research.pdf -------------------------------------------------------------------------------- /data/paleo/ml/gt.test: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/TreeStructure/HEAD/data/paleo/ml/gt.test -------------------------------------------------------------------------------- /data/paleo/ml/gt.train: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/TreeStructure/HEAD/data/paleo/ml/gt.train -------------------------------------------------------------------------------- /data/paleo/ml/ml.bboxes: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/TreeStructure/HEAD/data/paleo/ml/ml.bboxes -------------------------------------------------------------------------------- /data/paleo/ml/model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/TreeStructure/HEAD/data/paleo/ml/model.pkl -------------------------------------------------------------------------------- /data/paleo/ml/old.gt.test: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/TreeStructure/HEAD/data/paleo/ml/old.gt.test -------------------------------------------------------------------------------- /data/paleo/ml/old.pdf.test: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/TreeStructure/HEAD/data/paleo/ml/old.pdf.test -------------------------------------------------------------------------------- /data/paleo/ml/test.pdf.list.paleo.not.scanned: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/TreeStructure/HEAD/data/paleo/ml/test.pdf.list.paleo.not.scanned -------------------------------------------------------------------------------- /data/paleo/ml/test.pdf.list.paleo.not.scanned.bbox: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/TreeStructure/HEAD/data/paleo/ml/test.pdf.list.paleo.not.scanned.bbox -------------------------------------------------------------------------------- /data/paleo/ml/test.pdf.list.paleo.not.scanned.candidates.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/TreeStructure/HEAD/data/paleo/ml/test.pdf.list.paleo.not.scanned.candidates.pkl -------------------------------------------------------------------------------- /data/paleo/ml/test.pdf.list.paleo.not.scanned.features.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/TreeStructure/HEAD/data/paleo/ml/test.pdf.list.paleo.not.scanned.features.pkl -------------------------------------------------------------------------------- /data/paleo/ml/test.pdf.list.paleo.not.scanned.labels.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/TreeStructure/HEAD/data/paleo/ml/test.pdf.list.paleo.not.scanned.labels.pkl -------------------------------------------------------------------------------- /data/paleo/ml/train.pdf.list.paleo.not.scanned: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/TreeStructure/HEAD/data/paleo/ml/train.pdf.list.paleo.not.scanned -------------------------------------------------------------------------------- /data/paleo/ml/train.pdf.list.paleo.not.scanned.candidates.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/TreeStructure/HEAD/data/paleo/ml/train.pdf.list.paleo.not.scanned.candidates.pkl -------------------------------------------------------------------------------- /data/paleo/ml/train.pdf.list.paleo.not.scanned.features.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/TreeStructure/HEAD/data/paleo/ml/train.pdf.list.paleo.not.scanned.features.pkl -------------------------------------------------------------------------------- /data/paleo/ml/train.pdf.list.paleo.not.scanned.labels.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/TreeStructure/HEAD/data/paleo/ml/train.pdf.list.paleo.not.scanned.labels.pkl -------------------------------------------------------------------------------- /data/paleo/ml/y_pred.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/TreeStructure/HEAD/data/paleo/ml/y_pred.pkl -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/TreeStructure/HEAD/readme.md -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/TreeStructure/HEAD/requirements.txt -------------------------------------------------------------------------------- /set_env.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/TreeStructure/HEAD/set_env.sh -------------------------------------------------------------------------------- /table-extraction/LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/TreeStructure/HEAD/table-extraction/LICENSE -------------------------------------------------------------------------------- /table-extraction/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /table-extraction/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /table-extraction/evaluation/char_level_evaluation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/TreeStructure/HEAD/table-extraction/evaluation/char_level_evaluation.py -------------------------------------------------------------------------------- /table-extraction/experiment.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/TreeStructure/HEAD/table-extraction/experiment.py -------------------------------------------------------------------------------- /table-extraction/img_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/TreeStructure/HEAD/table-extraction/img_utils.py -------------------------------------------------------------------------------- /table-extraction/ml/TableExtractML.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/TreeStructure/HEAD/table-extraction/ml/TableExtractML.py -------------------------------------------------------------------------------- /table-extraction/ml/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /table-extraction/ml/extract_tables.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/TreeStructure/HEAD/table-extraction/ml/extract_tables.py -------------------------------------------------------------------------------- /table-extraction/ml/features.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/TreeStructure/HEAD/table-extraction/ml/features.py -------------------------------------------------------------------------------- /table-extraction/pdf/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /table-extraction/pdf/grid.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/TreeStructure/HEAD/table-extraction/pdf/grid.py -------------------------------------------------------------------------------- /table-extraction/pdf/layout_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/TreeStructure/HEAD/table-extraction/pdf/layout_utils.py -------------------------------------------------------------------------------- /table-extraction/pdf/node.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/TreeStructure/HEAD/table-extraction/pdf/node.py -------------------------------------------------------------------------------- /table-extraction/pdf/pdf_parsers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/TreeStructure/HEAD/table-extraction/pdf/pdf_parsers.py -------------------------------------------------------------------------------- /table-extraction/pdf/pdf_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/TreeStructure/HEAD/table-extraction/pdf/pdf_utils.py -------------------------------------------------------------------------------- /table-extraction/pdf/render.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/TreeStructure/HEAD/table-extraction/pdf/render.py -------------------------------------------------------------------------------- /table-extraction/pdf/vector_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/TreeStructure/HEAD/table-extraction/pdf/vector_utils.py -------------------------------------------------------------------------------- /table-extraction/pdfminer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/TreeStructure/HEAD/table-extraction/pdfminer/__init__.py -------------------------------------------------------------------------------- /table-extraction/pdfminer/arcfour.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/TreeStructure/HEAD/table-extraction/pdfminer/arcfour.py -------------------------------------------------------------------------------- /table-extraction/pdfminer/ascii85.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/TreeStructure/HEAD/table-extraction/pdfminer/ascii85.py -------------------------------------------------------------------------------- /table-extraction/pdfminer/ccitt.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/TreeStructure/HEAD/table-extraction/pdfminer/ccitt.py -------------------------------------------------------------------------------- /table-extraction/pdfminer/cmapdb.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/TreeStructure/HEAD/table-extraction/pdfminer/cmapdb.py -------------------------------------------------------------------------------- /table-extraction/pdfminer/converter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/TreeStructure/HEAD/table-extraction/pdfminer/converter.py -------------------------------------------------------------------------------- /table-extraction/pdfminer/encodingdb.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/TreeStructure/HEAD/table-extraction/pdfminer/encodingdb.py -------------------------------------------------------------------------------- /table-extraction/pdfminer/fontmetrics.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/TreeStructure/HEAD/table-extraction/pdfminer/fontmetrics.py -------------------------------------------------------------------------------- /table-extraction/pdfminer/glyphlist.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/TreeStructure/HEAD/table-extraction/pdfminer/glyphlist.py -------------------------------------------------------------------------------- /table-extraction/pdfminer/image.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/TreeStructure/HEAD/table-extraction/pdfminer/image.py -------------------------------------------------------------------------------- /table-extraction/pdfminer/latin_enc.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/TreeStructure/HEAD/table-extraction/pdfminer/latin_enc.py -------------------------------------------------------------------------------- /table-extraction/pdfminer/layout.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/TreeStructure/HEAD/table-extraction/pdfminer/layout.py -------------------------------------------------------------------------------- /table-extraction/pdfminer/lzw.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/TreeStructure/HEAD/table-extraction/pdfminer/lzw.py -------------------------------------------------------------------------------- /table-extraction/pdfminer/pdfcolor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/TreeStructure/HEAD/table-extraction/pdfminer/pdfcolor.py -------------------------------------------------------------------------------- /table-extraction/pdfminer/pdfdevice.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/TreeStructure/HEAD/table-extraction/pdfminer/pdfdevice.py -------------------------------------------------------------------------------- /table-extraction/pdfminer/pdfdocument.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/TreeStructure/HEAD/table-extraction/pdfminer/pdfdocument.py -------------------------------------------------------------------------------- /table-extraction/pdfminer/pdffont.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/TreeStructure/HEAD/table-extraction/pdfminer/pdffont.py -------------------------------------------------------------------------------- /table-extraction/pdfminer/pdfinterp.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/TreeStructure/HEAD/table-extraction/pdfminer/pdfinterp.py -------------------------------------------------------------------------------- /table-extraction/pdfminer/pdfpage.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/TreeStructure/HEAD/table-extraction/pdfminer/pdfpage.py -------------------------------------------------------------------------------- /table-extraction/pdfminer/pdfparser.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/TreeStructure/HEAD/table-extraction/pdfminer/pdfparser.py -------------------------------------------------------------------------------- /table-extraction/pdfminer/pdftypes.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/TreeStructure/HEAD/table-extraction/pdfminer/pdftypes.py -------------------------------------------------------------------------------- /table-extraction/pdfminer/psparser.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/TreeStructure/HEAD/table-extraction/pdfminer/psparser.py -------------------------------------------------------------------------------- /table-extraction/pdfminer/rijndael.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/TreeStructure/HEAD/table-extraction/pdfminer/rijndael.py -------------------------------------------------------------------------------- /table-extraction/pdfminer/runlength.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/TreeStructure/HEAD/table-extraction/pdfminer/runlength.py -------------------------------------------------------------------------------- /table-extraction/pdfminer/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/TreeStructure/HEAD/table-extraction/pdfminer/utils.py -------------------------------------------------------------------------------- /table-extraction/tutorials/table-extraction-demo.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/TreeStructure/HEAD/table-extraction/tutorials/table-extraction-demo.ipynb -------------------------------------------------------------------------------- /table-extraction/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /table-extraction/utils/bbox_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/TreeStructure/HEAD/table-extraction/utils/bbox_utils.py -------------------------------------------------------------------------------- /table-extraction/utils/display_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/TreeStructure/HEAD/table-extraction/utils/display_utils.py -------------------------------------------------------------------------------- /table-extraction/utils/lines_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HazyResearch/TreeStructure/HEAD/table-extraction/utils/lines_utils.py --------------------------------------------------------------------------------