├── .dockerignore ├── .flake8 ├── .github └── workflows │ ├── docker-image.yml │ └── python-app.yml ├── .gitignore ├── .pre-commit-config.yaml ├── Dockerfile ├── LICENSE ├── Makefile ├── README.md ├── config-defaults.yaml ├── data ├── 2012_manifest.tsv ├── 2014_manifest.tsv ├── 2020_manifest.csv ├── 3_year_manifest.csv ├── create-training-data.py ├── fcc-data-2020-labeled-manifest.csv └── token_frequency.csv ├── deepform ├── __init__.py ├── artifacts.py ├── combine_manifests.py ├── common.py ├── data │ ├── __init__.py │ ├── add_features.py │ ├── create_vocabulary.py │ ├── graph_geometry.py │ └── tokenize_pdfs.py ├── db │ ├── .env │ ├── README.md │ ├── __init__.py │ ├── conf │ │ └── config-file.cnf │ ├── scripts │ │ ├── create_schema.sql │ │ ├── load_document_data.sql │ │ └── load_token_data.sql │ └── source.py ├── document.py ├── document_store.py ├── features.py ├── infer.py ├── logger.py ├── model.py ├── pdfs.py ├── train.py └── util.py ├── init_sweep.sh ├── poetry.lock ├── pyproject.toml ├── source ├── README.md └── ftf-all-filings.tsv ├── sweep.yaml └── tests ├── test_add_features.py ├── test_graph_geometry.py └── test_util.py /.dockerignore: -------------------------------------------------------------------------------- 1 | # Ignore everything, only make exceptions for the files we know we want. 2 | * 3 | 4 | # Whitelisted exceptions. 5 | !pyproject.toml 6 | !poetry.lock 7 | !deepform/ 8 | !tests/ 9 | !*.yaml 10 | !init_sweep.sh 11 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = D203,E203,W503 3 | exclude = __pycache__,.hypothesis,.ipynb_checkpoints,wandb,old,docs/source/conf.py,old 4 | max-line-length = 88 5 | max-complexity = 10 6 | -------------------------------------------------------------------------------- /.github/workflows/docker-image.yml: -------------------------------------------------------------------------------- 1 | name: Docker Image CI 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | branches: [ master ] 8 | 9 | jobs: 10 | 11 | build: 12 | 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - uses: actions/checkout@v2 17 | - name: Build the Docker image 18 | run: docker build . --file Dockerfile --tag deepform_learner:$(date +%s) 19 | -------------------------------------------------------------------------------- /.github/workflows/python-app.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Python application 5 | 6 | on: [push] 7 | 8 | jobs: 9 | build: 10 | runs-on: ubuntu-latest 11 | 12 | steps: 13 | - uses: actions/checkout@v2 14 | - name: Set up Python 3.8 15 | uses: actions/setup-python@v2 16 | with: 17 | python-version: 3.8 18 | 19 | - name: Install lint tools 20 | run: | 21 | python -m pip install --upgrade pip 22 | pip install black flake8 23 | 24 | - name: Check formatting with black 25 | run: | 26 | black . --check 27 | 28 | - name: Lint with flake8 29 | run: | 30 | # stop the build if there are Python syntax errors or undefined names 31 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 32 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 33 | flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 34 | 35 | - name: Install project 36 | run: | 37 | pip install poetry==1.0.10 38 | poetry config virtualenvs.create false 39 | poetry install --no-interaction --no-ansi 40 | 41 | - name: Test with pytest 42 | run: | 43 | pytest 44 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Logs 2 | logs/ 3 | wandb/ 4 | 5 | # Caches 6 | cache/ 7 | __pycache__ 8 | *.egg-info 9 | source/cached_features.p 10 | .ipynb_checkpoints 11 | .dvc/ 12 | .hypothesis/ 13 | *.joblib 14 | 15 | # Local environment files (for e.g. API keys) 16 | .env 17 | *.pem 18 | 19 | # Personal (e.g. editor) configuration 20 | .vscode/ 21 | 22 | # Data files 23 | *.csv 24 | *.feather 25 | *.gz 26 | *.model 27 | *.parquet 28 | *.pdf 29 | *.png 30 | *.pq 31 | *.npz 32 | data/token_frequency.csv 33 | pdfs/ 34 | 35 | # macOS system files 36 | .DS_Store 37 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/myint/autoflake 3 | rev: v1.4 4 | hooks: 5 | - id: autoflake 6 | args: 7 | [ 8 | "--in-place", 9 | "--remove-all-unused-imports", 10 | "--remove-unused-variable", 11 | ] 12 | - repo: https://github.com/pycqa/isort 13 | rev: 5.5.4 14 | hooks: 15 | - id: isort 16 | additional_dependencies: ["toml"] 17 | - repo: https://github.com/ambv/black 18 | rev: 20.8b1 19 | hooks: 20 | - id: black 21 | language_version: python3.8 22 | - repo: https://github.com/PyCQA/flake8 23 | rev: 3.8.3 24 | hooks: 25 | - id: flake8 26 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.8.6 2 | 3 | ENV PYTHONFAULTHANDLER=1 \ 4 | PYTHONHASHSEED=random \ 5 | PYTHONUNBUFFERED=1 \ 6 | PIP_DEFAULT_TIMEOUT=100 \ 7 | PIP_DISABLE_PIP_VERSION_CHECK=1 \ 8 | PIP_NO_CACHE_DIR=1 9 | 10 | # Install dependencies for pdfplumber. 11 | RUN apt-get update && apt-get install -y \ 12 | libmagickwand-dev ghostscript \ 13 | --no-install-recommends 14 | 15 | # Allow imagemagick to read and write PDFs by relaxing the PDF entry in its security policy. 16 | RUN sed -i 's/rights="none" pattern="PDF"/rights="read|write" pattern="PDF"/' \ 17 | /etc/ImageMagick-6/policy.xml 18 | 19 | # Get this out of the way early, because it takes so damn long -- we really want to cache it. 20 | RUN pip install "tensorflow==2.3.1" 21 | 22 | # Install Poetry and project dependencies. 23 | RUN pip install "poetry==1.1.0" 24 | RUN poetry config virtualenvs.create false 25 | COPY pyproject.toml poetry.lock ./ 26 | RUN poetry install --no-root 27 | 28 | # Install an editable copy of the project. 29 | COPY . .
30 | RUN poetry install --no-interaction --no-ansi 31 | 32 | CMD ["/bin/bash"] 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 project-deepform 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | TEST_PATH=$(CURDIR)/tests 2 | CONTAINER=deepform/deepform_learner:latest 3 | 4 | .DEFAULT_GOAL := help 5 | 6 | .PHONY: help 7 | help: ## Show this help dialog 8 | @grep -E '^[a-zA-Z/\._-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' 9 | 10 | .PHONY: test 11 | test: docker-build ## Run all the unit tests for the project 12 | docker run --rm --mount type=bind,source=$(CURDIR)/data,target=/data $(CONTAINER) \ 13 | pytest --verbose --color=yes tests 14 | 15 | .PHONY: clean-all 16 | clean-all: 17 | -rm -r data/cache data/labeled data/tokenized data/training 18 | -rm data/training.parquet data/doc_index.parquet 19 | 20 | .PHONY: docker-build 21 | docker-build: ## Build the docker container 22 | docker build -t $(CONTAINER) . 23 | 24 | .PHONY: docker-stop 25 | docker-stop: ## Stop any running instances of the deepform docker container on this system 26 | -docker ps | grep $(CONTAINER) | cut -d' ' -f1 | xargs docker stop 27 | 28 | .PHONY: docker-shell 29 | docker-shell: docker-stop docker-build ## Launch a shell into a docker container containing the required dependencies and data 30 | docker run -ti --rm --env-file=.env \ 31 | --mount type=bind,source=$(CURDIR)/deepform,target=/deepform \ 32 | --mount type=bind,source=$(CURDIR)/data,target=/data \ 33 | $(CONTAINER) 34 | 35 | .PHONY: docker-background 36 | docker-background: docker-stop docker-build ## Launch a docker container as a background process. 37 | docker run -td --rm --env-file=.env \ 38 | --mount type=bind,source=$(CURDIR)/deepform,target=/deepform \ 39 | --mount type=bind,source=$(CURDIR)/data,target=/data \ 40 | $(CONTAINER) 41 | 42 | # This was used by a previous version of our codebase. 
43 | # data/training.parquet: 44 | # curl https://project-deepform.s3-us-west-1.amazonaws.com/training_data/training.parquet -o data/training.parquet 45 | 46 | data/pdfs: data/2020_manifest.csv ## Downloads all PDFs to local storage. Not usually necessary. 47 | docker build -t $(CONTAINER) . 48 | docker run --rm --mount type=bind,source=$(CURDIR)/data,target=/data $(CONTAINER) python -c "import pandas as pd; print('\n'.join(pd.read_csv('data/fcc-data-2020-labeled-manifest.csv').URL))" | xargs wget -P data/pdfs 49 | 50 | # This is the command we used to produce the tokenized data, but it is cached in S3 51 | # data/tokenized: data/pdfs 52 | # docker build -t $(CONTAINER) . 53 | # docker run --rm --mount type=bind,source=$(CURDIR)/data,target=/data $(CONTAINER) python -m deepform.data.tokenize_pdfs 54 | 55 | data/tokenized: ## Get document tokens from S3 56 | curl https://project-deepform.s3-us-west-1.amazonaws.com/training_data/token_data.tar.gz -o data/tokenized.tar.gz 57 | mkdir -p data/tokenized 58 | tar xf data/tokenized.tar.gz -C data/tokenized 59 | 60 | data/token_frequency.csv: data/tokenized ## Produce token frequency csv file 61 | docker build -t $(CONTAINER) . 62 | docker run --rm --mount type=bind,source=$(CURDIR)/data,target=/data $(CONTAINER) \ 63 | python -m deepform.data.create_vocabulary 64 | 65 | data/3_year_manifest.csv: data/2012_manifest.tsv data/2014_manifest.tsv data/2020_manifest.csv ## Combine manifests from three years into one manifest with all three years' data 66 | docker build -t $(CONTAINER) . 67 | docker run --rm --mount type=bind,source=$(CURDIR)/data,target=/data $(CONTAINER) \ 68 | python -m deepform.data.combine_manifests 69 | 70 | data/doc_index.parquet: data/tokenized data/token_frequency.csv data/3_year_manifest.csv ## Create the training data from the token files and label manifest 71 | docker build -t $(CONTAINER) . 72 | docker run --rm --mount type=bind,source=$(CURDIR)/data,target=/data $(CONTAINER) \ 73 | python -m deepform.data.add_features data/3_year_manifest.csv 74 | 75 | .PHONY: train 76 | train: data/doc_index.parquet data/token_frequency.csv .env docker-build ## Run full model training 77 | docker run --rm --env-file=.env \ 78 | --mount type=bind,source=$(CURDIR)/data,target=/data $(CONTAINER) \ 79 | python -um deepform.train 80 | 81 | .PHONY: test-train 82 | test-train: data/doc_index.parquet data/token_frequency.csv .env docker-build ## Run training on a small sample to test and validate code 83 | docker run --rm --env-file=.env \ 84 | --mount type=bind,source=$(CURDIR)/data,target=/data $(CONTAINER) \ 85 | python -um deepform.train --len-train=100 --steps-per-epoch=3 --epochs=2 --log-level=DEBUG --use-wandb=0 --use-data-cache=0 --save-model=0 --doc-acc-max-sample-size=20 --render-results-size=3 86 | 87 | .PHONY: sweep 88 | sweep: data/doc_index.parquet data/token_frequency.csv .env docker-build ## Run a Weights & Biases training sweep.
89 | docker run --rm --env-file=.env \ 90 | --mount type=bind,source=$(CURDIR)/data,target=/data $(CONTAINER) \ 91 | ./init_sweep.sh 92 | 93 | VERSION='stable' 94 | download-model: ## Download a model for use with the inference script 95 | docker run --rm --env-file=.env \ 96 | --mount type=bind,source=$(CURDIR)/data,target=/data $(CONTAINER) \ 97 | python -m deepform.artifacts --version $(VERSION) 98 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Deepform 2 | 3 | ![Python build](https://github.com/project-deepform/deepform/workflows/Python%20application/badge.svg) 4 | ![Docker image](https://github.com/project-deepform/deepform/workflows/Docker%20Image%20CI/badge.svg) 5 | 6 | Deepform is a project to extract information from TV and cable political advertising disclosure forms using deep learning. [This public data, maintained by the FCC](https://publicfiles.fcc.gov/), is valuable to journalists but locked in PDFs. Our goal is to provide the 2020 dataset for NLP/AI researchers and to make our method available to future data scientists working in this field. Past projects have managed to produce similar datasets only with great manual effort or by addressing only the most common form types, ignoring the tail of hundreds of rare form types. This work uses deep learning models that are able to generalize over form types and "learn" how to find five fields: 7 | 8 | - Contract number (multiple documents can have the same number, as a contract for future air dates is revised) 9 | - Advertiser name (often the name of a political [committee](https://www.fec.gov/data/browse-data/?tab=committees), but not always) 10 | - Start and end air dates (often known as "flight dates") 11 | - Total amount paid for the ads 12 | 13 | The [initial attempt to use deep learning for this work](https://github.com/jstray/deepform), by Jonathan Stray in summer 2019, achieved 90% accuracy extracting total spending from the PDFs in the (held-out) test set, which shows that deep learning can generalize surprisingly well to previously unseen form types. For a discussion of how the 2019 prototype works, see [this post](http://jonathanstray.com/extracting-campaign-finance-data-from-gnarly-pdfs-using-deep-learning). 14 | 15 | ## Why? 16 | 17 | This project is timely and relevant for a variety of reasons, some of them pertaining to this particular dataset and others to the method we are following. 18 | 19 | Election transparency is an increasingly important component of the US electoral process, and making this data available to journalists at low or no cost is key to that transparency. As the data is archived in tens of thousands of non-machine-readable PDF files in hundreds of different formats, it is beyond the capacity of journalistic entities to extract it by hand in a useful way. The data is available for purchase from private entities, but we interviewed journalists who mentioned that it comes with a price tag of $100K or more _per newspaper_ that wishes to use it. 20 | 21 | Past projects have used [volunteer labor](https://www.niemanlab.org/2012/12/crowdsourcing-campaign-spending-what-propublica-learned-from-free-the-files/) or [hand-coded form layouts](https://github.com/alexbyrnes/FCC-Political-Ads) to produce usable datasets. Project Deepform replicates this data extraction using modern deep learning techniques.
This is desirable because we are not only positioned to produce a usable dataset in the context of the 2020 election, but the method will also be available to our team and other data science teams to produce similar datasets in the run-up to future US elections. 22 | 23 | For our own purposes as members of the investigative data science community, Project Deepform functions as an open source springboard for future form extraction projects. Projects of this kind are becoming widely popular as the tools have improved within the past half decade to make this work possible. The general problem is known as "knowledge base construction" in the research community, and the current state of the art is achieved by multimodal systems such as [Fonduer](https://fonduer.readthedocs.io/en/latest/). A group at Google released [a paper](https://research.google/pubs/pub49122/) earlier in 2020 which describes a related process; Google also supports [Cloud Document AI](https://levelup.gitconnected.com/how-to-parse-forms-using-google-cloud-document-ai-68ad47e1c0ed), and others have made progress using [graph convolutional networks](https://link.springer.com/chapter/10.1007/978-3-030-21074-8_12). 24 | 25 | Finally, we have prepared this project dataset and its goals as a [benchmark project on Weights and Biases](https://wandb.ai/deepform/political-ad-extraction/benchmark). Here, other data scientists are encouraged to improve on the baseline success rates we have attained. 26 | 27 | 28 | ## Setting up the Environment 29 | 30 | The project is primarily intended to be run with [Docker](https://www.docker.com/products/docker-desktop), which eases issues with Python virtual environments, but it can also be run locally -- this is easiest to do with [Poetry](https://python-poetry.org/). 31 | 32 | ### Docker 33 | 34 | To use Docker, you'll have to be running the daemon, which you can find and install from https://www.docker.com/products/docker-desktop. Fortunately, that's _all_ you need. 35 | 36 | The project has a `Makefile` that covers most of the things you might want to do with the project. To get started, simply run 37 | 38 | `make train` 39 | 40 | or see below for other commands. 41 | 42 | 43 | ### Poetry - dependency management and running locally 44 | 45 | Deepform manages its dependencies with `Poetry`, which you only need if you want to run it locally or alter the project dependencies. You can install Poetry using any of the methods listed in their [documentation](https://python-poetry.org/docs/#installation). 46 | 47 | If you want to run Deepform locally: 48 | 49 | - run `poetry install` to install the deepform package and all of its dependencies into a fresh virtual environment 50 | - enter this environment with `poetry shell` 51 | - or run a one-off command with `poetry run <command>` 52 | 53 | Since deepform is an installed package inside the virtual environment Poetry creates, run the code as modules, e.g. `python -m deepform.train` instead of `python deepform/train.py` -- this ensures that imports and relative paths work the way they should. 54 | 55 | To update project dependencies: 56 | 57 | - `poetry add <package>` adds a new Python package as a requirement 58 | - `poetry remove <package>` removes a package that's no longer needed 59 | - `poetry update` updates all the dependencies to their latest non-conflicting versions 60 | 61 | These three commands alter `pyproject.toml` and `poetry.lock`, which should be committed to git. Using them ensures that our project has reproducible builds. A sketch of a full local session is shown below.
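Putting the pieces together, a typical local (non-Docker) session looks roughly like the sketch below. This is a minimal illustration that assumes the training data described in the next section is already in place under `data/`; the module invocations are the same ones the Makefile targets run inside Docker.

```bash
poetry install    # create the virtualenv and install deepform plus its dependencies
poetry shell      # enter the virtualenv

# Label and featurize the tokenized documents using the combined manifest
# (the command behind the data/doc_index.parquet Makefile target).
python -m deepform.data.add_features data/3_year_manifest.csv

# Train; --use-wandb=0 disables Weights & Biases logging.
python -um deepform.train --use-wandb=0
```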
62 | 63 | 64 | ## Training Data 65 | ### Getting the Training Data 66 | 67 | Running `make train` will acquire all the data you need _and_ will train the model. The total training data for this project consists of three label manifests (discussed below in detail) and 20,000 .parquet files containing the tokens and geometry from the PDFs used in training. Running `make train` will automatically run, in sequence, a series of commands which acquire, restructure and label the training data. These commands can alternatively be run manually, in sequence. 68 | 69 | 1. `make data/tokenized` downloads _all_ the unlabeled .parquet files (training and test) from an S3 bucket to the folder data/tokenized. 70 | 71 | 2. `make data/token_frequency.csv` constructs a vocabulary of tokens from all these .parquet files. 72 | 73 | 3. `make data/3_year_manifest.csv` combines three label manifests from three different election years (2012, 2014 and 2020) into a single manifest (`data/3_year_manifest.csv`) and includes a column 'year' to differentiate between the three years' data. 74 | 75 | 4. `make data/doc_index.parquet` will utilize the unlabeled .parquet files in the folder data/tokenized along with 3_year_manifest.csv (already in the repo) to generate a new set of _labeled_ .parquet files in the folder data/training containing the tokens and geometry along with a new column for each of the five target types. Each of these columns stores the match percentage (for each token) between that token and the target in question. This script also computes other relevant features, such as whether the token is a date or a dollar amount, which are fed into the model as additional features. Some targets are more than one token in length, so in these cases the column contains the likelihood that each token is a member of the target token string. 76 | 77 | This multi-token matching process receives a value for the maximum number of tokens (n) which might match the target ("Obama For America" is three tokens long, while "1/12/2020" is one token long). Due to OCR errors, some dates and dollar amounts are more than one token in length. We then calculate a match percentage for all strings of tokens of lengths (n, n-1, ... , 1). The highest match is achieved when the number of tokens is correct and the tokens match the target from the label manifest. Finally, since each token will participate in many match attempts, each token is assigned a match percentage which corresponds to the highest match it participated in. This table shows how "Obama for America" might be found. 78 | 79 | ``` 80 | ... 81 | token, n=1, n=2, n=3, n=4, n=5, ... 82 | contract,.1,.2,.2,.2,.1,... 83 | obama,.7,.6,.5,.4,.3,... 84 | $45,000,.03,.6,.5,.3,.65,... 85 | committee,.1,.6,.4,.75,.65,... 86 | obama,.7,.8,1.0,.75,.65,... 87 | for,.5,.8,1.0,.75,.65,... 88 | america,.67,.81,1.0,.75,.65,... 89 | 11/23/12,.03,.4,.4,.5,.6,... 90 | 11/29/12,.03,.03,.2,.3,.2,... 91 | ... 92 | ``` 93 | 94 | ### Form of the training data 95 | All the data (training and test) for this project was originally raw PDFs, downloadable from the [FCC website](https://publicfiles.fcc.gov/) with up to 100,000 PDFs per election year. The _training_ data consists of some 20,000 of these PDFs, drawn from three different election years (2012, 2014 and 2020) according to available labels (see below), and three label manifests. 96 | 97 | The original PDFs were OCR'd, tokenized, and turned into .parquet files, one for each PDF.
The .parquet files are each named with the document slug and contain all of that document's tokens and their geometry on the page. Geometry is given in 1/100ths of an inch. 98 | 99 | The .parquet files are formatted as "tokens plus geometry" like this: 100 | 101 | `473630-116252-0-13442821773323-_-pdf.parquet` contains 102 | 103 | ``` 104 | page,x0,y0,x1,y1,token 105 | 0,272.613,438.395,301.525,438.439,$275.00 106 | 0,410.146,455.811,437.376,455.865,Totals 107 | 0,525.84,454.145,530.288,454.189,6 108 | 0,556.892,454.145,592.476,454.189,"$1,170.00" 109 | 0,18.0,480.478,37.998,480.527,Time 110 | 0,40.5,480.478,66.51,480.527,Period 111 | ... 112 | ``` 113 | 114 | The document name (the `slug`) is a unique document identifier, ultimately from the source TSV. The page number runs from 0 to 1, and the bounding box is in the original PDF coordinate system. The actual token text is reproduced as `token`. 115 | 116 | These .parquet files still lack labels, however. Labels are provided in three "label manifests" for these three election years (2012, 2014 and 2020), each of which is a .csv or .tsv containing a column of file IDs (called slugs) and columns containing labels for each of the fields of interest for each document. Each year has a slightly different set of extracted fields, sometimes including additional extracted fields not used by the model in this repo. All three manifests are combined in data/3_year_manifest.csv. All three label manifests and the combined manifest are available in the `data` folder. If they are not present, they can be recovered from various sources as detailed below. 117 | 118 | Using the labels in 3_year_manifest.csv and the 20,000 unlabeled token files, labeled token files are produced in the folder `data/training`, which have the following form. These are the training data as provided to the model. 119 | 120 | ``` 121 | page x0 y0 x1 y1 token contract_num advertiser flight_from flight_to gross_amount tok_id length digitness is_dollar log_amount label 122 | 0 18 17.963 48.232 26.899 Contract 0 0.27 0 0 0 53 8 0 0 0 0 123 | 0 50.456 17.963 89.584 26.899 Agreement 0 0.33 0 0 0 115 9 0 0 0 0 124 | 0 474.001 17.963 505.137 26.899 1/15/20 0.4 0.26 0.38 0.88 0.22 0 8 0.75 0 0 0 125 | 0 414.781 65.213 445.917 74.149 1475302 1 0.26 0.4 0.27 0.67 0 7 1 1 14.204374 1 126 | 0 495.842 65.213 550.978 74.149 WOC12348242 0.33 0.26 0.32 0.32 0.19 663 11 0.72727275 0 0 0 127 | 0 183.909 90.193 298.949 101.363 www.gray.tv/advertising 0 0.58 0.06 0.06 0.06 1796 23 0 0 0 0 128 | 0 309.002 90.923 326.786 99.859 Mike 0 1 0 0 0 664 4 0 0 0 2 129 | 0 329.01 90.923 371.234 99.859 Bloomberg 0 1 0 0 0 821 9 0 0 0 2 130 | 0 373.458 90.923 393.474 99.859 2020, 0.33 1 0.31 0.46 0.67 0 5 0.8 0 0 2 131 | 0 395.698 90.923 407.258 99.859 Inc 0 1 0 0 0 166 3 0 0 0 2 132 | 0 491.041 90.683 522.177 99.619 12/31/19 0.27 0.74 0.88 0.5 0.22 0 8 0.75 0 0 0 133 | 0 308.251 103.463 338.483 112.399 Contract 0 0.24 0 0 0 53 8 0 0 0 0 134 | 0 340.707 103.463 361.603 112.399 Dates 0 0.23 0 0 0 18 5 0 0 0 0 135 | 0 407.251 103.463 438.371 112.399 Estimate 0 0.26 0 0 0 23 8 0 0 0 0 136 | 0 308.251 115.703 339.387 124.639 12/30/19 0.4 0.26 1 0.5 0.33 0 8 0.75 0 0 3 137 | 0 346.499 115.703 377.635 124.639 1/12/20 0.27 0.21 0.5 1 0.22 0 8 0.75 0 0 4 138 | ... 139 | ``` 140 | 141 | N.B. As it is written currently, the model only trains on the one thousand documents of 2020 data.
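For a quick sanity check of what the model actually consumes, the labeled files can be inspected directly with pandas. The snippet below is a small sketch: the path is a placeholder for whichever labeled document you pick from `data/training`, and the integer `label` column corresponds to the `TokenType` enum defined in `deepform/data/add_features.py` (0 means the token is not part of any target field; 1-5 mean contract number, advertiser, flight start/end dates, and gross amount).

```python
from pathlib import Path

import pandas as pd

# Mapping used by deepform.data.add_features.TokenType (0 = not part of any target field).
LABEL_NAMES = {
    0: "none",
    1: "contract_num",
    2: "advertiser",
    3: "flight_from",
    4: "flight_to",
    5: "gross_amount",
}

# Placeholder: grab any labeled document produced by `make data/doc_index.parquet`.
doc_path = next(Path("data/training").glob("*.parquet"))
doc = pd.read_parquet(doc_path)

# Show the tokens that were labeled as part of some target field,
# along with a few of their fuzzy-match scores against the manifest answers.
labeled = doc[doc.label > 0]
print(
    labeled[["token", "label", "contract_num", "advertiser", "gross_amount"]]
    .assign(field=labeled.label.map(LABEL_NAMES))
)
```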
142 | 143 | ### Where the labels come from 144 | #### 2012 Label Manifest 145 | In 2012, ProPublica ran the Free The Files project (you can [read how it worked](https://www.niemanlab.org/2012/12/crowdsourcing-campaign-spending-what-propublica-learned-from-free-the-files/)) and hundreds of volunteers hand-entered information for over 17,000 of these forms. That data drove a bunch of campaign finance [coverage](https://www.propublica.org/series/free-the-files) and is now [available](https://www.propublica.org/datastore/dataset/free-the-files-filing-data) from their data store. 146 | 147 | The label manifest for 2012 data was downloaded from ProPublica and is located at `data/2012_manifest.tsv` (renamed from ftf-all-filings.tsv, the filename it downloads as). If the manifest is not present, it can be recovered from [their website](https://www.propublica.org/datastore/dataset/free-the-files-filing-data). This file contains the crowdsourced answers for some of our targets (omitting flight dates) and the PDF URL. 148 | 149 | #### 2014 Label Manifest 150 | In 2014 Alex Byrnes [automated](https://github.com/alexbyrnes/FCC-Political-Ads) a similar extraction by hand-coding form layouts. This works for the dozen or so most common form types but ignores the hundreds of different PDF layouts in the long tail. 151 | 152 | The label manifest for 2014 data, acquired from Alex's GitHub, is `data/2014_manifest.tsv`. If the manifest is not present, it can be recovered from [his GitHub](https://github.com/alexbyrnes/FCC-Political-Ads) (renamed from 2014-orders.tsv, the filename it downloads as). This file contains the extracted answers for some of our targets (omitting 'gross amount'). 153 | 154 | 155 | #### 2020 Label Manifest 156 | 157 | ##### All 2020 PDFs 158 | PDFs for the 2020 political ads and associated metadata were uploaded to [Overview Docs](https://www.overviewdocs.com/documentsets/22569). To collect the PDFs, the file names were pulled from the [FCC API OPIF file search](https://publicfiles.fcc.gov/developer/) using the search terms: order, contract, invoice, and receipt. The search was run with filters for campaign year set to 2020 and source service code set to TV. 159 | 160 | The FCC API search also returns the source service code (entity type, i.e. TV, cable), entity ID, callsign, political file type (political ad or non-candidate issue ad), office type (presidential, senate, etc.), Nielsen DMA rank (TV market area), network affiliation, and the time stamps for when the ad was created and last modified. These were added to the Overview document set along with the search term, the URL for the FCC download, and the date of the search. 161 | 162 | For these PDFs, the following steps were followed to produce training data: 163 | 164 | - Convert each PDF to a series of images 165 | - Determine the angle of each page and rotate if needed 166 | - Use Tesseract to OCR each image 167 | - Upload the processed PDF to an S3 bucket and add its URL to Overview 168 | - Upload additional metadata on whether OCR was needed, the original angle of each page, and any errors that occurred during the OCR process. 169 | 170 | ##### A Subset for Training 171 | [A sample of 1000 documents](https://www.overviewdocs.com/documentsets/22186) was randomly chosen for hand labeling as 2020 training data. 172 | 173 | The label manifest for 2020 data is `data/2020_manifest.csv` (renamed from fcc-data-2020-sample-updated.csv, the filename it downloads as).
If the manifest is not present, it can be recovered from [this overview document set](https://www.overviewdocs.com/documentsets/22186). This file contains our manually entered answers for all five of our targets for the 1000 randomly chosen documents. 174 | 175 | 176 | ### Where the PDFs and token files come from 177 | #### Acquiring .parquet files directly 178 | 179 | The best way to run this project is to acquire the 20,000 .parquet files containing the tokens and geometry for each PDF in the training set. The token files are downloaded from our S3 bucket by running `make data/tokenized`. If you run `make train`, the program will automatically run `make data/tokenized`, as this is a dependency for `make train`. These .parquet files are then located in the folder data/tokenized. This is the easiest way to get this data. 180 | 181 | #### Acquiring Raw PDFs 182 | 183 | To find the original PDFs, it is always possible to return to the [FCC website](https://publicfiles.fcc.gov/) and download them directly using the proper filters (search terms: order, contract, invoice, and receipt; filters: campaign year = 2020, source service code = TV). The 2012, 2014 and 2020 data used by ProPublica, by Alex Byrnes, or by ourselves to create the three label manifests can also be found in different locations, as follows: 184 | 185 | ##### 2012 Training PDFs 186 | 187 | 90% of the original PDFs from the Free the Files Project are available on DocumentCloud and can be recovered by running 'curl' on url = 'https://documentcloud.org/documents/' + slug + '.pdf'. These PDFs can also be found in [this folder](https://drive.google.com/drive/folders/1bsV4A-8A9B7KZkzdbsBnCGKLMZftV2fQ?usp=sharing). If you download PDFs from one of these sources, place them in the folder `data/pdfs`. 188 | 189 | ##### 2014 Training PDFs 190 | 191 | [Alex Byrnes' GitHub](https://github.com/alexbyrnes/FCC-Political-Ads) directs users back to the [FCC website](https://publicfiles.fcc.gov/) to get his data. He does not host it separately. The PDFs are also available in [this Google Drive folder](https://drive.google.com/drive/folders/1aTuir0Y6WdD0P3SRUazo_82u7o8SnVf2). If you download PDFs from one of these sources, place them in the folder `data/pdfs`. 192 | 193 | ##### 2020 Training PDFs 194 | 195 | The one thousand 2020 PDFs we hand labeled are available on Overview Docs as [this dataset](https://www.overviewdocs.com/documentsets/22186). 196 | 197 | These PDFs can also be acquired from the FCC database by running `make data/pdfs`. This command will place all the PDFs associated with 2020 training data in the folder `data/pdfs`. 198 | 199 | #### Converting Raw PDFs to .parquet files 200 | 201 | If you have a set of PDF files located in `data/pdfs` and would like to tokenize them, you can use a rule in the Makefile which is typically commented out. Uncomment `data/tokenized: data/pdfs` and the associated lines below it, and comment out the other `data/tokenized` rule. This command will create the folder data/tokenized containing the .parquet files of tokens and geometry corresponding to each of the PDFs in `data/pdfs`. 202 | 203 | ## Training 204 | ### How the model works 205 | 206 | The easiest fields are contract number and total. The model is a fully connected three-layer network trained on a window of tokens from the data, typically 20-30 tokens. Each token is hashed to an integer mod 1000, then converted to 1-hot representation and embedded into 64 dimensions.
This embedding is combined with geometry information (bounding box and page number) and also some hand-crafted "hint" features, such as whether the token matches a regular expression for dollar amounts. For details, see [the talk](https://www.youtube.com/watch?v=uNN59kJQ7CA). 207 | 208 | We also incorporate custom "hint" features. For example, the total extractor uses an "amount" feature that is the log of the token value, if the token string is a number. 209 | 210 | 211 | ### Running in Docker 212 | 213 | - `make test` runs all the unit tests for the project 214 | - `make docker-shell` will spin up a container and drop you into a bash shell after mounting the `deepform` folder of code so that commands that you run there reflect the code as you are editing it. 215 | - `make train` runs `deepform/train.py` with the default configuration. **If it needs to, it will download and preprocess the data it needs to train on.** 216 | - `make test-train` runs the same training loop on the same data, but with very strongly reduced settings (just a few documents for a few steps) so that it can be used to check that it actually works. 217 | - `make sweep` runs a hyperparameter sweep with Weights & Biases, using the configuration in `sweep.yaml` 218 | 219 | Some of these commands require an `.env` file located at the root of the project directory. 220 | 221 | If you don't want to use Weights & Biases, you can turn it off by setting `use_wandb=0`. You'll still need an `.env` file, but it can be empty. 222 | 223 | ### Running Locally using Poetry 224 | 225 | For each of the above commands, rather than running the make command (which automatically runs in Docker), run the Python command embedded in that make target. E.g. rather than running `make test-train`, run `python -um deepform.train --len-train=100 --steps-per-epoch=3 --epochs=2 --log-level=DEBUG --use-wandb=0 --use-data-cache=0 --save-model=0 --doc-acc-max-sample-size=20 --render-results-size=3` 226 | 227 | ## Code quality and pre-commit hooks 228 | 229 | The code is currently automatically formatted with [black](https://black.readthedocs.io/en/stable/), uses [autoflake](https://pypi.org/project/autoflake/) to remove unused imports, [isort](https://timothycrosley.github.io/isort/) to sort them, and [flake8](https://flake8.pycqa.org/en/latest/) to check for PEP8 violations. These tools are configured in `pyproject.toml` and should Just Work™ -- you shouldn't have to worry about them at all once you install them. 230 | 231 | To make this as painless as possible, `.pre-commit-config.yaml` contains rules for automatically running these tools as part of `git commit`. To turn these git pre-commit hooks on, you need to run `pre-commit install` (after installing it and the above libraries with Poetry or pip). After that, whenever you run `git commit`, these tools will run and clean up your code so that "dirty" code never gets committed in the first place. 232 | 233 | GitHub runs a "python build" Action whenever you push new code to a branch (configured in [python-app.yml](https://github.com/project-deepform/deepform/blob/master/.github/workflows/python-app.yml)). This also runs `black`, `flake8`, and `pytest`, so it's best to just make sure things pass locally before pushing to GitHub. 234 | 235 | ## Looking Forward 236 | 237 | This is a difficult data set that is very relevant to journalism, and improvements in technique will be immediately useful to campaign finance reporting.
238 | 239 | Our next steps include additional pre-processing steps to rotate improperly scanned documents and to identify and separate concatenated documents. The default parameter settings we are using are fairly good but can likely be improved further. We have leads on additional training data, produced via hand-labeling in a couple of related projects, which we are hoping to incorporate. We believe there is potential here for some automated training data creation. Finally, we are not at present making use of the available 2012 and 2014 training data, and these data may be able to dramatically improve model accuracy. 240 | 241 | We would love to hear from you! Contact jstray on [Twitter](https://twitter.com/jonathanstray) or through his [blog](http://jonathanstray.com). 242 | -------------------------------------------------------------------------------- /config-defaults.yaml: -------------------------------------------------------------------------------- 1 | wandb_version: 1 2 | 3 | len_train: 4 | desc: number of documents to use (training + validation) 5 | value: 15000 6 | 7 | # training dataset settings required for benchmark submissions 8 | # do not change these if you'd like a pure comparison to the 9 | # other benchmark submissions 10 | val_split: 11 | value: 0.2 12 | random_seed: 13 | value: 23 14 | 15 | # sweeps suggest these are reasonable hyperparameter defaults 16 | window_len: 17 | desc: size of token sequences to train on (and network size!) 18 | value: 25 19 | 20 | # feature generation 21 | pad_windows: 22 | desc: zero pad beginning and end of doc token stream 23 | value: 1 24 | use_amount: 25 | desc: use token dollar value directly as feature 26 | value: 1 27 | use_page: 28 | desc: use token page number as feature 29 | value: 1 30 | use_geom: 31 | desc: use token geometry (bbox corner) as feature 32 | value: 1 33 | use_string: 34 | desc: use token string as feature 35 | value: 1 36 | use_hints: 37 | desc: use hard coded field names ("total") as features 38 | value: 1 39 | 40 | vocab_size: 41 | desc: identify (1-hot encode) this many common tokens 42 | value: 512 43 | vocab_embed_size: 44 | desc: number of outputs in the vocab embedding 45 | value: 16 46 | 47 | # graph feature generation and utilization 48 | use_adjacency_matrix: 49 | desc: whether to generate adjacency matrices and load them in documents 50 | value: 0 51 | 52 | target_thresh: 53 | desc: throw away token matches to PP crowdsourced data that aren't at least this good 54 | value: 0.8 55 | 56 | # network size 57 | num_layers: 58 | desc: number of layers in model, 2 or 3 59 | value: 3 60 | layer_1_size_factor: 61 | desc: layer 1 size = this factor * window_len * token_dims 62 | value: 4 63 | layer_2_size_factor: 64 | desc: layer 2 size = this factor * window_len * token_dims 65 | value: 2 66 | layer_3_size_factor: 67 | desc: layer 3 size = this factor * window_len * token_dims 68 | value: 1 69 | dropout: 70 | value: 0.2 71 | 72 | # training config 73 | epochs: 74 | value: 50 75 | steps_per_epoch: 76 | value: 50 77 | batch_size: 78 | desc: batch size in windows (not docs) 79 | value: 10000 80 | positive_fraction: 81 | desc: target match scores larger than this will become positive labels 82 | value: 0.5 83 | permute_tokens: 84 | desc: randomly re-order tokens in each training window 85 | value: 0 86 | 87 | penalize_missed: 88 | desc: how much more a missed 1 counts than a missed 0 in outputs 89 | value: 5 90 | 91 | learning_rate: 92 | value: 0.001 93 | 94 | # Affects prediction 95 |
predict_thresh: 96 | desc: predictions below this value count as predicting "None" 97 | value: 0.5 98 | 99 | # These do not affect the training but control various setup and reporting 100 | render_results_size: 101 | desc: log this many PDF images on last epoch 102 | value: 20 103 | use_data_cache: 104 | desc: use pickled saved training data (freezes options like padding, amount_feature) 105 | value: 1 106 | doc_acc_max_sample_size: 107 | desc: never sample more than this many documents 108 | value: 1000 109 | doc_acc_sample_size: 110 | desc: sample epoch+this documents to compute doc_val_acc (uses all docs on last epoch) 111 | value: 10 112 | save_model: 113 | desc: whether to save the trained model 114 | value: 1 115 | model_path: 116 | desc: path to save the model (if not set, autogenerate) 117 | value: "" 118 | model_artifact_name: 119 | desc: used to identify saved models in Weights & Biases 120 | value: deepform-model 121 | use_wandb: 122 | desc: report run to wandb and store annotations 123 | value: 1 124 | log_level: 125 | desc: minimum level to report in the logs 126 | value: INFO 127 | -------------------------------------------------------------------------------- /data/create-training-data.py: -------------------------------------------------------------------------------- 1 | # This takes the token file and does a number of things: 2 | # - rejects documents with too few tokens (need OCR) or no ground truth 3 | # - normalizes page numbers in 0..1 4 | # - provides fuzzy matching scores for each token vs ground truth tokens 5 | 6 | import csv 7 | import decimal 8 | 9 | import numpy as np 10 | import pandas as pd 11 | from fuzzywuzzy import fuzz 12 | 13 | from util import is_dollar_amount, normalize_dollars 14 | 15 | output_docs = 0 16 | 17 | # Data in filings that we want to find. 18 | # We output a column for each one of these, indicating how close the token is to 19 | # the correct answer. 20 | # For our first experiment, just extract gross_amount 21 | # Other possible targets include 'committee','agency','callsign' 22 | targets = ["gross_amount", "contract_number", "committee"] 23 | 24 | filings = pd.read_csv("../source/ftf-all-filings.tsv", sep="\t") 25 | 26 | incsv = pd.read_parquet("training.parquet") 27 | 28 | outcols = ["slug", "page", "x0", "y0", "x1", "y1", "token", "gross_amount"] + targets 29 | outcsv = csv.DictWriter(open("training.csv", mode="w"), fieldnames=outcols) 30 | outcsv.writeheader() 31 | 32 | 33 | # computes fuzzy distance from each token in the series to the target answer for 34 | # the document answer may be multiple tokens, in which case we take the max of 35 | # matches. 36 | def multi_token_target_match(answer, tokens, target, max_n, anstok): 37 | best_match = [0 for i in range(max_n)] 38 | best_idx = [0 for i in range(max_n)] 39 | # Two dimensional because we will have one array for each possible n-gram length. 40 | ratioslist = np.zeros((max_n, len(tokens))) 41 | # For each possible number of tokens in answertoken: 42 | for i in range(max_n): 43 | # For each n-gram of that length in the doc: 44 | for idx in range(0, len(tokens) - i): 45 | # Make it one token so we can compare. 46 | token_string = "".join(str(t) for t in tokens[idx : idx + i + 1]) 47 | # Compare and store the float in match. 48 | match = fuzz.ratio(anstok, token_string) / 100.0 49 | # Update the ratioslist matrix with this match value for the n-gram 50 | # length and index. 51 | ratioslist[i, idx] = match 52 | # Update our vector of best matches for each n-gram. 
53 | if match > best_match[i]: 54 | best_match[i] = match 55 | best_idx[i] = idx 56 | print("best_match array: " + str(best_match)) 57 | best_len = np.argmax(best_match) + 1 58 | best_match_idx = best_idx[best_len - 1] 59 | print("Best choice for number of tokens: " + str(best_len)) 60 | print( 61 | "Best Match Token Sequence: " 62 | + str(tokens[best_match_idx : best_match_idx + best_len]) 63 | ) 64 | 65 | scores = np.zeros(len(tokens)) 66 | 67 | # Make a list of all indices from ratioslist[np.argmax(best_match),:] which 68 | # have the best match. 69 | best_idx_list = [ 70 | i 71 | for i, value in enumerate(ratioslist[np.argmax(best_match), :]) 72 | if value == best_match[best_len - 1] 73 | ] 74 | print("Target Occurs at Indices: " + str(best_idx_list)) 75 | 76 | # For each of these indices in scores, set the following best_len tokens 77 | # equal to best_match. 78 | for a in best_idx_list: 79 | for i in range(best_len): 80 | scores[a + i] = best_match[best_len - 1] 81 | 82 | return scores 83 | 84 | 85 | def target_match(answer, tokens, target, max_n): 86 | print() 87 | print("target: " + target) 88 | print("answer: " + str(answer)) 89 | anstok = ( 90 | str(answer).lower().replace(" ", "") 91 | ) # Remove spaces and make the answer lower case 92 | tokens = [token.lower() for token in tokens] # lowercase all the tokens also 93 | 94 | if target == "gross_amount": 95 | 96 | scores = [] 97 | max_n = 1 98 | for token in tokens: 99 | if is_dollar_amount(anstok) and is_dollar_amount(token): 100 | try: 101 | scores.append( 102 | fuzz.ratio(normalize_dollars(anstok), normalize_dollars(token)) 103 | / 100.0 104 | ) 105 | except decimal.InvalidOperation: 106 | # not a number, maybe a date? 107 | scores.append(fuzz.ratio(anstok, token) / 100.0) 108 | else: 109 | scores.append(fuzz.ratio(anstok, token) / 100.0) 110 | 111 | else: 112 | scores = multi_token_target_match(answer, tokens, target, max_n, anstok) 113 | 114 | return scores 115 | 116 | 117 | def process_doc(slug, rows, max_n): 118 | print() 119 | print() 120 | print("--------------------------------") 121 | print(f"Processing {slug} with {len(rows)} tokens") 122 | global output_docs 123 | if len(rows) < 10: 124 | # probably needs OCR 125 | print(f"Skipping {slug} because it has only {len(rows)} tokens") 126 | return 127 | 128 | answers = filings.loc[filings["dc_slug"] == slug] 129 | if len(answers) != 1: 130 | print(f"Skipping {slug} because it matches {len(answers)} rows") 131 | return 132 | answers = answers.iloc[0] 133 | 134 | if answers[targets].isnull().any(): 135 | print( 136 | f"Skipping {slug} because it is missing answers for " 137 | f"{[t for t in targets if pd.isnull(answers[t])]}" 138 | ) 139 | return 140 | 141 | df = pd.DataFrame(rows) 142 | 143 | page = pd.to_numeric(df["page"]) 144 | maxpage = page.max() 145 | if maxpage: # avoid div/0 for one page docs 146 | df["page"] = page / maxpage # last page = 1.0 147 | 148 | for t in targets: 149 | df[t] = target_match( 150 | answers[t], df["token"].fillna(""), t, max_n 151 | ) # The value of the answer and an array of the tokens for that slug 152 | 153 | for _, row in df.iterrows(): 154 | outcsv.writerow(row.to_dict()) 155 | 156 | output_docs += 1 157 | 158 | 159 | # --- Main --- 160 | # Accumulate all rows with the same slug 161 | # active_rows = [] 162 | # active_slug = None 163 | # input_docs = 0 164 | # max_n = 5 165 | # for row in incsv: 166 | # if row["slug"] != active_slug: 167 | # if active_slug: 168 | # process_doc(active_slug, active_rows, max_n) 169 | # input_docs += 1 170 | 
# active_slug = row["slug"] 171 | # active_rows = [row] 172 | # else: 173 | # active_rows.append(row) 174 | 175 | # print(f"Input documents {input_docs}") 176 | # print(f"Output documents {output_docs}") 177 | 178 | 179 | # --- Main --- 180 | # Accumulate all rows with the same slug 181 | active_rows = [] 182 | # active_slug = None 183 | input_docs = 0 184 | max_n = 5 185 | # for row in incsv: 186 | # if row["slug"] != active_slug: 187 | # if active_slug: 188 | # process_doc(active_slug, active_rows) 189 | # input_docs += 1 190 | # active_slug = row["slug"] 191 | # active_rows = [row] 192 | # else: 193 | # active_rows.append(row) 194 | n = 0 195 | for slug, group in incsv.groupby("slug"): 196 | process_doc(slug, group, max_n) 197 | n += 1 198 | if n > 200: 199 | break 200 | # print(f"Input documents {input_docs}") 201 | # print(f"Output documents {output_docs}") 202 | -------------------------------------------------------------------------------- /deepform/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/project-deepform/deepform/e6e1ff5a78e49cbc3c0625e4373b5b26f669e79f/deepform/__init__.py -------------------------------------------------------------------------------- /deepform/artifacts.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import wandb 4 | 5 | from deepform.common import MODEL_DIR, WANDB_PROJECT 6 | 7 | if __name__ == "__main__": 8 | parser = argparse.ArgumentParser(description="download a model stored in W&B") 9 | parser.add_argument( 10 | "-v", 11 | "--version", 12 | dest="version", 13 | help="model version to download", 14 | default="latest", 15 | ) 16 | args = parser.parse_args() 17 | 18 | run = wandb.init( 19 | project="model-download", 20 | job_type="ps", 21 | allow_val_change=True, 22 | ) 23 | config = run.config 24 | model_name = config.model_artifact_name 25 | artifact_name = f"{WANDB_PROJECT}/{model_name}:{args.version}" 26 | artifact = run.use_artifact(artifact_name, type="model") 27 | artifact_alias = artifact.metadata.get("name") or "unknown" 28 | artifact.download(root=(MODEL_DIR / artifact_alias)) 29 | -------------------------------------------------------------------------------- /deepform/combine_manifests.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | from deepform.common import DATA_DIR 7 | 8 | if os.path.exists(DATA_DIR / "3_year_manifest.csv"): 9 | os.remove(DATA_DIR / "3_year_manifest.csv") 10 | 11 | 12 | df12 = pd.read_csv( 13 | DATA_DIR / "2012_manifest.tsv", sep="\t" 14 | ) # Formerly called ftf-all-filings.tsv 15 | df12.insert(0, "serial_num", np.nan) 16 | df12.insert(0, "flight_from", np.nan) 17 | df12.insert(0, "flight_to", np.nan) 18 | df12.insert(0, "issues", np.nan) 19 | df12_new = df12.filter( 20 | [ 21 | "dc_slug", 22 | "serial_num", 23 | "gross_amount", 24 | "committee", 25 | "flight_from", 26 | "flight_to", 27 | "url", 28 | "issues", 29 | ], 30 | axis=1, 31 | ) 32 | df12_new.insert(0, "year", "2012") 33 | df12_new.columns = [ 34 | "year", 35 | "file_id", 36 | "contract_num", 37 | "gross_amount", 38 | "advertiser", 39 | "flight_from", 40 | "flight_to", 41 | "url", 42 | "issues", 43 | ] 44 | 45 | df14 = pd.read_csv( 46 | DATA_DIR / "2014_manifest.tsv", sep="\t" 47 | ) # Formerly called 2014-orders.tsv 48 | df14.insert(0, "gross_amount", np.nan) 49 | df14.insert(0, "url", np.nan) 50 | 
df14.insert(0, "issues", np.nan) 51 | df14_new = df14.filter( 52 | [ 53 | "id", 54 | "order_revision", 55 | "gross_amount", 56 | "advertiser", 57 | "flight_from", 58 | "flight_to", 59 | "url", 60 | "issues", 61 | ], 62 | axis=1, 63 | ) 64 | df14_new.insert(0, "year", "2014") 65 | df14_new.columns = [ 66 | "year", 67 | "file_id", 68 | "contract_num", 69 | "gross_amount", 70 | "advertiser", 71 | "flight_from", 72 | "flight_to", 73 | "url", 74 | "issues", 75 | ] 76 | 77 | df20 = pd.read_csv( 78 | DATA_DIR / "2020_manifest.csv" 79 | ) # Formerly called fcc-data-2020-sample-updated.csv 80 | df20_new = df20.filter( 81 | [ 82 | "full_file_name", 83 | "serial_num", 84 | "gross_amount", 85 | "advertiser", 86 | "flight_from", 87 | "flight_to", 88 | "url", 89 | "Issues ('', Type, or Token)", 90 | ], 91 | axis=1, 92 | ) 93 | df20_new.insert(0, "year", "2020") 94 | df20_new.columns = [ 95 | "year", 96 | "file_id", 97 | "contract_num", 98 | "gross_amount", 99 | "advertiser", 100 | "flight_from", 101 | "flight_to", 102 | "url", 103 | "issues", 104 | ] 105 | 106 | df = pd.concat([df12_new, df14_new, df20_new]) 107 | 108 | # df.set_index(["year", "slug"]).count(level="year") 109 | 110 | df.to_csv(DATA_DIR / "3_year_manifest.csv", index=False) 111 | -------------------------------------------------------------------------------- /deepform/common.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | S3_BUCKET = "project-deepform" 4 | 5 | ROOT_DIR = Path(__file__).absolute().parents[1] 6 | DATA_DIR = ROOT_DIR / "data" 7 | LOG_DIR = ROOT_DIR / "logs" 8 | PDF_DIR = DATA_DIR / "pdfs" 9 | TOKEN_DIR = DATA_DIR / "tokenized" 10 | LABELED_DIR = DATA_DIR / "labeled" 11 | TRAINING_DIR = DATA_DIR / "training" 12 | TRAINING_INDEX = TRAINING_DIR.parent / "doc_index.parquet" 13 | MODEL_DIR = DATA_DIR / "models" 14 | 15 | WANDB_PROJECT = "deepform_v1" 16 | -------------------------------------------------------------------------------- /deepform/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/project-deepform/deepform/e6e1ff5a78e49cbc3c0625e4373b5b26f669e79f/deepform/data/__init__.py -------------------------------------------------------------------------------- /deepform/data/add_features.py: -------------------------------------------------------------------------------- 1 | """ 2 | Process a parquet of all training data to add labels and computed features. 3 | 4 | Final data is stored individually (per-document) to enable random access of 5 | small samples, with an index over all the documents. 
6 | """ 7 | 8 | import argparse 9 | from concurrent.futures import ThreadPoolExecutor, as_completed 10 | from enum import Enum, auto 11 | from pathlib import Path 12 | 13 | import numpy as np 14 | import pandas as pd 15 | import scipy.sparse as sparse 16 | from fuzzywuzzy import fuzz 17 | from tqdm import tqdm 18 | 19 | from deepform.common import DATA_DIR, TOKEN_DIR, TRAINING_DIR, TRAINING_INDEX 20 | from deepform.data.create_vocabulary import get_token_id 21 | from deepform.data.graph_geometry import document_edges 22 | from deepform.logger import logger 23 | from deepform.util import ( 24 | date_similarity, 25 | default_similarity, 26 | dollar_similarity, 27 | is_dollar_amount, 28 | log_dollar_amount, 29 | ) 30 | 31 | 32 | class TokenType(Enum): 33 | NONE = 0 34 | CONTRACT_NUM = auto() 35 | ADVERTISER = auto() 36 | FLIGHT_FROM = auto() 37 | FLIGHT_TO = auto() 38 | GROSS_AMOUNT = auto() 39 | 40 | 41 | LABEL_COLS = { 42 | # Each label column, and the match function that it uses. 43 | "contract_num": default_similarity, 44 | "advertiser": default_similarity, 45 | "flight_from": date_similarity, 46 | "flight_to": date_similarity, 47 | "gross_amount": dollar_similarity, 48 | } 49 | 50 | 51 | def extend_and_write_docs( 52 | source_dir, 53 | manifest, 54 | pq_index, 55 | out_path, 56 | max_token_count, 57 | use_adjacency_matrix=False, 58 | ): 59 | """Split data into individual documents, add features, and write to parquet.""" 60 | 61 | token_files = {p.stem: p for p in source_dir.glob("*.parquet")} 62 | 63 | jobqueue = [] 64 | for row in manifest.itertuples(): 65 | slug = row.file_id 66 | if slug not in token_files: 67 | logger.error(f"No token file for {slug}") 68 | continue 69 | labels = {} 70 | for label_col in LABEL_COLS: 71 | labels[label_col] = getattr(row, label_col) 72 | if not labels[label_col]: 73 | logger.warning(f"'{label_col}' for {slug} is empty") 74 | jobqueue.append( 75 | { 76 | "token_file": token_files[slug], 77 | "dest_file": out_path / f"{slug}.parquet", 78 | "graph_file": out_path / f"{slug}.graph", 79 | "labels": labels, 80 | "max_token_count": max_token_count, 81 | "use_adjacency_matrix": use_adjacency_matrix, 82 | } 83 | ) 84 | 85 | # Spin up a bunch of jobs to do the conversion 86 | with ThreadPoolExecutor() as executor: 87 | doc_jobs = [] 88 | for kwargs in jobqueue: 89 | doc_jobs.append(executor.submit(process_document_tokens, **kwargs)) 90 | 91 | logger.debug("Waiting for jobs to complete") 92 | progress = tqdm(as_completed(doc_jobs), total=len(doc_jobs)) 93 | doc_results = [j.result() for j in progress] 94 | 95 | logger.debug(f"Writing document index to {pq_index}...") 96 | doc_index = pd.DataFrame(doc_results).set_index("slug", drop=True) 97 | doc_index.to_parquet(pq_index) 98 | 99 | 100 | def pq_index_and_dir(pq_index, pq_path=None): 101 | """Get directory for sharded training data, creating if necessary.""" 102 | pq_index = Path(pq_index).resolve() 103 | if pq_path is None: 104 | pq_path = TRAINING_DIR 105 | else: 106 | pq_path = Path(pq_path) 107 | pq_index.parent.mkdir(parents=True, exist_ok=True) 108 | pq_path.mkdir(parents=True, exist_ok=True) 109 | return pq_index, pq_path 110 | 111 | 112 | def process_document_tokens( 113 | token_file, 114 | dest_file, 115 | graph_file, 116 | labels, 117 | max_token_count, 118 | use_adjacency_matrix=False, 119 | ): 120 | """Filter out short tokens, add computed features, and return index info.""" 121 | slug = token_file.stem 122 | tokens = pd.read_parquet(token_file).reset_index(drop=True) 123 | doc, adjacency, 
best_matches = compute_features( 124 | tokens, labels, max_token_count, use_adjacency_matrix=use_adjacency_matrix 125 | ) 126 | doc.to_parquet(dest_file, index=False) 127 | if adjacency is not None: 128 | write_adjacency(graph_file, adjacency) 129 | # Return the summary information about the document. 130 | return {"slug": slug, "length": len(doc), **labels, **best_matches} 131 | 132 | 133 | def compute_features(tokens, labels, max_token_count, use_adjacency_matrix=False): 134 | doc = label_tokens(tokens, labels, max_token_count) 135 | 136 | # Strip whitespace off all tokens. 137 | doc["token"] = doc.token.str.strip() 138 | 139 | # Remove tokens shorter than three characters. 140 | doc = doc[doc.token.str.len() >= 3] 141 | 142 | # Extend with the straightforward features. 143 | doc = add_base_features(doc) 144 | 145 | # Handle the features that need the whole document. 146 | doc["label"] = np.zeros(len(doc), dtype="u1") 147 | # The "label" column stores the TokenType that correctly labels this token. 148 | # By default this is 0, or "NONE". 149 | best_matches = {} 150 | for feature in LABEL_COLS: 151 | token_value = TokenType[feature.upper()].value 152 | max_score = doc[feature].max() 153 | best_matches[f"best_match_{feature}"] = max_score 154 | matches = token_value * np.isclose(doc[feature], max_score) 155 | doc["label"] = np.maximum(doc["label"], matches) 156 | 157 | adjacency = document_edges(doc) if use_adjacency_matrix else None 158 | return doc, adjacency, best_matches 159 | 160 | 161 | def write_adjacency(graph_file, adjacency): 162 | sparse.save_npz(f"{graph_file}.npz", adjacency) 163 | 164 | 165 | def read_adjacency(graph_file): 166 | return sparse.load_npz(f"{graph_file}.npz") 167 | 168 | 169 | def label_tokens(tokens, labels, max_token_count): 170 | for col_name, label_value in labels.items(): 171 | tokens[col_name] = 0.0 172 | match_fn = LABEL_COLS[col_name] 173 | 174 | if col_name == "advertiser": 175 | tokens[col_name] = label_multitoken( 176 | tokens.token.to_numpy(), label_value, max_token_count, match_fn 177 | ) 178 | else: 179 | tokens[col_name] = tokens.token.apply(match_fn, args=(label_value,)) 180 | 181 | return tokens 182 | 183 | 184 | def label_multitoken(tokens, value, token_count, match_fn=default_similarity): 185 | best_match_values = np.array([match_fn(value, x) for x in tokens]) 186 | for c in range(1, token_count): 187 | texts = [" ".join(tokens[i - c : i]) for i in range(c, tokens.size)] 188 | match_values = np.array([match_fn(value, x) for x in texts] + [0] * c) 189 | for p in range(c): 190 | best_match_values = np.maximum(best_match_values, np.roll(match_values, p)) 191 | return best_match_values 192 | 193 | 194 | def fraction_digits(s): 195 | """Return the fraction of a string that is composed of digits.""" 196 | return np.mean([c.isdigit() for c in s]) if isinstance(s, str) else 0.0 197 | 198 | 199 | def match_string(a, b): 200 | m = fuzz.ratio(a.lower(), b.lower()) / 100.0 201 | return m if m >= 0.9 else 0 202 | 203 | 204 | def add_base_features(token_df): 205 | """Extend a DataFrame with features that can be pre-computed.""" 206 | df = token_df.copy() 207 | df["tok_id"] = df["token"].apply(get_token_id).astype("u2") 208 | df["length"] = df["token"].str.len().astype("i2") 209 | df["digitness"] = df["token"].apply(fraction_digits).astype("f4") 210 | df["is_dollar"] = df["token"].apply(is_dollar_amount).astype("f4") 211 | df["log_amount"] = df["token"].apply(log_dollar_amount).fillna(0).astype("f4") 212 | 213 | return df 214 | 215 | 216 | if __name__ == 
"__main__": 217 | parser = argparse.ArgumentParser(description=__doc__) 218 | parser.add_argument( 219 | "manifest", 220 | help="CSV with labels for each document", 221 | default=DATA_DIR / "3_year_manifest.csv", 222 | ) 223 | parser.add_argument( 224 | "indir", 225 | nargs="?", 226 | default=TOKEN_DIR, 227 | help="directory of document tokens", 228 | ) 229 | parser.add_argument( 230 | "indexfile", 231 | nargs="?", 232 | default=TRAINING_INDEX, 233 | help="path to index of resulting parquet files", 234 | ) 235 | parser.add_argument( 236 | "outdir", 237 | nargs="?", 238 | default=TRAINING_DIR, 239 | help="directory of parquet files", 240 | ) 241 | parser.add_argument( 242 | "--max-token-count", 243 | type=int, 244 | default=5, 245 | help="maximum number of contiguous tokens to match against each label", 246 | ) 247 | parser.add_argument( 248 | "--compute-graph", dest="use_adjacency_matrix", action="store_true" 249 | ) 250 | parser.set_defaults(use_adjacency_matrix=False) 251 | 252 | parser.add_argument("--log-level", dest="log_level", default="INFO") 253 | args = parser.parse_args() 254 | logger.setLevel(args.log_level.upper()) 255 | 256 | logger.info(f"Reading {Path(args.manifest).resolve()}") 257 | manifest = pd.read_csv(args.manifest) 258 | 259 | indir, index, outdir = Path(args.indir), Path(args.indexfile), Path(args.outdir) 260 | index.parent.mkdir(parents=True, exist_ok=True) 261 | outdir.mkdir(parents=True, exist_ok=True) 262 | extend_and_write_docs( 263 | indir, 264 | manifest, 265 | index, 266 | outdir, 267 | args.max_token_count, 268 | use_adjacency_matrix=args.use_adjacency_matrix, 269 | ) 270 | -------------------------------------------------------------------------------- /deepform/data/create_vocabulary.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | from string import ascii_lowercase 3 | 4 | import pandas as pd 5 | 6 | from deepform.common import DATA_DIR, TOKEN_DIR 7 | 8 | VOCAB_FILE = DATA_DIR / "token_frequency.csv" 9 | 10 | 11 | def clean(token): 12 | """Convert to lowercase and strip out anything other than ascii letters.""" 13 | return "".join(c for c in token.casefold() if c in ascii_lowercase) 14 | 15 | 16 | def per_document_tokens(): 17 | """Generator that produces the unique set of tokens for each document.""" 18 | for doc in TOKEN_DIR.glob("*.parquet"): 19 | yield pd.read_parquet(doc, columns=["token"]).token.apply(clean).unique() 20 | 21 | 22 | def per_document_token_count(): 23 | counts = Counter() 24 | for tokens in per_document_tokens(): 25 | counts.update(tokens) 26 | return counts 27 | 28 | 29 | def create_frequency_file(): 30 | counts = per_document_token_count() 31 | counts_df = pd.DataFrame(counts.most_common(), columns=["token", "count"]) 32 | counts_df.to_csv(VOCAB_FILE) 33 | 34 | 35 | def token_frequencies(): 36 | if not VOCAB_FILE.is_file(): 37 | create_frequency_file() 38 | return pd.read_csv(VOCAB_FILE) 39 | 40 | 41 | class Vocabulary: 42 | def __init__(self): 43 | vocab = token_frequencies().token 44 | self.token_ids = {t: i + 1 for i, t in enumerate(vocab)} 45 | 46 | def __getitem__(self, token): 47 | # Unrecognized words are assigned to 0. 
48 | return self.token_ids.get(clean(token), 0) 49 | 50 | 51 | def get_token_id(token): 52 | global _vocabulary_singleton 53 | try: 54 | return _vocabulary_singleton[token] 55 | except NameError: 56 | _vocabulary_singleton = Vocabulary() 57 | return _vocabulary_singleton[token] 58 | 59 | 60 | if __name__ == "__main__": 61 | create_frequency_file() 62 | -------------------------------------------------------------------------------- /deepform/data/graph_geometry.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import numpy.ma as ma 3 | import scipy.sparse as sparse 4 | 5 | 6 | def document_edges(tokens, relative_tolerance=0.01): 7 | """""" 8 | N = len(tokens) 9 | 10 | # For now we compute alignment of text tokens based on their lower left corner. 11 | dX = np.subtract.outer(tokens["x0"].to_numpy(), tokens["x0"].to_numpy()) 12 | dY = np.subtract.outer(tokens["y1"].to_numpy(), tokens["y1"].to_numpy()) 13 | page_mask = np.not_equal.outer(tokens["page"].to_numpy(), tokens["page"].to_numpy()) 14 | 15 | D = np.abs(dX) + np.abs(dY) 16 | V_sim = dY / D 17 | H_sim = dX / D 18 | 19 | dX_h_aligned = ma.masked_where( 20 | np.logical_or( 21 | page_mask, 22 | np.logical_not(np.isclose(np.abs(H_sim), 1, rtol=relative_tolerance)), 23 | ), 24 | dX, 25 | ) 26 | dY_v_aligned = ma.masked_where( 27 | np.logical_or( 28 | page_mask, 29 | np.logical_not(np.isclose(np.abs(V_sim), 1, rtol=relative_tolerance)), 30 | ), 31 | dY, 32 | ) 33 | 34 | test_right = ma.masked_where(np.greater(dX_h_aligned, 0), dX_h_aligned) 35 | test_bottom = ma.masked_where(np.greater(dY_v_aligned, 0), dY_v_aligned) 36 | 37 | right_max = np.argmax(test_right, axis=0) 38 | bottom_max = np.argmax(test_bottom, axis=0) 39 | 40 | adjacency = sparse.lil_matrix((N, N), dtype=np.bool_) 41 | 42 | for i in range(len(tokens)): 43 | if dX_h_aligned[i, right_max[i]]: 44 | adjacency[i, right_max[i]] = True 45 | adjacency[right_max[i], i] = True 46 | if dY_v_aligned[i, bottom_max[i]]: 47 | adjacency[i, bottom_max[i]] = True 48 | adjacency[bottom_max[i], i] = True 49 | 50 | return adjacency.tocoo() 51 | -------------------------------------------------------------------------------- /deepform/data/tokenize_pdfs.py: -------------------------------------------------------------------------------- 1 | """Create token data for each of the pdfs (or directories of pdfs) passed in.""" 2 | 3 | 4 | import argparse 5 | from concurrent.futures import ThreadPoolExecutor 6 | from pathlib import Path 7 | 8 | import numpy as np 9 | import pandas as pd 10 | import pdfplumber 11 | from tqdm import tqdm 12 | 13 | from deepform.common import PDF_DIR, TOKEN_DIR 14 | from deepform.data.add_features import add_base_features 15 | from deepform.document import FEATURE_COLS, Document 16 | from deepform.logger import logger 17 | from deepform.pdfs import get_pdf_path 18 | 19 | 20 | def tokenize_pdf(pdf_path): 21 | """Return a DataFrame of document token data for a pdf at the input path.""" 22 | pages = [] 23 | for i, page in enumerate(pdfplumber.open(pdf_path).pages): 24 | df = pd.DataFrame(page.extract_words()) 25 | if df.empty: 26 | continue 27 | df["page"] = i 28 | df["page"] = df["page"].astype("i2") 29 | df["x0"] = df["x0"].astype("f4") 30 | df["y0"] = df["top"].astype("f4") 31 | df["x1"] = df["x1"].astype("f4") 32 | df["y1"] = df["bottom"].astype("f4") 33 | df["token"] = df["text"].astype("string") 34 | pages.append(df[["page", "x0", "y0", "x1", "y1", "token"]]) 35 | if not pages: 36 | raise EOFError(f"No tokens 
found in {pdf_path}") 37 | return pd.concat(pages).reset_index(drop=True) 38 | 39 | 40 | def create_token_doc(pdf_path, token_dir=TOKEN_DIR, overwrite=False): 41 | pdf_path, token_dir = Path(pdf_path), Path(token_dir) 42 | assert pdf_path.is_file() and pdf_path.suffix == ".pdf" 43 | 44 | slug = pdf_path.stem 45 | token_path = token_dir / f"{slug}.parquet" 46 | if token_path.is_file(): 47 | if overwrite: 48 | logger.warning(f"Overwriting {token_path}") 49 | else: 50 | return 51 | 52 | try: 53 | tokens = tokenize_pdf(pdf_path) 54 | except EOFError: 55 | logger.warning(f"pdfplumber found no tokens in '{pdf_path}'") 56 | return 57 | except Exception as e: 58 | logger.error(f"Unable to tokenize {pdf_path}: {e}") 59 | return 60 | 61 | token_dir.mkdir(parents=True, exist_ok=True) 62 | tokens.to_parquet(token_path) 63 | return token_path 64 | 65 | 66 | def pdf_paths(*paths): 67 | for path in paths: 68 | path = Path(path) 69 | if path.is_file(): 70 | if path.suffix != ".pdf": 71 | logger.warning(f"Skipping non-pdf '{path}'") 72 | continue 73 | yield path 74 | elif path.is_dir(): 75 | for file_path in path.glob("*.pdf"): 76 | yield file_path 77 | else: 78 | logger.warning(f"'{path}' is not a file or directory") 79 | 80 | 81 | def create_token_docs_from_pdfs(*paths, overwrite=False): 82 | 83 | with ThreadPoolExecutor() as executor: 84 | pdf_files = list(pdf_paths(*paths)) 85 | print(f"Tokenizing {len(pdf_files):,} pdfs...") 86 | results = list( 87 | tqdm(executor.map(create_token_doc, pdf_files), total=len(pdf_files)) 88 | ) 89 | 90 | tokenized = [p for p in results if p] 91 | print(f"Tokenized {len(tokenized)} documents.") 92 | return tokenized 93 | 94 | 95 | def create_token_docs_from_slugs(slugs, token_dir=TOKEN_DIR): 96 | def tokenize(slug): 97 | pdf_file = get_pdf_path(slug) 98 | return create_token_doc(pdf_file, token_dir=token_dir) 99 | 100 | with ThreadPoolExecutor() as executor: 101 | print(f"Acquiring and tokenizing {len(slugs):,} documents...") 102 | results = list(tqdm(executor.map(tokenize, slugs), total=len(slugs))) 103 | 104 | tokenized = [p for p in results if p] 105 | print(f"Tokenized {len(tokenized)} documents.") 106 | return tokenized 107 | 108 | 109 | def extract_doc(pdf_path, window_len): 110 | """Create a Document with features extracted from a pdf.""" 111 | pdf_path = Path(pdf_path) 112 | tokens = tokenize_pdf(pdf_path) 113 | # Remove tokens shorter than three characters. 114 | df = tokens[tokens["token"].str.len() >= 3] 115 | df = add_base_features(df) 116 | df["tok_id"] = np.minimum(511, df["tok_id"]) 117 | return Document( 118 | slug=pdf_path.stem, 119 | tokens=df, 120 | features=df[FEATURE_COLS].to_numpy(dtype=float), 121 | labels=np.zeros(len(df), dtype=bool), # Dummy. 122 | positive_windows=np.array(0), # Dummy. 
123 | window_len=window_len, 124 | label_values={}, 125 | ) 126 | 127 | 128 | if __name__ == "__main__": 129 | parser = argparse.ArgumentParser(description=__doc__) 130 | parser.add_argument( 131 | "-f", 132 | "--force", 133 | type=bool, 134 | default=False, 135 | help="overwrite existing token files", 136 | ) 137 | parser.add_argument( 138 | "pdf", 139 | nargs="?", 140 | default=PDF_DIR, 141 | help="pdf or directory of pdfs to process", 142 | ) 143 | parser.add_argument("--log-level", dest="log_level", default="ERROR") 144 | args = parser.parse_args() 145 | logger.setLevel(args.log_level.upper()) 146 | 147 | create_token_docs_from_pdfs(args.pdf, overwrite=args.force) 148 | -------------------------------------------------------------------------------- /deepform/db/.env: -------------------------------------------------------------------------------- 1 | MYSQL_ROOT_PASSWORD=changeme 2 | -------------------------------------------------------------------------------- /deepform/db/README.md: -------------------------------------------------------------------------------- 1 | # Database 2 | 3 | Tokenized data is stored in a Mariadb database. To run Mariadb locally: 4 | 5 | ## Setup 6 | 7 | Although we're running Mariadb in a Docker container, you'll probably want the MySQL command line utilities. If you don't already have these, you can install them with `brew install mysql` in OS X. 8 | 9 | To run the Docker container, run the following command optionally changing the password set in `.env`. 10 | 11 | ``` 12 | docker run --name mariadb -v data:/var/lib/mysql -v conf:/etc/mysql/conf.d --env-file .env -p=3306:3306 -d mariadb:10.5.1 13 | ``` 14 | 15 | The data loading scripts are useful for loading the example data into the database and assume execution from this directory. The scripts also assume the existence of the files `source/ftf-all-filings.tsv` and `data/training.csv` in this repository. 16 | 17 | ``` 18 | mysql -uroot -p --protocol tcp < scripts/create_schema.sql 19 | mysql -uroot -p --protocol tcp deepform < scripts/load_document_data.sql 20 | mysql -uroot -p --protocol tcp deepform < scripts/load_token_data.sql 21 | ``` 22 | 23 | ## Further notes 24 | 25 | When running a Mariadb database in Docker, you'll need to specify the protocol to use when interacting with the database like so: 26 | 27 | ``` 28 | mysql -uroot -p -e "SHOW CREATE DATABASE deepform;" --protocol tcp deepform 29 | ``` 30 | 31 | The `mysql` command defaults to using unix file sockets if no protocol is specified, and won't connect to the database. 
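
Once the schema and data are loaded, documents can also be read from Python via `deepform/db/source.py`. A minimal sketch, assuming the default credentials from `.env`:

```
from deepform.db.source import connection, input_docs

conn = connection("root", "changeme")  # password set in .env
for dc_slug, committee, gross_amount_usd, tokens in input_docs(conn, max_docs=2):
    print(dc_slug, committee, gross_amount_usd, len(tokens))
```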
32 | -------------------------------------------------------------------------------- /deepform/db/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/project-deepform/deepform/e6e1ff5a78e49cbc3c0625e4373b5b26f669e79f/deepform/db/__init__.py -------------------------------------------------------------------------------- /deepform/db/conf/config-file.cnf: -------------------------------------------------------------------------------- 1 | bind-address=0.0.0.0 2 | -------------------------------------------------------------------------------- /deepform/db/scripts/create_schema.sql: -------------------------------------------------------------------------------- 1 | CREATE DATABASE `deepform`; 2 | 3 | USE `deepform`; 4 | 5 | CREATE TABLE `document` ( 6 | `id` int(11) unsigned NOT NULL AUTO_INCREMENT, 7 | `dc_slug` varchar(256) DEFAULT NULL, 8 | `filing_type` varchar(256) DEFAULT NULL, 9 | `contract_number` int(11) DEFAULT NULL, 10 | `url` varchar(256) DEFAULT NULL, 11 | `committee` varchar(256) DEFAULT NULL, 12 | `agency` varchar(256) DEFAULT NULL, 13 | `callsign` varchar(10) DEFAULT NULL, 14 | `thumbnail_url` varchar(256) DEFAULT NULL, 15 | `market_id` int(11) DEFAULT NULL, 16 | `upload_date` datetime DEFAULT NULL, 17 | `gross_amount_usd` double DEFAULT NULL, 18 | PRIMARY KEY (`id`), 19 | UNIQUE KEY `uniq_dc_slug` (`dc_slug`) 20 | ) ENGINE=InnoDB AUTO_INCREMENT=68175 DEFAULT CHARSET=latin1; 21 | 22 | CREATE TABLE `token` ( 23 | `id` int(11) unsigned NOT NULL AUTO_INCREMENT, 24 | `dc_slug` varchar(256) DEFAULT NULL, 25 | `page` float DEFAULT NULL, 26 | `x0` double DEFAULT NULL, 27 | `y0` double DEFAULT NULL, 28 | `x1` double DEFAULT NULL, 29 | `y1` double DEFAULT NULL, 30 | `token` varchar(256) DEFAULT NULL, 31 | `gross_amount` double DEFAULT NULL, 32 | PRIMARY KEY (`id`), 33 | KEY `origin_document` (`dc_slug`) 34 | ) ENGINE=InnoDB AUTO_INCREMENT=14024491 DEFAULT CHARSET=latin1; 35 | -------------------------------------------------------------------------------- /deepform/db/scripts/load_document_data.sql: -------------------------------------------------------------------------------- 1 | LOAD DATA LOCAL INFILE '../source/ftf-all-filings.tsv' 2 | INTO TABLE document 3 | COLUMNS TERMINATED BY '\t' 4 | IGNORE 1 LINES 5 | (id, filing_type, contract_number, url, committee, agency, callsign, dc_slug, thumbnail_url, gross_amount_usd, market_id, upload_date); 6 | -------------------------------------------------------------------------------- /deepform/db/scripts/load_token_data.sql: -------------------------------------------------------------------------------- 1 | ALTER TABLE token DISABLE KEYS; 2 | BEGIN; 3 | LOAD DATA LOCAL INFILE '../data/training.csv' 4 | INTO TABLE token 5 | COLUMNS TERMINATED BY ',' 6 | IGNORE 1 LINES 7 | (dc_slug,page,x0,y0,x1,y1,token,gross_amount); 8 | COMMIT; 9 | ALTER TABLE token ENABLE KEYS; 10 | -------------------------------------------------------------------------------- /deepform/db/source.py: -------------------------------------------------------------------------------- 1 | import string 2 | 3 | import pandas as pd 4 | from sqlalchemy import create_engine 5 | 6 | charset = string.printable + "\t\n\x00" 7 | 8 | 9 | def connection(user, password, host="127.0.0.1", port=3306, dbname="deepform"): 10 | engine = create_engine( 11 | f"mysql+mysqldb://{user}:{password}@{host}:{port}/{dbname}", pool_recycle=3600 12 | ) 13 | return engine.connect() 14 | 15 | 16 | def clean_text(text): 17 | 
def clean_char(c): 18 | if c in charset: 19 | return c 20 | else: 21 | return "\x00" 22 | 23 | return [clean_char(x) for x in text] 24 | 25 | 26 | def input_generator(conn, max_docs=10, truncate_length=3000): 27 | documents = pd.read_sql( 28 | f"select * from document " 29 | f"where committee != '' order by rand() limit {max_docs};", 30 | conn, 31 | ) 32 | for document in documents.itertuples(): 33 | doc_id = document.dc_slug 34 | tokens = pd.read_sql(f"select * from token where dc_slug = '{doc_id}';", conn) 35 | text = " ".join([str(token) for token in tokens["token"]]) 36 | # yield clean_text(text), clean_text(document.committee) 37 | yield text, document.committee 38 | 39 | 40 | def input_docs(conn, max_docs=10, minimum_doc_length=30): 41 | try: 42 | emitted_docs = 0 43 | raw_conn = conn.engine.raw_connection() 44 | cursor = raw_conn.cursor() 45 | cursor.execute( 46 | "select dc_slug, committee, gross_amount_usd from document where committee \ 47 | != '' order by rand()" 48 | ) 49 | while emitted_docs < max_docs: 50 | doc = cursor.fetchone() 51 | if doc: 52 | dc_slug, committee, gross_amount_usd = (doc[0], doc[1], doc[2]) 53 | rows = pd.read_sql( 54 | f"select * from token where dc_slug = '{dc_slug}';", conn 55 | ) 56 | if len(rows) < minimum_doc_length: 57 | continue 58 | else: 59 | yield dc_slug, committee, gross_amount_usd, rows 60 | emitted_docs += 1 61 | else: 62 | break 63 | finally: 64 | conn.close() 65 | 66 | 67 | if __name__ == "__main__": 68 | conn = connection("root", "changeme") 69 | docs = input_docs(conn) 70 | for doc in docs: 71 | print(doc) 72 | print("*****") 73 | -------------------------------------------------------------------------------- /deepform/document.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from dataclasses import dataclass 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import scipy.sparse as sparse 8 | 9 | from deepform.data.add_features import TokenType, read_adjacency 10 | from deepform.features import fix_dtypes 11 | from deepform.util import any_match, pad_sparse_matrix 12 | 13 | FEATURE_COLS = [ 14 | "tok_id", 15 | "page", 16 | "x0", 17 | "y0", 18 | "length", 19 | "digitness", 20 | "is_dollar", 21 | "log_amount", 22 | ] 23 | NUM_FEATURES = len(FEATURE_COLS) 24 | 25 | TOKEN_COLS = [ 26 | "token", 27 | "x0", 28 | "y0", 29 | "x1", 30 | "y1", 31 | "page", 32 | # The following are "match %" for the known fields 33 | "contract_num", 34 | "advertiser", 35 | "flight_from", 36 | "flight_to", 37 | "gross_amount", 38 | ] 39 | 40 | 41 | # This sets which field the model is looking for. 42 | SINGLE_CLASS_PREDICTION = "gross_amount" 43 | 44 | 45 | @dataclass 46 | class Window: 47 | """A Window just holds views to the arrays held by a Document.""" 48 | 49 | tokens: pd.DataFrame 50 | features: np.ndarray 51 | labels: np.ndarray 52 | 53 | def __len__(self): 54 | return len(self.labels) 55 | 56 | 57 | @dataclass(frozen=True) 58 | class Document: 59 | slug: str 60 | # tokens, features, and labels are all aligned with the same indices. 61 | tokens: pd.DataFrame 62 | features: np.ndarray 63 | labels: np.ndarray 64 | # positive_windows is a list of which (starting) indices have a match. 
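    # That is, the starting indices i for which tokens[i : i + window_len] contains at least one labeled token.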
65 | positive_windows: np.ndarray 66 | window_len: int 67 | label_values: dict[str, str] 68 | adjacency_matrix: sparse.coo_matrix 69 | 70 | def random_window(self, require_positive=False): 71 | if require_positive and len(self.positive_windows): 72 | index = np.random.choice(self.positive_windows) 73 | else: 74 | index = np.random.randint(len(self)) 75 | return self[index] 76 | 77 | def __getitem__(self, n): 78 | """Return the `n`th window in the document.""" 79 | k = n + self.window_len 80 | return Window(self.tokens.iloc[n:k], self.features[n:k], self.labels[n:k]) 81 | 82 | def __len__(self): 83 | """Return the number of windows in the document. 84 | 85 | Note that unless window_len=1, this is less than the number of tokens. 86 | """ 87 | return len(self.labels) - self.window_len + 1 88 | 89 | def __iter__(self): 90 | """Iterate over all windows in the document in order.""" 91 | for i in range(len(self)): 92 | yield self[i] 93 | 94 | def predict_scores(self, model): 95 | """Use a model to predict labels for each of the document tokens.""" 96 | windowed_features = np.stack([window.features for window in self]) 97 | window_scores = model.predict(windowed_features) 98 | 99 | num_windows = len(self.labels) 100 | scores = np.zeros((num_windows, len(TokenType))) 101 | for i, window_score in enumerate(window_scores): 102 | scores[i : i + self.window_len, :] += window_score / self.window_len 103 | 104 | return scores 105 | 106 | def predict_answer(self, model, threshold): 107 | """Score each token and return all texts that exceed the threshold.""" 108 | # The first score column is how "irrelevant" a token is, so drop it. 109 | scores = self.predict_scores(model)[:, 1:] 110 | 111 | score_texts, individual_scores = [], [] 112 | for column in scores.T: 113 | text, score = best_token(column, self.tokens.token, threshold) 114 | score_texts.append(text) 115 | individual_scores.append(score) 116 | 117 | return score_texts, individual_scores, scores 118 | 119 | def show_predictions(self, pred_texts, pred_scores, scores): 120 | """Predict token scores and print them alongside the tokens and true labels.""" 121 | title = f"======={self.slug}=======" 122 | predicted = "field (predicted / actual ):\n" 123 | 124 | df = pd.DataFrame({"token": self.tokens.token.str.slice(0, 20)}) 125 | df["label"] = [TokenType(x).name if x else "" for x in self.labels] 126 | 127 | for i, item in enumerate(self.label_values.items()): 128 | name, value = item 129 | x = "✔️" if any_match(pred_texts[i], value) else "❌" 130 | predicted += f"\t{x}{name}: {pred_texts[i]} / {value} <{pred_scores[i]}>\n" 131 | df[name] = [f"{'*' if s > 0.5 else ''} {s:0.5f}" for s in scores[:, i]] 132 | 133 | df = df.iloc[self.window_len - 1 : 1 - self.window_len] 134 | return "\n".join([title, predicted, df.to_string()]) 135 | 136 | @staticmethod 137 | def from_parquet(slug, label_values, pq_path, graph_path, config): 138 | """Load precomputed features from a parquet file and apply a config.""" 139 | df = pd.read_parquet(pq_path) 140 | 141 | df["tok_id"] = ( 142 | np.minimum(df["tok_id"], config.vocab_size - 1) * config.use_string 143 | ) 144 | df["page"] *= config.use_page 145 | df["x0"] *= config.use_geom 146 | df["y0"] *= config.use_geom 147 | df["log_amount"] *= config.use_amount 148 | 149 | adjacency = read_adjacency(graph_path) if config.use_adjacency_matrix else None 150 | 151 | if config.pad_windows: 152 | df = pad_df(df, config.window_len - 1) 153 | if adjacency is not None: 154 | adjacency = pad_adjacency(adjacency, config.window_len - 1) 
155 | fix_dtypes(df) 156 | 157 | # Pre-compute which windows have the desired token. 158 | positive_windows = [] 159 | for i in range(len(df) - config.window_len): 160 | if df["label"].iloc[i : i + config.window_len].any(): 161 | positive_windows.append(i) 162 | 163 | # We're no longer requiring that there exists a correct answer. 164 | # assert len(positive_windows) > 0 165 | 166 | return Document( 167 | slug=slug, 168 | tokens=df[TOKEN_COLS], 169 | features=df[FEATURE_COLS].to_numpy(dtype=float), 170 | labels=df["label"].to_numpy(dtype=int), 171 | positive_windows=np.array(positive_windows), 172 | window_len=config.window_len, 173 | label_values=label_values, 174 | adjacency_matrix=adjacency, 175 | ) 176 | 177 | 178 | def pad_df(df, num_rows): 179 | """Add `num_rows` NaNs to the start and end of a DataFrame.""" 180 | if num_rows: 181 | zeros = pd.DataFrame(index=pd.RangeIndex(num_rows)) 182 | return pd.concat([zeros, df, zeros]).reset_index(drop=True) 183 | else: 184 | return df 185 | 186 | 187 | def pad_adjacency(adjacency, num_rows): 188 | """Add blank rows to the square adjacency matrix""" 189 | if num_rows: 190 | return pad_sparse_matrix(adjacency, num_rows, num_rows) 191 | else: 192 | return adjacency 193 | 194 | 195 | def actual_value(df, value_col, match_col): 196 | """Return the best value from `value_col`, as evaluated by `match_col`.""" 197 | index = df[match_col].argmax() 198 | return df.iloc[index][value_col] 199 | 200 | 201 | def best_token(scores, tokens, threshold): 202 | # All runs of tokens where each token meets the threshold. 203 | options = list(selected_tokens(scores, tokens, threshold)) 204 | if options: 205 | # Take the text with the highest score. 206 | score, text = list(sorted(options, key=lambda t: t[0] * len(t[1])))[-1] 207 | else: 208 | # No sequence meets the threshold, so choose the best single token. 
209 | text = tokens[np.argmax(scores)] 210 | score = np.max(scores) 211 | return text, score 212 | 213 | 214 | def selected_tokens(scores, tokens, threshold): 215 | """Yield all consecutive runs of tokens where each token exceeds the threshold.""" 216 | current_strings, current_score, count = [], 0, 0 217 | for s, t in zip(scores, tokens): 218 | if s > threshold: 219 | current_strings.append(t) 220 | current_score += s 221 | count += 1 222 | elif count > 0: 223 | yield current_score / count, " ".join(current_strings) 224 | current_strings, current_score, count = [], 0, 0 225 | if count > 0: 226 | yield current_score / count, " ".join(current_strings) 227 | -------------------------------------------------------------------------------- /deepform/document_store.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import random 3 | from dataclasses import dataclass 4 | from pathlib import Path 5 | 6 | import pandas as pd 7 | from joblib import dump, load 8 | from tqdm import tqdm 9 | 10 | from deepform.data.add_features import LABEL_COLS, pq_index_and_dir 11 | from deepform.document import Document 12 | from deepform.logger import logger 13 | 14 | 15 | @dataclass(frozen=True) 16 | class DocumentStore: 17 | documents: list 18 | 19 | def __len__(self): 20 | return len(self.documents) 21 | 22 | def __iter__(self): 23 | for doc in self.documents: 24 | yield doc 25 | 26 | def __getitem__(self, n): 27 | """Return the pre-processed tokens for a specified document.""" 28 | return self.documents[n] 29 | 30 | def random_document(self): 31 | return random.choice(self.documents) 32 | 33 | def sample(self, n=None): 34 | if n is None: 35 | n = len(self) 36 | return DocumentStore(random.sample(self.documents, k=n)) 37 | 38 | def split(self, val_percent=0.2): 39 | """Divide into two DocumentStores, e.g. a training and a validation set.""" 40 | docs_copy = copy.deepcopy(self.documents) 41 | random.shuffle(docs_copy) 42 | split_index = int(val_percent * len(self)) 43 | return DocumentStore(docs_copy[:split_index]), DocumentStore( 44 | docs_copy[split_index:] 45 | ) 46 | 47 | @staticmethod 48 | def open(index_file, config): 49 | """Load the documents referenced by `index_file` and apply `config`.""" 50 | index_file = Path(index_file) 51 | doc_index = pd.read_parquet(index_file) 52 | logger.info(f"{len(doc_index)} documents in index") 53 | 54 | if not config.pad_windows: 55 | # Filter out documents that are too short for the curent config. 56 | doc_index = doc_index[doc_index["length"] >= config.window_len] 57 | 58 | # Filter out documents that don't have a sufficiently high match. 59 | # doc_index = doc_index[doc_index["best_match"] >= config.target_thresh] 60 | logger.info(f"After applying config {len(doc_index)} documents are available") 61 | 62 | # Sample down to no more than the requested number of documents. 63 | num_docs = min(config.len_train, len(doc_index)) 64 | doc_index = doc_index.sample(n=num_docs) 65 | 66 | # Load each of the documents, finishing any necessary feature computation. 
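        # caching_doc_getter returns a closure that builds a Document from
        # "{slug}.parquet" (plus "{slug}.graph" when the adjacency matrix is
        # used) and, if config.use_data_cache is set, stores the result as a
        # joblib file so later runs can skip recomputing it.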
67 | slug_to_doc = caching_doc_getter(index_file, config) 68 | # docs = concurrent.thread_map(slug_to_doc, doc_index["slug"]) 69 | 70 | labels = doc_index[LABEL_COLS.keys()] 71 | docs = [ 72 | slug_to_doc(slug, labels.loc[slug]) 73 | for slug in tqdm(doc_index.index, desc="Creating docs") 74 | ] 75 | docs = [d for d in docs if d != None] # noqa: E711 76 | 77 | return DocumentStore(docs) 78 | 79 | 80 | def caching_doc_getter(index_file, config): 81 | _, pq_root = pq_index_and_dir(index_file) 82 | if config.use_data_cache: 83 | cache_root = pq_root.parent / "cache" / cache_master_key(config) 84 | cache_root.mkdir(parents=True, exist_ok=True) 85 | 86 | def slug_to_doc(slug, labels): 87 | pq_path = pq_root / f"{slug}.parquet" 88 | graph_path = pq_root / f"{slug}.graph" 89 | if config.use_data_cache: 90 | cache_path = cache_root / f"{slug}.joblib" 91 | try: 92 | with open(cache_path, "rb") as infile: 93 | return load(infile) 94 | except FileNotFoundError: 95 | logger.debug(f"Cache file {cache_path} not found") 96 | try: 97 | doc = Document.from_parquet(slug, labels, pq_path, graph_path, config) 98 | except AssertionError: 99 | logger.warning(f"No correct answers for {slug}, skipping") 100 | return None 101 | if config.use_data_cache: 102 | with open(cache_path, "wb") as outfile: 103 | dump(doc, outfile) 104 | logger.debug(f"Wrote document to cache file {cache_path}") 105 | return doc 106 | 107 | return slug_to_doc 108 | 109 | 110 | def cache_master_key(config): 111 | """Create a string determined by any cache-invalidating config elements.""" 112 | return ( 113 | "str{use_string}_" 114 | "vocab{vocab_size}_" 115 | "pg{use_page}_" 116 | "geom{use_geom}_" 117 | "amt{use_amount}_" 118 | "pad{pad_windows}_" 119 | "len{window_len}" 120 | ).format(**config) 121 | -------------------------------------------------------------------------------- /deepform/features.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | STRING_COLS = ["slug", "token"] 4 | INT_COLS = ["tok_id", "length", "label"] 5 | FLOAT_COLS = [ 6 | "page", 7 | "x0", 8 | "y0", 9 | "x1", 10 | "y1", 11 | "gross_amount", 12 | "match", 13 | "digitness", 14 | "log_amount", 15 | ] 16 | BOOL_COLS = ["is_dollar"] 17 | 18 | 19 | def fix_type(df, col, na_value, dtype, downcast=False): 20 | if col not in df.columns: 21 | return 22 | df[col] = df[col].fillna(na_value).astype(dtype) 23 | if downcast: 24 | try: 25 | df[col] = pd.to_numeric(df[col], downcast=dtype) 26 | except ValueError: 27 | print(f"Unable to downcast column {col} as {dtype}") 28 | print(df[col]) 29 | 30 | 31 | def fix_dtypes(df): 32 | # Use new-style Pandas string types. 
33 | for col in STRING_COLS: 34 | fix_type(df, col, na_value="", dtype="string") 35 | 36 | for col in BOOL_COLS: 37 | fix_type(df, col, na_value=0, dtype="bool") 38 | 39 | for col in INT_COLS: 40 | fix_type(df, col, na_value=0, dtype="int") 41 | 42 | for col in FLOAT_COLS: 43 | fix_type(df, col, na_value=0.0, dtype="float", downcast=True) 44 | -------------------------------------------------------------------------------- /deepform/infer.py: -------------------------------------------------------------------------------- 1 | """Use a model to infer predicted values for a document.""" 2 | 3 | 4 | import argparse 5 | from concurrent.futures import ThreadPoolExecutor 6 | 7 | import numpy as np 8 | import pandas as pd 9 | 10 | from deepform.common import DATA_DIR, TOKEN_DIR 11 | from deepform.data.add_features import TokenType, add_base_features, as_completed 12 | from deepform.data.tokenize_pdfs import extract_doc 13 | from deepform.document import FEATURE_COLS, Document, pad_df 14 | from deepform.features import fix_dtypes 15 | from deepform.model import load_model 16 | 17 | 18 | def infer_from_pdf(pdf_path, model=None, window_len=None): 19 | """Extract features from a PDF and run infrence on it.""" 20 | if not model: 21 | model, window_len = load_model() 22 | if not window_len: 23 | raise Exception("No window_len param provided or inferrable") 24 | 25 | doc = extract_doc(pdf_path, window_len) 26 | 27 | best_score_texts, individual_scores, _ = doc.predict_answer(model) 28 | 29 | # TODO: clean up the column name from the token type enum 30 | predictions = { 31 | str(column.name.lower()): {"prediction": text, "score": score} 32 | for text, score, column in zip( 33 | best_score_texts, individual_scores, np.array(TokenType)[1:] 34 | ) 35 | } 36 | 37 | return predictions 38 | 39 | 40 | def predict(token_file, model, window_len): 41 | slug = token_file.stem 42 | doc = tokens_to_doc(token_file, window_len) 43 | 44 | predict_texts, predict_scores, _ = doc.predict_answer(model, 0.5) 45 | fields = [tt.name.lower() for tt in TokenType if tt.value > 0] 46 | predictions = [] 47 | for field, text, score in zip(fields, predict_texts, predict_scores): 48 | predictions.append({"slug": slug, "field": field, "text": text, "score": score}) 49 | return pd.DataFrame(predictions) 50 | 51 | 52 | def predict_many(token_files, model_file): 53 | model, window_len = load_model(args.model) 54 | return pd.concat(predict(t, model, window_len) for t in token_files) 55 | 56 | 57 | def tokens_to_doc(token_file, window_len=25): 58 | """Create a Document with features extracted from a pdf.""" 59 | tokens = pd.read_parquet(token_file) 60 | # Remove tokens shorter than three characters. 61 | df = tokens[tokens["token"].str.len() >= 3] 62 | df = add_base_features(df) 63 | df["tok_id"] = np.minimum(511, df["tok_id"]) 64 | df = pad_df(df, window_len - 1) 65 | fix_dtypes(df) 66 | return Document( 67 | slug=token_file.stem, 68 | tokens=df, 69 | features=df[FEATURE_COLS].to_numpy(dtype=float), 70 | labels=np.zeros(len(df), dtype=bool), # Dummy. 71 | positive_windows=np.array(0), # Dummy. 
72 | window_len=window_len, 73 | label_values={}, 74 | ) 75 | 76 | 77 | if __name__ == "__main__": 78 | parser = argparse.ArgumentParser(description=__doc__) 79 | parser.add_argument( 80 | "-m", "--model", dest="model", help="model file to use in prediction" 81 | ) 82 | args = parser.parse_args() 83 | 84 | manifest = pd.read_csv(DATA_DIR / "fcc-data-2020-labeled-manifest.csv") 85 | slugs = set(manifest.file_id) 86 | token_files = [t for t in TOKEN_DIR.glob("*.parquet") if t.stem in slugs] 87 | token_files.sort() 88 | 89 | # Spin up a bunch of jobs to do the conversion 90 | with ThreadPoolExecutor() as executor: 91 | doc_jobs = [] 92 | for i in range(0, len(token_files), 100): 93 | batch = token_files[i : i + 100] 94 | doc_jobs.append(executor.submit(predict_many, batch, args.model)) 95 | 96 | doc_results = [] 97 | for p in as_completed(doc_jobs): 98 | result = p.result() 99 | doc_results.append(result) 100 | print(result.to_string()) 101 | 102 | results = pd.concat(doc_results).reset_index(drop=True) 103 | results.to_csv("predict_on_known.csv", index=False) 104 | -------------------------------------------------------------------------------- /deepform/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | logger = logging.getLogger(__name__) 4 | -------------------------------------------------------------------------------- /deepform/model.py: -------------------------------------------------------------------------------- 1 | import random 2 | from datetime import datetime 3 | from pathlib import Path 4 | 5 | import numpy as np 6 | import tensorflow as tf 7 | from tensorflow import keras 8 | from tensorflow.keras.layers import ( 9 | Dense, 10 | Dropout, 11 | Embedding, 12 | Flatten, 13 | Lambda, 14 | Reshape, 15 | Softmax, 16 | concatenate, 17 | ) 18 | from tensorflow.keras.models import Model 19 | 20 | from deepform.common import MODEL_DIR 21 | from deepform.data.add_features import TokenType 22 | from deepform.document import NUM_FEATURES 23 | from deepform.util import git_short_hash 24 | 25 | 26 | # control the fraction of windows that include a positive label. not efficient. 
27 | def one_window(dataset, config): 28 | require_positive = random.random() > config.positive_fraction 29 | window = dataset.random_document().random_window(require_positive) 30 | if config.permute_tokens: 31 | shuffle = np.random.permutation(config.window_len) 32 | window.features = window.features[shuffle] 33 | window.labels = window.labels[shuffle] 34 | return window 35 | 36 | 37 | def windowed_generator(dataset, config): 38 | # Create empty arrays to contain batch of features and labels# 39 | batch_features = np.zeros((config.batch_size, config.window_len, NUM_FEATURES)) 40 | batch_labels = np.zeros((config.batch_size, config.window_len)) 41 | 42 | while True: 43 | for i in range(config.batch_size): 44 | window = one_window(dataset, config) 45 | batch_features[i, :, :] = window.features 46 | batch_labels[i, :] = window.labels # tf.one_hot(window.labels, 2) 47 | yield batch_features, batch_labels 48 | 49 | 50 | # ---- Custom loss function is basically MSE but high penalty for missing a 1 label --- 51 | def missed_token_loss(one_penalty): 52 | def _missed_token_loss(y_true, y_pred): 53 | expected_zero = tf.cast(tf.math.equal(y_true, 0), tf.float32) 54 | s = y_pred * expected_zero 55 | zero_loss = tf.keras.backend.mean(tf.keras.backend.square(s)) 56 | expected_one = tf.cast(tf.math.equal(y_true, 1), tf.float32) 57 | t = one_penalty * (1 - y_pred) * expected_one 58 | one_loss = tf.keras.backend.mean(tf.keras.backend.square(t)) 59 | return zero_loss + one_loss 60 | 61 | return _missed_token_loss # closes over one_penalty 62 | 63 | 64 | # --- Specify network --- 65 | def create_model(config): 66 | indata = tf.keras.Input((config.window_len, NUM_FEATURES)) 67 | 68 | # split into the hash and the rest of the token features, embed hash as 69 | # one-hot, then merge 70 | def create_tok_hash(x): 71 | import tensorflow as tf 72 | 73 | return tf.squeeze(tf.slice(x, (0, 0, 0), (-1, -1, 1)), axis=2) 74 | 75 | def create_tok_features(x): 76 | import tensorflow as tf 77 | 78 | return tf.slice(x, (0, 0, 1), (-1, -1, -1)) 79 | 80 | tok_hash = Lambda(create_tok_hash)(indata) 81 | tok_features = Lambda(create_tok_features)(indata) 82 | embed = Embedding(config.vocab_size, config.vocab_embed_size)(tok_hash) 83 | merged = concatenate([embed, tok_features], axis=2) 84 | 85 | f = Flatten()(merged) 86 | d1 = Dense( 87 | int(config.window_len * NUM_FEATURES * config.layer_1_size_factor), 88 | activation="sigmoid", 89 | )(f) 90 | d2 = Dropout(config.dropout)(d1) 91 | d3 = Dense( 92 | int(config.window_len * NUM_FEATURES * config.layer_2_size_factor), 93 | activation="sigmoid", 94 | )(d2) 95 | d4 = Dropout(config.dropout)(d3) 96 | 97 | if config.num_layers == 3: 98 | d5 = Dense( 99 | int(config.window_len * NUM_FEATURES * config.layer_3_size_factor), 100 | activation="sigmoid", 101 | )(d4) 102 | last_layer = Dropout(config.dropout)(d5) 103 | else: 104 | last_layer = d4 105 | 106 | preout = Dense(config.window_len * len(TokenType), activation="linear")(last_layer) 107 | shaped = Reshape((config.window_len, len(TokenType)))(preout) 108 | outdata = Softmax(axis=-1)(shaped) 109 | model = Model(inputs=[indata], outputs=[outdata]) 110 | 111 | # _missed_token_loss = missed_token_loss(config.penalize_missed) 112 | 113 | model.compile( 114 | optimizer=tf.keras.optimizers.Adam(learning_rate=config.learning_rate), 115 | loss=tf.keras.losses.SparseCategoricalCrossentropy(), 116 | metrics=["acc"], 117 | ) 118 | 119 | return model 120 | 121 | 122 | def default_model_name(window_len): 123 | timestamp = 
datetime.now().strftime("%Y%m%d-%H%M%S") 124 | return MODEL_DIR / f"{timestamp}_{git_short_hash()}_{window_len}.model" 125 | 126 | 127 | def latest_model(): 128 | models = MODEL_DIR.glob("*.model") 129 | return max(models, key=lambda p: p.stat().st_ctime) 130 | 131 | 132 | def load_model(model_file=None): 133 | filepath = Path(model_file) if model_file else latest_model() 134 | window_len = int(filepath.stem.split("_")[-1]) 135 | model = keras.models.load_model( 136 | filepath, custom_objects={"_missed_token_loss": missed_token_loss(5)} 137 | ) 138 | return model, window_len 139 | 140 | 141 | def save_model(model, config): 142 | basename = ( 143 | Path(config.model_path) 144 | if config.model_path 145 | else default_model_name(config.window_len) 146 | ) 147 | basename.parent.mkdir(parents=True, exist_ok=True) 148 | model.save(basename) 149 | return basename 150 | -------------------------------------------------------------------------------- /deepform/pdfs.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import ThreadPoolExecutor 2 | from decimal import Decimal 3 | 4 | import boto3 5 | import numpy as np 6 | import pdfplumber 7 | import wandb 8 | from botocore import UNSIGNED 9 | from botocore.config import Config 10 | from botocore.exceptions import ClientError 11 | from tqdm import tqdm 12 | 13 | from deepform.common import PDF_DIR, S3_BUCKET 14 | from deepform.document import SINGLE_CLASS_PREDICTION 15 | from deepform.logger import logger 16 | from deepform.util import docrow_to_bbox, dollar_match, wandb_bbox 17 | 18 | 19 | def get_pdf_path(slug): 20 | """Return a path to the pdf with the given slug, downloading the file if necessary. 21 | 22 | If the pdf isn't in the local file system, download it from an external repository. 
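    Files are fetched anonymously from s3://project-deepform/pdfs/<slug>.pdf and cached under data/pdfs/ (PDF_DIR).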
23 | """ 24 | filename = slug + ("" if slug.endswith(".pdf") else ".pdf") 25 | location = PDF_DIR / filename 26 | if not location.is_file(): 27 | PDF_DIR.mkdir(parents=True, exist_ok=True) 28 | download_from_remote(location) 29 | return location 30 | 31 | 32 | def get_pdf_paths(slugs): 33 | with ThreadPoolExecutor() as executor: 34 | print(f"Getting {len(slugs):,} pdfs...") 35 | for path in tqdm(executor.map(get_pdf_path, slugs), total=len(slugs)): 36 | yield path 37 | 38 | 39 | def download_from_remote(local_path): 40 | """Copy a pdf from S3 into the local filesystem.""" 41 | filename = local_path.name 42 | s3_key = "pdfs/" + filename 43 | s3 = boto3.resource("s3", config=Config(signature_version=UNSIGNED)) 44 | try: 45 | s3.Bucket(S3_BUCKET).download_file(s3_key, str(local_path)) 46 | except ClientError: 47 | logger.error(f"Unable to retrieve {s3_key} from s3://{S3_BUCKET}") 48 | raise 49 | 50 | 51 | def log_wandb_pdfs(doc, doc_log, all_scores): 52 | fname = get_pdf_path(doc.slug) 53 | try: 54 | pdf = pdfplumber.open(fname) 55 | except Exception: 56 | # If the file's not there, that's fine -- we use available PDFs to 57 | # define what to see 58 | logger.warn(f"Cannot open pdf {fname}") 59 | return 60 | 61 | logger.info(f"Rendering output for {fname}") 62 | 63 | # map class labels for visualizing W&B bounding boxes 64 | # TODO: use a type and separate out ground truth 65 | class_ids_by_field = { 66 | "gross_amount": 0, 67 | "flight_to": 1, 68 | "flight_from": 2, 69 | "contract_num": 3, 70 | "advertiser": 4, 71 | "ground_truth": 5, 72 | } 73 | class_id_to_label = {int(v): k for k, v in class_ids_by_field.items()} 74 | 75 | # visualize the first page of the document for which we have ground truth labels 76 | pagenum = int(doc.tokens[doc.labels > 0].page.min()) 77 | page = pdf.pages[pagenum] 78 | im = page.to_image(resolution=300) 79 | 80 | # loop over all predictions 81 | pred_bboxes = [] 82 | for i, score in enumerate(doc_log["score"]): 83 | rel_score = all_scores[:, i] / score 84 | page_match = doc.tokens.page == pagenum 85 | curr_field = doc_log["field"][i] 86 | 87 | # we could remove this threshold and rely entirely 88 | # on the wandb bbox dynamic threshold 89 | for token in doc.tokens[page_match & (rel_score > 0.5)].itertuples(): 90 | pred_bboxes.append( 91 | wandb_bbox( 92 | token, 93 | score, 94 | class_ids_by_field[curr_field], 95 | im, 96 | ) 97 | ) 98 | # draw target tokens 99 | target_toks = doc.tokens[(doc.labels > 0) & (doc.tokens.page == 0)] 100 | true_bboxes = [wandb_bbox(t, 1, 5, im) for t in target_toks.itertuples()] 101 | 102 | boxes = { 103 | "predictions": { 104 | "box_data": pred_bboxes, 105 | "class_labels": class_id_to_label, 106 | }, 107 | "ground_truth": { 108 | "box_data": true_bboxes, 109 | "class_labels": class_id_to_label, 110 | }, 111 | } 112 | wandb.log({f"pdf/{fname.name}:{pagenum}": wandb.Image(im.annotated, boxes=boxes)}) 113 | 114 | 115 | def render_tokenized_pdf(doc): 116 | 117 | fname = get_pdf_path(doc.slug) 118 | try: 119 | pdf = pdfplumber.open(fname) 120 | except Exception: 121 | # If the file's not there, that's fine -- we use available PDFs to 122 | # define what to see 123 | print(f"Cannot open pdf {fname}") 124 | return 125 | 126 | page_images = [ 127 | {"image": page.to_image(resolution=300), "rects": [], "lines": []} 128 | for page in pdf.pages 129 | ] 130 | 131 | for token in doc.tokens.itertuples(): 132 | page_num = int(token.page) 133 | if page_num < len(page_images): 134 | page_images[page_num]["rects"].append(docrow_to_bbox(token)) 135 | 
136 | for indices in np.argwhere(doc.adjacency_matrix): 137 | first_index, second_index = indices 138 | if first_index != second_index: 139 | first_token = doc.tokens.iloc[first_index] 140 | second_token = doc.tokens.iloc[second_index] 141 | page = int(first_token.page) 142 | line = ( 143 | (Decimal(float(first_token.x0)), Decimal(float(first_token.y1))), 144 | (Decimal(float(second_token.x0)), Decimal(float(second_token.y1))), 145 | ) 146 | page_images[page_num]["lines"].append(line) 147 | 148 | for page in page_images: 149 | image, rects, lines = page["image"], page["rects"], page["lines"] 150 | image.draw_rects(rects, stroke="blue", stroke_width=3, fill=None) 151 | print(f"first lines = {lines[:5]}") 152 | image.draw_lines(lines, stroke="green", stroke_width=3) 153 | 154 | return [page["image"] for page in page_images] 155 | 156 | 157 | def render_annotated_pdf(doc, score, scores, predict_text, answer_text): 158 | 159 | fname = get_pdf_path(doc.slug) 160 | try: 161 | pdf = pdfplumber.open(fname) 162 | except Exception: 163 | # If the file's not there, that's fine -- we use available PDFs to 164 | # define what to see 165 | print(f"Cannot open pdf {fname}") 166 | return 167 | 168 | print(f"Rendering output for {fname}") 169 | 170 | # Get the correct answers: find the indices of the token(s) labelled 1 171 | target_idx = [idx for (idx, val) in enumerate(doc.labels) if val == 1] 172 | 173 | # Draw the machine output: get a score for each token 174 | page_images = [] 175 | for pagenum, page in enumerate(pdf.pages): 176 | im = page.to_image(resolution=300) 177 | 178 | # training data has 0..1 for page range (see create-training-data.py) 179 | num_pages = len(pdf.pages) 180 | if num_pages > 1: 181 | current_page = pagenum / float(num_pages - 1) 182 | else: 183 | current_page = 0.0 184 | 185 | # Draw guesses 186 | rel_score = scores / score 187 | page_match = np.isclose(doc.tokens["page"], current_page) 188 | for token in doc.tokens[page_match & (rel_score > 0.5)].itertuples(): 189 | if rel_score[token.Index] == 1: 190 | w = 5 191 | s = "magenta" 192 | elif rel_score[token.Index] >= 0.75: 193 | w = 3 194 | s = "red" 195 | else: 196 | w = 1 197 | s = "red" 198 | im.draw_rect(docrow_to_bbox(token), stroke=s, stroke_width=w, fill=None) 199 | 200 | # Draw target tokens 201 | target_toks = [ 202 | doc.tokens.iloc[i] 203 | for i in target_idx 204 | if np.isclose(doc.tokens.iloc[i]["page"], current_page) 205 | ] 206 | rects = [docrow_to_bbox(t) for t in target_toks] 207 | im.draw_rects(rects, stroke="blue", stroke_width=3, fill=None) 208 | page_images.append({"caption": f"page {pagenum}", "image": im.annotated}) 209 | 210 | # get best matching score of any token in the training data 211 | match = doc.tokens[SINGLE_CLASS_PREDICTION].max() 212 | caption = ( 213 | f"{doc.slug} guessed:{predict_text} answer:{answer_text} match:{match:.2f}" 214 | ) 215 | verdict = dollar_match(predict_text, answer_text) 216 | 217 | if dollar_match(predict_text, answer_text): 218 | caption = "CORRECT " + caption 219 | else: 220 | caption = "INCORRECT " + caption 221 | return verdict, caption, page_images 222 | 223 | 224 | def log_pdf(doc, score, scores, predict_text, answer_text): 225 | caption, page_images = render_annotated_pdf(doc, score, predict_text, answer_text) 226 | page_images = [ 227 | wandb.Image(page_image["image"], page_image["caption"]) 228 | for page_image in page_images 229 | ] 230 | wandb.log({caption: page_images}) 231 | -------------------------------------------------------------------------------- 
/deepform/train.py: -------------------------------------------------------------------------------- 1 | # Data extraction by deep learning, using a fully connected architecture over 2 | # token windows. Engineered to extract total amounts, using a few custom 3 | # features. 4 | # Achieves up to 90% accuracy. 5 | # 6 | # jstray 2019-6-12 7 | 8 | import argparse 9 | import os 10 | import re 11 | from collections import defaultdict 12 | from datetime import datetime 13 | 14 | import numpy as np 15 | import pandas as pd 16 | import tensorflow as tf 17 | import wandb 18 | from tensorflow import keras as K 19 | from wandb.keras import WandbCallback 20 | 21 | from deepform.common import LOG_DIR, TRAINING_INDEX, WANDB_PROJECT 22 | from deepform.data.add_features import LABEL_COLS 23 | from deepform.document_store import DocumentStore 24 | from deepform.logger import logger 25 | from deepform.model import create_model, save_model, windowed_generator 26 | from deepform.pdfs import log_wandb_pdfs 27 | from deepform.util import config_desc, date_match, dollar_match, loose_match 28 | 29 | 30 | # Calculate accuracy of answer extraction over num_to_test docs, print 31 | # diagnostics while we do so 32 | def compute_accuracy(model, config, dataset, num_to_test, print_results, log_path): 33 | n_print = config.render_results_size 34 | 35 | n_docs = min(num_to_test, len(dataset)) 36 | 37 | accuracies = defaultdict(int) 38 | 39 | for doc in sorted(dataset.sample(n_docs), key=lambda d: d.slug): 40 | slug = doc.slug 41 | answer_texts = doc.label_values 42 | 43 | predict_texts, predict_scores, all_scores = doc.predict_answer( 44 | model, config.predict_thresh 45 | ) 46 | answer_texts = [answer_texts[c] for c in LABEL_COLS.keys()] 47 | 48 | doc_output = doc.show_predictions(predict_texts, predict_scores, all_scores) 49 | # path = log_path / ("right" if match else "wrong") 50 | log_path.mkdir(parents=True, exist_ok=True) 51 | with open(log_path / f"{slug}.txt", "w") as predict_file: 52 | predict_file.write(doc_output) 53 | 54 | if print_results: 55 | print(f"file_id:{slug}") 56 | 57 | # track all logging information for this document 58 | doc_log = defaultdict(list) 59 | for i, (field, answer_text) in enumerate(doc.label_values.items()): 60 | predict_text = predict_texts[i] 61 | predict_score = predict_scores[i] 62 | doc_log["true_text"].append(answer_text) 63 | doc_log["pred_text"].append(predict_text) 64 | doc_log["score"].append(predict_score) 65 | doc_log["field"].append(field) 66 | 67 | match = ( 68 | (predict_score < config.predict_thresh and not answer_text) 69 | or loose_match(predict_text, answer_text) 70 | or (field == "gross_amount" and dollar_match(predict_text, answer_text)) 71 | or ( 72 | field in ("flight_from", "flight_to") 73 | and date_match(predict_text, answer_text) 74 | ) 75 | ) 76 | 77 | accuracies[field] += match 78 | 79 | prefix = "✔️" if match else "❌" 80 | guessed = f'guessed "{predict_text}" with score {predict_score:.3f}' 81 | correction = "" if match else f', was actually "{answer_text}"' 82 | doc_log["match"].append(match) 83 | if print_results: 84 | print(f"\t{prefix} {field}: {guessed}{correction}") 85 | if print_results and n_print > 0: 86 | log_wandb_pdfs( 87 | doc, doc_log, all_scores 88 | ) # TODO: get fields here more explicitly? 
89 | n_print -= 1 90 | return pd.Series(accuracies) / n_docs 91 | 92 | 93 | # ---- Custom callback to log document-level accuracy ---- 94 | class DocAccCallback(K.callbacks.Callback): 95 | def __init__(self, config, run_timestamp, dataset, logname): 96 | self.config = config 97 | self.dataset = dataset 98 | self.logname = logname 99 | self.log_path = LOG_DIR / "predictions" / run_timestamp 100 | 101 | def on_epoch_end(self, epoch, logs): 102 | if epoch >= self.config.epochs - 1: 103 | # last epoch, sample from all docs and print inference results 104 | print_results = self.logname == "doc_val_acc" 105 | test_size = len(self.dataset) 106 | else: 107 | # intermediate epoch, small sample and no logger 108 | print_results = False 109 | test_size = self.config.doc_acc_sample_size + epoch 110 | 111 | # Avoid sampling tens of thousands of documents on large training sets. 112 | test_size = min(test_size, self.config.doc_acc_max_sample_size) 113 | 114 | kind = "test" if self.logname == "doc_val_acc" else "train" 115 | 116 | acc = compute_accuracy( 117 | self.model, 118 | self.config, 119 | self.dataset, 120 | test_size, 121 | print_results, 122 | self.log_path / kind / f"{epoch:02d}", 123 | ) 124 | acc_str = re.sub(r"\s+", " ", acc.to_string()) 125 | print(f"This epoch {self.logname}: {acc_str}") 126 | 127 | # convert field names for benchmark logging 128 | wandb.log( 129 | acc.rename( 130 | {"gross_amount": "amount", "contract_num": "contractid"} 131 | ).to_dict() 132 | ) 133 | 134 | # compute average accuracy 135 | wandb.log({"avg_acc": acc.mean(), "epoch": epoch}) 136 | 137 | 138 | def main(config): 139 | config.name = config_desc(config) 140 | if config.use_wandb: 141 | run.save() 142 | 143 | # set random seed 144 | tf.random.set_seed(config.random_seed) 145 | # also set numpy seed to control train/val dataset split 146 | np.random.seed(config.random_seed) 147 | 148 | print("Configuration:") 149 | print("{\n\t" + ",\n\t".join(f"'{k}': {v}" for k, v in config.items()) + "\n}") 150 | 151 | run_ts = datetime.now().isoformat(timespec="seconds").replace(":", "") 152 | 153 | # all_data = load_training_data(config) 154 | all_documents = DocumentStore.open(index_file=TRAINING_INDEX, config=config) 155 | 156 | # split into validation and training sets 157 | validation_set, training_set = all_documents.split(val_percent=config.val_split) 158 | print(f"Training on {len(training_set)}, validating on {len(validation_set)}") 159 | 160 | model = create_model(config) 161 | print(model.summary()) 162 | 163 | callbacks = [WandbCallback()] if config.use_wandb else [] 164 | callbacks.append(K.callbacks.LambdaCallback(on_epoch_end=lambda *args: print())) 165 | callbacks.append(DocAccCallback(config, run_ts, training_set, "doc_train_acc")) 166 | callbacks.append(DocAccCallback(config, run_ts, validation_set, "doc_val_acc")) 167 | 168 | model.fit( 169 | windowed_generator(training_set, config), 170 | steps_per_epoch=config.steps_per_epoch, 171 | epochs=config.epochs, 172 | callbacks=callbacks, 173 | ) 174 | 175 | if config.save_model: 176 | model_filepath = save_model(model, config) 177 | alias = model_filepath.name 178 | artifact = wandb.Artifact( 179 | "deepform-model", type="model", metadata={"name": alias} 180 | ) 181 | artifact.add_dir( 182 | str(model_filepath) 183 | ) # TODO: check that this is necessary? What does wandb api expect here? 184 | run.log_artifact(artifact, aliases=["latest", alias]) 185 | 186 | 187 | if __name__ == "__main__": 188 | # First read in the initial configuration. 
189 | os.environ["WANDB_CONFIG_PATHS"] = "config-defaults.yaml" 190 | run = wandb.init( 191 | project=WANDB_PROJECT, 192 | job_type="train", 193 | allow_val_change=True, 194 | ) 195 | config = run.config 196 | # Then override it with any parameters passed along the command line. 197 | parser = argparse.ArgumentParser() 198 | 199 | # Anything in the config is fair game to be overridden by a command line flag. 200 | for key, value in config.items(): 201 | cli_flag = f"--{key}".replace("_", "-") 202 | parser.add_argument(cli_flag, dest=key, type=type(value), default=value) 203 | 204 | args = parser.parse_args() 205 | config.update(args, allow_val_change=True) 206 | 207 | if not config.use_wandb: 208 | os.environ["WANDB_SILENT"] = "true" 209 | os.environ["WANDB_MODE"] = "dryrun" 210 | wandb.log = lambda *args, **kwargs: None 211 | 212 | logger.setLevel(config.log_level) 213 | 214 | main(config) 215 | -------------------------------------------------------------------------------- /deepform/util.py: -------------------------------------------------------------------------------- 1 | import math 2 | import random 3 | import re 4 | import subprocess 5 | from collections import namedtuple 6 | from datetime import datetime 7 | from decimal import Decimal, InvalidOperation 8 | 9 | import scipy.sparse as sparse 10 | from fuzzywuzzy import fuzz 11 | 12 | from deepform.logger import logger 13 | 14 | BoundingBox = namedtuple("BoundingBox", ["x0", "y0", "x1", "y1"]) 15 | 16 | _whitespace = re.compile(r"\s") 17 | 18 | 19 | def simple_string(s): 20 | """Lowercase and remove whitespace from a string.""" 21 | return _whitespace.sub("", s.casefold()) if isinstance(s, str) else "" 22 | 23 | 24 | def num_digits(s): 25 | return sum(c.isdigit() for c in s) 26 | 27 | 28 | def loose_match(s1, s2): 29 | """Match two strings irrespective of case and whitespace.""" 30 | return simple_string(s1) == simple_string(s2) 31 | 32 | 33 | def default_similarity(lhs, rhs): 34 | return fuzz.ratio(simple_string(lhs), simple_string(rhs)) / 100 35 | 36 | 37 | def is_dollar_amount(s): 38 | try: 39 | return num_digits(s) > 0 and bool(re.match(r"^\$?\d*(,\d\d\d)*(\.\d\d)?$", s)) 40 | except TypeError: 41 | return False 42 | 43 | 44 | def dollar_amount(s): 45 | if is_dollar_amount(s): 46 | try: 47 | return float(s.replace("$", "").replace(",", "")) 48 | except ValueError: 49 | logger.error(f"'{s}' could not be converted to a dollar amount.") 50 | return None 51 | 52 | 53 | def dollar_similarity(lhs, rhs): 54 | lh_dollar, rh_dollar = normalize_dollars(lhs), normalize_dollars(rhs) 55 | if lh_dollar and rh_dollar: 56 | return fuzz.ratio(lh_dollar, rh_dollar) / 100 57 | return default_similarity(lhs, rhs) 58 | 59 | 60 | def log_dollar_amount(s): 61 | """Return the logarithm of 1 + a non-negative dollar amount.""" 62 | d = dollar_amount(s) 63 | return math.log(d + 1) if d and d > 0 else None 64 | 65 | 66 | def normalize_dollars(s) -> str: 67 | """Return a string of a number rounded to two digits (or None if not possible). 68 | 69 | Given a string like '$56,333.1' return the string '56333.10'. 70 | """ 71 | try: 72 | return str(round(Decimal(str(s).replace("$", "").replace(",", "")), 2)) 73 | except InvalidOperation: 74 | return None 75 | 76 | 77 | def dollar_match(predicted, actual): 78 | """Best-effort matching of dollar amounts, e.g. 
'$14,123.02' to '14123.02'.""" 79 | return ( 80 | is_dollar_amount(predicted) 81 | and is_dollar_amount(actual) 82 | and (normalize_dollars(predicted) == normalize_dollars(actual)) 83 | ) 84 | 85 | 86 | date_formats = { 87 | # If a string matches the regex key, it can be passed to strptime() 88 | # with the respective format string. Ordered from most to least common. 89 | re.compile(r"^[01]?\d/[0123]?\d/\d\d$"): "%m/%d/%y", 90 | re.compile(r"^[01]?\d/[0123]?\d/20\d\d$"): "%m/%d/%Y", 91 | re.compile(r"^[a-z]{3}\d?\d/\d\d$"): "%b%d/%y", 92 | re.compile(r"^[a-z]{3}\d?\d/20\d\d$"): "%b%d/%Y", 93 | re.compile(r"^[a-z]{4,9}\d?\d/\d\d$"): "%B%d/%y", 94 | re.compile(r"^[a-z]{4,9}\d?\d/20\d\d$"): "%B%d/%Y", 95 | } 96 | _time_punc = re.compile(r"[-,\\]") 97 | _no_year = re.compile(r"^[01]?\d/[0123]?\d$") 98 | 99 | 100 | def normalize_date(s): 101 | """Turn a string in a common date format into a date.""" 102 | try: 103 | if num_digits(s) == 0: 104 | return None 105 | # Turn dashes, commas and back slashes into forward slashes. 106 | s = _time_punc.sub("/", simple_string(s)) 107 | # Check the string against each possible date format. 108 | for date_regex, strp_format in date_formats.items(): 109 | if date_regex.match(s): 110 | return datetime.strptime(s, strp_format).date() 111 | if _no_year.match(s): 112 | # If no year is present, assume 2020. 113 | return datetime.strptime(s + "/20", "%m/%d/%y").date() 114 | except (TypeError, ValueError): 115 | return None 116 | 117 | 118 | def date_similarity(lhs, rhs): 119 | lh_date, rh_date = normalize_date(lhs), normalize_date(rhs) 120 | if lh_date and rh_date and lh_date == rh_date: 121 | return 1 122 | return default_similarity(lhs, rhs) 123 | 124 | 125 | def date_match(predicted, actual): 126 | """Best-effort matching of dates, e.g. '02-03-2020' to '2/3/20'.""" 127 | lhs, rhs = normalize_date(predicted), normalize_date(actual) 128 | return bool(lhs and rhs and lhs == rhs) 129 | 130 | 131 | def any_similarity(lhs, rhs): 132 | return max(dollar_similarity(lhs, rhs), date_similarity(lhs, rhs)) 133 | 134 | 135 | def any_match(lhs, rhs): 136 | return loose_match(lhs, rhs) or dollar_match(lhs, rhs) or date_match(lhs, rhs) 137 | 138 | 139 | def docrow_to_bbox(t, min_height=10): 140 | """Create the array pdfplumber expects for bounding boxes from an input namedtuple. 141 | 142 | If `min_height` is set, adjust the minimum size of the bounding boxes to fix the 143 | cases where pdfplumber has incorrectly underlined rather than boxed in the 144 | recognized text. 145 | """ 146 | dims = {k: Decimal(float(getattr(t, k))) for k in ["x0", "y0", "x1", "y1"]} 147 | if min_height: 148 | dims["y0"] = min(dims["y1"] - Decimal(min_height), dims["y0"]) 149 | return BoundingBox(**dims) 150 | 151 | 152 | def wandb_bbox(t, score, class_id, pdf_page, min_height=10): 153 | """Prototype logging bounding boxes to W&B. 
Currently W&B assumes a fixed 154 | single size for each image logged, so this requires resizing all logged documents 155 | to see correct bounding boxes""" 156 | dims = docrow_to_bbox(t, min_height) 157 | 158 | # reproject bounding box into pdf image 159 | x0, y0 = pdf_page._reproject((dims.x0, dims.y0)) 160 | x1, y1 = pdf_page._reproject((dims.x1, dims.y1)) 161 | 162 | box_data = { 163 | "position": { 164 | "minX": float(x0), 165 | "minY": float(y0), 166 | "maxX": float(x1), 167 | "maxY": float(y1), 168 | }, 169 | "class_id": class_id, 170 | "domain": "pixel", 171 | "scores": {"score": score}, 172 | "box_caption": "%.3f" % score, 173 | } 174 | return box_data 175 | 176 | 177 | def config_desc(config): 178 | """A one-line text string describing the configuration of a run.""" 179 | return ( 180 | "len:{len_train} " 181 | "win:{window_len} " 182 | "str:{use_string} " 183 | "page:{use_page} " 184 | "geom:{use_geom} " 185 | "amt:{use_amount} " 186 | "voc:{vocab_size} " 187 | "emb:{vocab_embed_size} " 188 | "steps:{steps_per_epoch}" 189 | ).format(**config) 190 | 191 | 192 | def sample(items, n=None, seed=None): 193 | """Get a sample of `n` items without replacement. 194 | 195 | If n is None, return the input after shuffling it. 196 | """ 197 | if seed: 198 | random.seed(seed) 199 | if n is None: 200 | n = len(items) 201 | return random.sample(items, k=n) 202 | 203 | 204 | def git_short_hash(): 205 | try: 206 | out = subprocess.check_output(["git", "rev-parse", "--short", "HEAD"]) 207 | return out.strip().decode("ascii") 208 | except (OSError, subprocess.CalledProcessError): 209 | return "UnknownGitRevision" 210 | 211 | 212 | def pad_sparse_matrix(m, pad_rows=0, pad_columns=0): 213 | (rows, _) = m.get_shape() 214 | column_padding = sparse.coo_matrix((rows, pad_columns)) 215 | padded_columns = sparse.hstack([column_padding, m, column_padding]) 216 | (_, columns) = padded_columns.get_shape() 217 | row_padding = sparse.coo_matrix((pad_rows, columns)) 218 | padded_rows = sparse.vstack([row_padding, padded_columns, row_padding]) 219 | return padded_rows.tocoo() 220 | -------------------------------------------------------------------------------- /init_sweep.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Source this script in order to invoke wandb sweep sweep.yaml and set the var WANDB_SWEEP_ID 3 | 4 | export SED_REGEX_EXTRACT='s/^.*Created sweep with ID: \([[:alnum:]]*\).*$/\1/p' 5 | init=$(wandb sweep sweep.yaml 2>&1 | sed -n "$SED_REGEX_EXTRACT") 6 | 7 | if [ -z "$init" ] 8 | then 9 | exit 1 10 | else 11 | echo $init 12 | export WANDB_SWEEP_ID="$init" 13 | wandb agent deepform/deepform/$WANDB_SWEEP_ID 14 | fi 15 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "deepform" 3 | version = "0.1.1" 4 | description = "extract information from TV station political advertising disclosure forms" 5 | authors = ["Jonathan Stray "] 6 | 7 | [tool.poetry.dependencies] 8 | boto3 = "^1.14.39" 9 | fuzzywuzzy = {extras = ["speedup"], version = "^0.18.0"} 10 | humanize = "^3.0.0" 11 | joblib = "^0.16.0" 12 | keras = "^2.4.3" 13 | numpy = "^1.18.5" 14 | pandas = "^1.1.2" 15 | pdfplumber = "^0.5.23" 16 | pyarrow = "^1.0.1" 17 | python = "^3.8.1" 18 | sqlalchemy = "^1.3.18" 19 | tensorflow = "^2.3.1" 20 | tqdm = "^4.48.2" 21 | wandb = "0.10.4" 22 | spektral = "^0.6.2" 23 | 24 | 
[tool.poetry.dev-dependencies] 25 | autoflake = "^1.3.1" 26 | babel = "^2.8.0" 27 | black = "^20.8b1" 28 | faker = "^4.1.1" 29 | flake8 = "^3.8.3" 30 | hypothesis = "^5.24.0" 31 | isort = "^5.5.4" 32 | matplotlib = "^3.3.0" 33 | pre-commit = "^2.6.0" 34 | pylint = "^2.5.3" 35 | pytest = "^6.1.0" 36 | jupyterlab = "^2.2.8" 37 | 38 | [tool.isort] 39 | profile = "black" 40 | src_paths = ["deepform", "test"] 41 | known_third_party = ["boto3", "botocore", "fuzzywuzzy", "joblib", "keras", "nltk", "numpy", "pandas", "pdfplumber", "pyarrow", "sqlalchemy", "tensorflow", "tqdm", "wandb"] 42 | 43 | [tool.black] 44 | line-length = 88 45 | target-version = ['py38'] 46 | 47 | [build-system] 48 | requires = ["poetry-core>=1.0.0"] 49 | build-backend = "poetry.core.masonry.api" 50 | -------------------------------------------------------------------------------- /source/README.md: -------------------------------------------------------------------------------- 1 | ftf-all-filings.tsv is the crowdsourced data entered by volunteers in 2012. 2 | The dc-slug can be used to get a URL for the original PDF; see download-pdfs.py 3 | 4 | Originally from https://www.propublica.org/datastore/dataset/free-the-files-filing-data 5 | -------------------------------------------------------------------------------- /sweep.yaml: -------------------------------------------------------------------------------- 1 | # sweep.yaml 2 | program: deepform/train.py 3 | project: deepform 4 | method: bayes 5 | metric: 6 | name: doc_val_acc #acc #val_acc 7 | goal: maximize 8 | parameters: 9 | steps_per_epoch: 10 | values: [10, 25, 50] 11 | #max: 50 12 | use_string: 13 | values: [0, 1] 14 | use_page: 15 | values: [0, 1] 16 | use_geom: 17 | values: [0, 1] 18 | use_amount: 19 | values: [0, 1] 20 | window_len: 21 | values: [10, 25, 50] 22 | epochs: 23 | values: [50, 100] 24 | len_train: 25 | values: [100, 200] 26 | vocab_size: 27 | values: [100, 500, 2000] 28 | vocab_embed_size: 29 | values: [16, 32, 64] 30 | early_terminate: 31 | #min_iter #specify the iteration for the first bracket 32 | type: hyperband 33 | s: 2 #specify the total number of brackets (required for max_iter) 34 | eta: 3 #specify the bracket multiplier schedule (default: 3) 35 | max_iter: 27 #specify the maximum number of iterations for the program 36 | -------------------------------------------------------------------------------- /tests/test_add_features.py: -------------------------------------------------------------------------------- 1 | from datetime import date, timedelta 2 | 3 | import pandas as pd 4 | from babel.numbers import format_currency 5 | 6 | from deepform.data.add_features import ( 7 | extend_and_write_docs, 8 | fraction_digits, 9 | pq_index_and_dir, 10 | ) 11 | from deepform.data.create_vocabulary import get_token_id 12 | from deepform.util import is_dollar_amount, log_dollar_amount 13 | 14 | COL_TYPES = { 15 | "page": "f4", # 32-bit float. 16 | "x0": "f4", 17 | "y0": "f4", 18 | "x1": "f4", 19 | "y1": "f4", 20 | "token": "string", # Pandas 1.x string type. 
21 | } 22 | 23 | 24 | def random_dollar_amount(faker): 25 | amount = round(faker.pyfloat(min_value=0, max_value=100000), 2) 26 | return format_currency(amount, "USD", locale="en_US") 27 | 28 | 29 | def random_date(faker, start_date=date(2020, 1, 1), end_date=date(2020, 12, 31)): 30 | days = (end_date - start_date).days 31 | day = faker.pyint(min_value=0, max_value=days) 32 | return start_date + timedelta(days=day) 33 | 34 | 35 | def random_training_data_row(faker): 36 | x0 = faker.pyfloat(min_value=-1, max_value=600) 37 | y0 = faker.pyfloat(min_value=-1, max_value=750) 38 | return { 39 | "page": faker.pyfloat(min_value=0, max_value=1), 40 | "x0": x0, 41 | "y0": y0, 42 | "x1": x0 + faker.pyfloat(min_value=-1, max_value=20), 43 | "y1": y0 + faker.pyfloat(min_value=-1, max_value=50), 44 | "token": faker.pystr(min_chars=1, max_chars=50), 45 | } 46 | 47 | 48 | def random_doc_data(faker): 49 | num_tokens = faker.pyint(min_value=1, max_value=500) 50 | df = pd.DataFrame([random_training_data_row(faker) for _ in range(num_tokens)]) 51 | return df.astype(COL_TYPES) 52 | 53 | 54 | def create_tokens_and_manifest(faker, src_path, num_docs=5): 55 | src_path.mkdir(parents=True, exist_ok=True) 56 | 57 | docs = {faker.slug(): random_doc_data(faker) for _ in range(num_docs)} 58 | manifest = [] 59 | 60 | for slug, doc in docs.items(): 61 | doc.to_parquet(src_path / f"{slug}.parquet", index=False) 62 | manifest.append( 63 | { 64 | "file_id": slug, 65 | "contract_num": faker.isbn10(), 66 | "advertiser": faker.company(), 67 | "flight_from": random_date(faker), 68 | "flight_to": random_date(faker), 69 | "gross_amount": random_dollar_amount(faker), 70 | } 71 | ) 72 | 73 | return pd.DataFrame(manifest) 74 | 75 | 76 | def test_add_features_to_labeled_parquet(faker, tmp_path): 77 | num_docs = 5 78 | src_path = tmp_path / "tokenized" 79 | manifest = create_tokens_and_manifest(faker, src_path, num_docs) 80 | 81 | idx_path = tmp_path / "doc_index.parquet" 82 | idx_path, pq_path = pq_index_and_dir(idx_path) 83 | 84 | # Run the conversion code. 85 | extend_and_write_docs(src_path, manifest, idx_path, pq_path, 1) 86 | 87 | # Check out the index. 88 | index = pd.read_parquet(idx_path) 89 | 90 | assert len(index) == num_docs 91 | assert set(manifest.file_id) == set(index.index) 92 | 93 | # Check out each individual document that was produced. 
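# As the assertions below imply, extend_and_write_docs is expected to drop very short tokens and to
# add the engineered per-token columns (tok_id, length, digitness, is_dollar, log_amount) along with
# a fuzzy match score per label field, while the index row keeps each document's best score per
# field, e.g. best_match_gross_amount.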
94 | for row in index.itertuples(): 95 | doc = pd.read_parquet(pq_path / f"{row.Index}.parquet") 96 | # Doc features 97 | assert doc.token.str.len().min() >= 3 98 | assert row.length == len(doc) 99 | assert row.best_match_gross_amount == doc.gross_amount.max() 100 | assert row.best_match_contract_num == doc.contract_num.max() 101 | 102 | # Row features 103 | assert (doc.tok_id == doc.token.apply(get_token_id)).all() 104 | assert (doc.length == doc.token.str.len()).all() 105 | assert (doc.digitness == doc.token.apply(fraction_digits)).all() 106 | assert (doc.is_dollar == doc.token.apply(is_dollar_amount)).all() 107 | assert (doc.log_amount == doc.token.apply(log_dollar_amount).fillna(0)).all() 108 | -------------------------------------------------------------------------------- /tests/test_graph_geometry.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | from deepform.data.graph_geometry import document_edges 5 | 6 | # ASCII Art of Test Example 7 | # 8 | # A --- B --- C 9 | # | W--|--X | 10 | # D -|- E -|- F 11 | # | Y -|- Z | 12 | # G --- H --- I 13 | 14 | A = {"token": "A", "x0": 1, "y1": 1, "page": 0} 15 | B = {"token": "B", "x0": 3, "y1": 1, "page": 0} 16 | C = {"token": "C", "x0": 5, "y1": 1, "page": 0} 17 | D = {"token": "D", "x0": 1, "y1": 3, "page": 0} 18 | E = {"token": "E", "x0": 3, "y1": 3, "page": 0} 19 | F = {"token": "F", "x0": 5, "y1": 3, "page": 0} 20 | G = {"token": "G", "x0": 1, "y1": 5, "page": 0} 21 | H = {"token": "H", "x0": 3, "y1": 5, "page": 0} 22 | I = {"token": "I", "x0": 5, "y1": 5, "page": 0} # noqa: E741 23 | W = {"token": "W", "x0": 2, "y1": 2, "page": 0} 24 | X = {"token": "X", "x0": 4, "y1": 2, "page": 0} 25 | Y = {"token": "Y", "x0": 2, "y1": 4, "page": 0} 26 | Z = {"token": "Z", "x0": 4, "y1": 4, "page": 0} 27 | 28 | tokens = pd.DataFrame.from_records([A, B, C, D, E, F, G, H, I, W, X, Y, Z]) 29 | 30 | # Manually construct the sparse matrix of edges for the above example. 
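# Row and column indices follow the record order used for `tokens` above: A..I occupy indices 0..8
# and W..Z occupy 9..12, so edges[0, 1] marks the A-B edge and edges[9, 10] marks the W-X edge.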
31 | edges = np.zeros((13, 13)) 32 | 33 | edges[0, 1] = True # A B 34 | edges[1, 2] = True # B C 35 | edges[3, 4] = True # D E 36 | edges[4, 5] = True # E F 37 | edges[6, 7] = True # G H 38 | edges[7, 8] = True # H I 39 | edges[0, 3] = True # A D 40 | edges[3, 6] = True # D G 41 | edges[1, 4] = True # B E 42 | edges[4, 7] = True # E H 43 | edges[2, 5] = True # C F 44 | edges[5, 8] = True # F I 45 | edges[9, 10] = True # W X 46 | edges[11, 12] = True # Y Z 47 | edges[9, 11] = True # W Y 48 | edges[10, 12] = True # X Z 49 | 50 | # Add in the symmetric relationships 51 | edges = edges + edges.T 52 | 53 | adjacency = document_edges(tokens).todense() 54 | expected = edges 55 | 56 | 57 | def test_9x9_adjacency(): 58 | adjacency9x9 = adjacency[0:9, 0:9] 59 | expected9x9 = expected[0:9, 0:9] 60 | assert (adjacency9x9 == expected9x9).all() 61 | 62 | 63 | def test_4x4_adjacency(): 64 | adjacency4x4 = adjacency[9:, 9:] 65 | expected4x4 = expected[9:, 9:] 66 | assert (adjacency4x4 == expected4x4).all() 67 | 68 | 69 | def test_disconnected(): 70 | disconnectedRight = adjacency[9:, 0:9] 71 | disconnectedBottom = adjacency[0:9, 9:] 72 | assert (disconnectedRight == 0).all() 73 | assert (disconnectedBottom == 0).all() 74 | 75 | 76 | def test_different_pages(): 77 | B_pg_2 = B.copy() 78 | B_pg_2["page"] = 1 79 | tokens_pages = pd.DataFrame.from_records([A, B_pg_2, C]) 80 | 81 | adjacency = document_edges(tokens_pages).todense() 82 | assert not adjacency[0, 1] 83 | assert adjacency[0, 2] 84 | -------------------------------------------------------------------------------- /tests/test_util.py: -------------------------------------------------------------------------------- 1 | from datetime import date 2 | from decimal import Decimal 3 | from math import isclose 4 | 5 | import hypothesis.strategies as st 6 | import scipy.sparse as sparse 7 | from hypothesis import example, given 8 | 9 | from deepform.util import ( 10 | BoundingBox, 11 | docrow_to_bbox, 12 | dollar_amount, 13 | is_dollar_amount, 14 | log_dollar_amount, 15 | normalize_date, 16 | normalize_dollars, 17 | pad_sparse_matrix, 18 | ) 19 | 20 | 21 | def test_is_dollar_amount(): 22 | assert is_dollar_amount("$10") 23 | assert is_dollar_amount("$15.00") 24 | assert is_dollar_amount("$2.03") 25 | assert is_dollar_amount("3") 26 | assert is_dollar_amount("04") 27 | assert is_dollar_amount("9,000") 28 | assert not is_dollar_amount("") 29 | assert not is_dollar_amount("$") 30 | assert not is_dollar_amount(",") 31 | assert not is_dollar_amount(".") 32 | assert not is_dollar_amount("$,") 33 | assert not is_dollar_amount("$.") 34 | assert not is_dollar_amount("C") 35 | assert not is_dollar_amount("$x") 36 | assert not is_dollar_amount("3 .17") 37 | 38 | 39 | def test_dollar_amount(): 40 | assert dollar_amount("$10") == 10 41 | assert dollar_amount("$15.00") == 15 42 | assert dollar_amount("$2.03") == 2.03 43 | assert dollar_amount("3") == 3 44 | assert dollar_amount("04") == 4 45 | assert dollar_amount("9,000") == 9000 46 | assert dollar_amount("") is None 47 | assert dollar_amount("C") is None 48 | assert dollar_amount("$x") is None 49 | assert dollar_amount("3 .17") is None 50 | 51 | 52 | @given(st.text()) 53 | @example("$.01") 54 | @example("$6.010.01") 55 | @example("$3,020,01") 56 | def test_dollar_amount_accepts_arbitrary_strings(s): 57 | if not is_dollar_amount(s): 58 | assert dollar_amount(s) is None 59 | else: 60 | assert normalize_dollars(s) is not None 61 | n = dollar_amount(s) 62 | assert normalize_dollars(str(n)) == normalize_dollars(s) 63 | 64 
| 65 | @given(st.text()) 66 | @example("0.02") 67 | @example("-1") 68 | @example("$-0.5") 69 | def test_log_dollar_amount_accepts_arbitrary_strings(s): 70 | if is_dollar_amount(s) and dollar_amount(s) > 0: 71 | assert log_dollar_amount(s) > 0 72 | else: 73 | assert log_dollar_amount(s) is None 74 | 75 | 76 | def test_normalize_dollars(): 77 | assert normalize_dollars("0") == "0.00" 78 | assert normalize_dollars("$10") == "10.00" 79 | assert normalize_dollars("$15.00") == "15.00" 80 | assert normalize_dollars("$2.03") == "2.03" 81 | assert normalize_dollars("3") == "3.00" 82 | assert normalize_dollars("04") == "4.00" 83 | assert normalize_dollars("9,000") == "9000.00" 84 | assert normalize_dollars("") is None 85 | assert normalize_dollars("C") is None 86 | assert normalize_dollars("$x") is None 87 | assert normalize_dollars("3 .17") is None 88 | 89 | 90 | def test_normalize_date(): 91 | assert normalize_date("03/12/20") == date(2020, 3, 12) 92 | assert normalize_date("3/4/19") == date(2019, 3, 4) 93 | assert normalize_date("6-1") == date(2020, 6, 1) 94 | assert normalize_date("4-28-21") == date(2021, 4, 28) 95 | assert normalize_date("Apr16/20") == date(2020, 4, 16) 96 | assert normalize_date("DEC30/19") == date(2019, 12, 30) 97 | assert normalize_date("February 12, 2020") == date(2020, 2, 12) 98 | assert normalize_date("11/20") == date(2020, 11, 20) 99 | assert normalize_date("22") is None 100 | assert normalize_date("") is None 101 | assert normalize_date(None) is None 102 | 103 | 104 | coord = st.floats(min_value=-10, max_value=800, allow_nan=False) 105 | height = st.floats(min_value=0, max_value=100) 106 | 107 | 108 | @given(x0=coord, y0=coord, x1=coord, y1=coord, mh=height) 109 | def test_docrow_to_bbox(x0, y0, x1, y1, mh): 110 | t = BoundingBox(x0=x0, x1=x1, y0=y0, y1=y1) 111 | bbox0 = docrow_to_bbox(t, min_height=None) 112 | bbox1 = docrow_to_bbox(t) 113 | bbox2 = docrow_to_bbox(t, min_height=mh) 114 | for box in (bbox0, bbox1, bbox2): 115 | assert box.x0 == Decimal(x0) 116 | assert box.x1 == Decimal(x1) 117 | assert box.y1 == Decimal(y1) 118 | assert bbox0.y0 == Decimal(y0) 119 | # Floating point arithmetic, yo. 120 | assert bbox1.y1 - bbox1.y0 >= 10 or isclose(bbox1.y1 - bbox1.y0, 10) 121 | assert bbox2.y1 - bbox2.y0 >= mh or isclose(bbox2.y1 - bbox2.y0, mh) 122 | 123 | 124 | def test_sparse_padding(): 125 | m = sparse.identity(3) 126 | padded = pad_sparse_matrix(m, 1, 1).todense() 127 | assert padded.shape == (5, 5) 128 | assert padded[0, 0] == 0 129 | assert padded[1, 1] == 1 130 | --------------------------------------------------------------------------------