├── .dockerignore ├── .flake8 ├── .github └── workflows │ ├── docker-image.yml │ └── python-app.yml ├── .gitignore ├── .pre-commit-config.yaml ├── Dockerfile ├── LICENSE ├── Makefile ├── README.md ├── config-defaults.yaml ├── data ├── 2012_manifest.tsv ├── 2014_manifest.tsv ├── 2020_manifest.csv ├── 3_year_manifest.csv ├── create-training-data.py ├── fcc-data-2020-labeled-manifest.csv └── token_frequency.csv ├── deepform ├── __init__.py ├── artifacts.py ├── combine_manifests.py ├── common.py ├── data │ ├── __init__.py │ ├── add_features.py │ ├── create_vocabulary.py │ ├── graph_geometry.py │ └── tokenize_pdfs.py ├── db │ ├── .env │ ├── README.md │ ├── __init__.py │ ├── conf │ │ └── config-file.cnf │ ├── scripts │ │ ├── create_schema.sql │ │ ├── load_document_data.sql │ │ └── load_token_data.sql │ └── source.py ├── document.py ├── document_store.py ├── features.py ├── infer.py ├── logger.py ├── model.py ├── pdfs.py ├── train.py └── util.py ├── init_sweep.sh ├── poetry.lock ├── pyproject.toml ├── source ├── README.md └── ftf-all-filings.tsv ├── sweep.yaml └── tests ├── test_add_features.py ├── test_graph_geometry.py └── test_util.py /.dockerignore: -------------------------------------------------------------------------------- 1 | # Ignore everything, only make exceptions for the files we know we want. 2 | * 3 | 4 | # Whitelisted exceptions. 5 | !pyproject.toml 6 | !poetry.lock 7 | !deepform/ 8 | !tests/ 9 | !*.yaml 10 | !init_sweep.sh 11 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = D203,E203,W503 3 | exclude = __pycache__,.hypothesis,.ipynb_checkpoints,wandb,old,docs/source/conf.py,old 4 | max-line-length = 88 5 | max-complexity = 10 6 | -------------------------------------------------------------------------------- /.github/workflows/docker-image.yml: -------------------------------------------------------------------------------- 1 | name: Docker Image CI 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | branches: [ master ] 8 | 9 | jobs: 10 | 11 | build: 12 | 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - uses: actions/checkout@v2 17 | - name: Build the Docker image 18 | run: docker build . --file Dockerfile --tag deepform_learner:$(date +%s) 19 | -------------------------------------------------------------------------------- /.github/workflows/python-app.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Python application 5 | 6 | on: [push] 7 | 8 | jobs: 9 | build: 10 | runs-on: ubuntu-latest 11 | 12 | steps: 13 | - uses: actions/checkout@v2 14 | - name: Set up Python 3.8 15 | uses: actions/setup-python@v2 16 | with: 17 | python-version: 3.8 18 | 19 | - name: Install lint tools 20 | run: | 21 | python -m pip install --upgrade pip 22 | pip install black flake8 23 | 24 | - name: Check formatting with black 25 | run: | 26 | black . --check 27 | 28 | - name: Lint with flake8 29 | run: | 30 | # stop the build if there are Python syntax errors or undefined names 31 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 32 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 33 | flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 34 | 35 | - name: Install project 36 | run: | 37 | pip install poetry==1.0.10 38 | poetry config virtualenvs.create false 39 | poetry install --no-interaction --no-ansi 40 | 41 | - name: Test with pytest 42 | run: | 43 | pytest 44 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Logs 2 | logs/ 3 | wandb/ 4 | 5 | # Caches 6 | cache/ 7 | __pycache__ 8 | *.egg-info 9 | source/cached_features.p 10 | .ipynb_checkpoints 11 | .dvc/ 12 | .hypothesis/ 13 | *.joblib 14 | 15 | # Local environment files (for e.g. API keys) 16 | .env 17 | *.pem 18 | 19 | # Personal (e.g. editor) configuration 20 | .vscode/ 21 | 22 | # Data files 23 | *.csv 24 | *.feather 25 | *.gz 26 | *.model 27 | *.parquet 28 | *.pdf 29 | *.png 30 | *.pq 31 | *.npz 32 | data/token_frequency.csv 33 | pdfs/ 34 | 35 | # macOS system files 36 | .DS_Store 37 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/myint/autoflake 3 | rev: v1.4 4 | hooks: 5 | - id: autoflake 6 | args: 7 | [ 8 | "--in-place", 9 | "--remove-all-unused-imports", 10 | "--remove-unused-variable", 11 | ] 12 | - repo: https://github.com/pycqa/isort 13 | rev: 5.5.4 14 | hooks: 15 | - id: isort 16 | additional_dependencies: ["toml"] 17 | - repo: https://github.com/ambv/black 18 | rev: 20.8b1 19 | hooks: 20 | - id: black 21 | language_version: python3.8 22 | - repo: https://github.com/PyCQA/flake8 23 | rev: 3.8.3 24 | hooks: 25 | - id: flake8 26 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.8.6 2 | 3 | ENV PYTHONFAULTHANDLER=1 \ 4 | PYTHONHASHSEED=random \ 5 | PYTHONUNBUFFERED=1 \ 6 | PIP_DEFAULT_TIMEOUT=100 \ 7 | PIP_DISABLE_PIP_VERSION_CHECK=1 \ 8 | PIP_NO_CACHE_DIR=1 9 | 10 | # Install dependencies for pdfplumber. 11 | RUN apt-get update && apt-get install -y \ 12 | libmagickwand-dev ghostscript \ 13 | --no-install-recommends 14 | 15 | # Allow imagemagick to read and write PDFs by relaxing the PDF entry in its security policy. 16 | RUN sed -i 's/rights="none" pattern="PDF"/rights="read|write" pattern="PDF"/' \ 17 | /etc/ImageMagick-6/policy.xml 18 | 19 | # Get this out of the way early, because it takes so damn long -- we really want to cache it. 20 | RUN pip install "tensorflow==2.3.1" 21 | 22 | # Install Poetry and project dependencies. 23 | RUN pip install "poetry==1.1.0" 24 | RUN poetry config virtualenvs.create false 25 | COPY pyproject.toml poetry.lock ./ 26 | RUN poetry install --no-root 27 | 28 | # Install an editable copy of the project. 29 | COPY . .
30 | RUN poetry install --no-interaction --no-ansi 31 | 32 | CMD ["/bin/bash"] 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 project-deepform 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | TEST_PATH=$(CURDIR)/tests 2 | CONTAINER=deepform/deepform_learner:latest 3 | 4 | .DEFAULT_GOAL := help 5 | 6 | .PHONY: help 7 | help: ## Show this help dialog 8 | @grep -E '^[a-zA-Z/\._-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' 9 | 10 | .PHONY: test 11 | test: docker-build ## Run all the unit tests for the project 12 | docker run --rm --mount type=bind,source=$(CURDIR)/data,target=/data $(CONTAINER) \ 13 | pytest --verbose --color=yes tests 14 | 15 | .PHONY: clean-all 16 | clean-all: 17 | -rm -r data/cache data/labeled data/tokenized data/training 18 | -rm data/training.parquet data/doc_index.parquet 19 | 20 | .PHONY: docker-build 21 | docker-build: ## Build the docker container 22 | docker build -t $(CONTAINER) . 23 | 24 | .PHONY: docker-stop 25 | docker-stop: ## Stop any running instances of the deepform docker container on this system 26 | -docker ps | grep $(CONTAINER) | cut -d' ' -f1 | xargs docker stop 27 | 28 | .PHONY: docker-shell 29 | docker-shell: docker-stop docker-build ## Launch a shell into a docker container containing the required dependencies and data 30 | docker run -ti --rm --env-file=.env \ 31 | --mount type=bind,source=$(CURDIR)/deepform,target=/deepform \ 32 | --mount type=bind,source=$(CURDIR)/data,target=/data \ 33 | $(CONTAINER) 34 | 35 | .PHONY: docker-background 36 | docker-background: docker-stop docker-build ## Launch a docker container as a background process. 37 | docker run -td --rm --env-file=.env \ 38 | --mount type=bind,source=$(CURDIR)/deepform,target=/deepform \ 39 | --mount type=bind,source=$(CURDIR)/data,target=/data \ 40 | $(CONTAINER) 41 | 42 | # This was used by a previous version of our codebase. 
43 | # data/training.parquet: 44 | # curl https://project-deepform.s3-us-west-1.amazonaws.com/training_data/training.parquet -o data/training.parquet 45 | 46 | data/pdfs: data/2020_manifest.csv ## Downloads all PDFs to local storage. Not usually necessary. 47 | docker build -t $(CONTAINER) . 48 | docker run --rm --mount type=bind,source=$(CURDIR)/data,target=/data $(CONTAINER) python -c "import pandas as pd; print('\n'.join(pd.read_csv('data/fcc-data-2020-labeled-manifest.csv').URL))" | xargs wget -P data/pdfs 49 | 50 | # This is the command we used to produce the tokenized data, but it is cached in S3 51 | # data/tokenized: data/pdfs 52 | # docker build -t $(CONTAINER) . 53 | # docker run --rm --mount type=bind,source=$(CURDIR)/data,target=/data $(CONTAINER) python -m deepform.data.tokenize_pdfs 54 | 55 | data/tokenized: ## Get document tokens from S3 56 | curl https://project-deepform.s3-us-west-1.amazonaws.com/training_data/token_data.tar.gz -o data/tokenized.tar.gz 57 | mkdir -p data/tokenized 58 | tar xf data/tokenized.tar.gz -C data/tokenized 59 | 60 | data/token_frequency.csv: data/tokenized ## Produce token frequency csv file 61 | docker build -t $(CONTAINER) . 62 | docker run --rm --mount type=bind,source=$(CURDIR)/data,target=/data $(CONTAINER) \ 63 | python -m deepform.data.create_vocabulary 64 | 65 | data/3_year_manifest.csv: data/2012_manifest.tsv data/2014_manifest.tsv data/2020_manifest.csv ## Combine manifests from three years into one manifest with all three years' data 66 | docker build -t $(CONTAINER) . 67 | docker run --rm --mount type=bind,source=$(CURDIR)/data,target=/data $(CONTAINER) \ 68 | python -m deepform.data.combine_manifests 69 | 70 | data/doc_index.parquet: data/tokenized data/token_frequency.csv data/3_year_manifest.csv ## Create the training data from the token files and label manifest 71 | docker build -t $(CONTAINER) . 72 | docker run --rm --mount type=bind,source=$(CURDIR)/data,target=/data $(CONTAINER) \ 73 | python -m deepform.data.add_features data/3_year_manifest.csv 74 | 75 | .PHONY: train 76 | train: data/doc_index.parquet data/token_frequency.csv .env docker-build ## Run full model training 77 | docker run --rm --env-file=.env \ 78 | --mount type=bind,source=$(CURDIR)/data,target=/data $(CONTAINER) \ 79 | python -um deepform.train 80 | 81 | .PHONY: test-train 82 | test-train: data/doc_index.parquet data/token_frequency.csv .env docker-build ## Run training on a small sample to test and validate code 83 | docker run --rm --env-file=.env \ 84 | --mount type=bind,source=$(CURDIR)/data,target=/data $(CONTAINER) \ 85 | python -um deepform.train --len-train=100 --steps-per-epoch=3 --epochs=2 --log-level=DEBUG --use-wandb=0 --use-data-cache=0 --save-model=0 --doc-acc-max-sample-size=20 --render-results-size=3 86 | 87 | .PHONY: sweep 88 | sweep: data/doc_index.parquet data/token_frequency.csv .env docker-build ## Run a Weights & Biases training sweep.
89 | docker run --rm --env-file=.env \ 90 | --mount type=bind,source=$(CURDIR)/data,target=/data $(CONTAINER) \ 91 | ./init_sweep.sh 92 | 93 | VERSION='stable' 94 | download-model: ## Download a model for use with the inference script 95 | docker run --rm --env-file=.env \ 96 | --mount type=bind,source=$(CURDIR)/data,target=/data $(CONTAINER) \ 97 | python -m deepform.artifacts --version $(VERSION) 98 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Deepform 2 | 3 | ![Python build](https://github.com/project-deepform/deepform/workflows/Python%20application/badge.svg) 4 | ![Docker image](https://github.com/project-deepform/deepform/workflows/Docker%20Image%20CI/badge.svg) 5 | 6 | Deepform is a project to extract information from TV and cable political advertising disclosure forms using deep learning. [This public data, maintained by the FCC](https://publicfiles.fcc.gov/), is valuable to journalists but locked in PDFs. Our goal is to provide the 2020 dataset for NLP/AI researchers and to make our method available to future data scientists working in this field. Past projects have managed to produce similar datasets only with great manual effort or by addressing only the most common form types, ignoring the tail of hundreds of rare form types. This work uses deep learning models that are able to generalize over form types and "learn" how to find five fields: 7 | 8 | - Contract number (multiple documents can have the same number, as a contract for future air dates is revised) 9 | - Advertiser name (often the name of a political [committee](https://www.fec.gov/data/browse-data/?tab=committees), but not always) 10 | - Start and end air dates (often known as "flight dates") 11 | - Total amount paid for the ads 12 | 13 | The [initial attempt to use deep learning for this work](https://github.com/jstray/deepform), by Jonathan Stray in summer 2019, achieved 90% accuracy extracting total spending from the PDFs in the (held-out) test set, which shows that deep learning can generalize surprisingly well to previously unseen form types. For a discussion of how the 2019 prototype works, see [this post](http://jonathanstray.com/extracting-campaign-finance-data-from-gnarly-pdfs-using-deep-learning). 14 | 15 | ## Why? 16 | 17 | This project is timely and relevant for a variety of reasons, some of them pertaining to this particular dataset and others to the method we are following. 18 | 19 | Election transparency is an increasingly important component of the US electoral process, and making this data available to journalists at low or no cost is key to that transparency. As the data is archived in tens of thousands of non-machine-readable PDF files in hundreds of different formats, it is beyond the capacity of journalistic entities to extract it by hand in a useful way. The data is available for purchase from private entities, but we interviewed journalists who mentioned that it comes with a price tag of $100K or more _per newspaper_ that wishes to use it. 20 | 21 | Past projects have used [volunteer labor](https://www.niemanlab.org/2012/12/crowdsourcing-campaign-spending-what-propublica-learned-from-free-the-files/) or [hand-coded form layouts](https://github.com/alexbyrnes/FCC-Political-Ads) to produce usable datasets. Project Deepform replicates this data extraction using modern deep learning techniques.
This is desirable because we are not only positioned to produce a usable dataset in the context of the 2020 election, but the method will also be available to our team and other data science teams to produce similar datasets in the run-up to future US elections. 22 | 23 | For our own purposes as members of the investigative data science community, Project Deepform functions as an open source springboard for future form extraction projects. Projects of this kind are becoming widely popular as the tools have improved within the past half decade to make this work possible. The general problem is known as "knowledge base construction" in the research community, and the current state of the art is achieved by multimodal systems such as [Fonduer](https://fonduer.readthedocs.io/en/latest/). A group at Google released [a paper](https://research.google/pubs/pub49122/) earlier in 2020 which describes a related process; Google also supports [Cloud Document AI](https://levelup.gitconnected.com/how-to-parse-forms-using-google-cloud-document-ai-68ad47e1c0ed), and others have made progress using [graph convolutional networks](https://link.springer.com/chapter/10.1007/978-3-030-21074-8_12). 24 | 25 | Finally, we have prepared this project dataset and its goals as a [benchmark project on Weights and Biases](https://wandb.ai/deepform/political-ad-extraction/benchmark). Here, other data scientists are encouraged to improve on the baseline success rates we have attained. 26 | 27 | 28 | ## Setting up the Environment 29 | 30 | The project is primarily intended to be run with [Docker](https://www.docker.com/products/docker-desktop), which eases issues with Python virtual environments, but it can also be run locally -- this is easiest to do with [Poetry](https://python-poetry.org/). 31 | 32 | ### Docker 33 | 34 | To use Docker, you'll have to be running the daemon, which you can find and install from https://www.docker.com/products/docker-desktop. Fortunately, that's _all_ you need. 35 | 36 | The project has a `Makefile` that covers most of the things you might want to do with the project. To get started, simply run 37 | 38 | `make train` 39 | 40 | or see below for other commands. 41 | 42 | 43 | ### Poetry - dependency management and running locally 44 | 45 | Deepform manages its dependencies with `Poetry`, which you only need if you want to run it locally or alter the project dependencies. You can install Poetry using any of the methods listed in their [documentation](https://python-poetry.org/docs/#installation). 46 | 47 | If you want to run Deepform locally: 48 | 49 | - run `poetry install` to install the deepform package and all of its dependencies into a fresh virtual environment 50 | - enter this environment with `poetry shell` 51 | - or run a one-off command with `poetry run <command>` 52 | 53 | Since deepform is an installed package inside the virtual environment Poetry creates, run the code as modules, e.g. `python -m deepform.train` instead of `python deepform/train.py` -- this ensures that imports and relative paths work the way they should. 54 | 55 | To update project dependencies: 56 | 57 | - `poetry add <package>` adds a new Python package as a requirement 58 | - `poetry remove <package>` removes a package that's no longer needed 59 | - `poetry update` updates all the dependencies to their latest non-conflicting versions 60 | 61 | These three commands alter `pyproject.toml` and `poetry.lock`, which should be committed to git. Using them ensures that our project has reproducible builds. A sketch of a full local session is shown below.
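Putting the pieces together, a typical local (non-Docker) session looks roughly like the sketch below. This is a minimal illustration that assumes the training data described in the next section is already in place under `data/`; the module invocations are the same ones the Makefile targets run inside Docker.

```bash
poetry install    # create the virtualenv and install deepform plus its dependencies
poetry shell      # enter the virtualenv

# Label and featurize the tokenized documents using the combined manifest
# (the command behind the data/doc_index.parquet Makefile target).
python -m deepform.data.add_features data/3_year_manifest.csv

# Train; --use-wandb=0 disables Weights & Biases logging.
python -um deepform.train --use-wandb=0
```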
62 | 63 | 64 | ## Training Data 65 | ### Getting the Training Data 66 | 67 | Running `make train` will acquire all the data you need _and_ will train the model. The total training data for this project consists of three label manifests (discussed below in detail) and 20,000 .parquet files containing the tokens and geometry from the PDFs used in training. Running `make train` will automatically run, in sequence, a series of commands which acquire, restructure and label the training data. These commands can alternatively be run manually, in sequence. 68 | 69 | 1. `make data/tokenized` downloads _all_ the unlabeled .parquet files (training and test) from an S3 bucket to the folder data/tokenized. 70 | 71 | 2. `make data/token_frequency.csv` constructs a vocabulary of tokens from all these .parquet files. 72 | 73 | 3. `make data/3_year_manifest.csv` combines three label manifests from three different election years (2012, 2014 and 2020) into a single manifest (`data/3_year_manifest.csv`) and includes a column 'year' to differentiate between the three years' data. 74 | 75 | 4. `make data/doc_index.parquet` will utilize the unlabeled .parquet files in the folder data/tokenized along with 3_year_manifest.csv (already in the repo) to generate a new set of _labeled_ .parquet files in the folder data/training containing the tokens and geometry along with a new column for each of the five target types. Each of these columns stores the match percentage (for each token) between that token and the target in question. This script also computes other relevant features, such as whether the token is a date or a dollar amount, which are fed into the model as additional features. Some targets are more than one token in length, so in these cases the column contains the likelihood that each token is a member of the target token string. 76 | 77 | This multi-token matching process receives a value for the maximum number of tokens (n) which might match the target ("Obama For America" is three tokens long, while "1/12/2020" is one token long). Due to OCR errors, some dates and dollar amounts are more than one token in length. We then calculate a match percentage for all strings of tokens of lengths (n, n-1, ... , 1). The highest match is achieved when the number of tokens is correct and the tokens match the target from the label manifest. Finally, since each token will participate in many match attempts, each token is assigned a match percentage which corresponds to the highest match it participated in. This table shows how "Obama for America" might be found. 78 | 79 | ``` 80 | ... 81 | token, n=1, n=2, n=3, n=4, n=5, ... 82 | contract,.1,.2,.2,.2,.1,... 83 | obama,.7,.6,.5,.4,.3,... 84 | $45,000,.03,.6,.5,.3,.65,... 85 | committee,.1,.6,.4,.75,.65,... 86 | obama,.7,.8,1.0,.75,.65,... 87 | for,.5,.8,1.0,.75,.65,... 88 | america,.67,.81,1.0,.75,.65,... 89 | 11/23/12,.03,.4,.4,.5,.6,... 90 | 11/29/12,.03,.03,.2,.3,.2,... 91 | ... 92 | ``` 93 | 94 | ### Form of the training data 95 | All the data (training and test) for this project was originally raw PDFs, downloadable from the [FCC website](https://publicfiles.fcc.gov/) with up to 100,000 PDFs per election year. The _training_ data consists of some 20,000 of these PDFs, drawn from three different election years (2012, 2014 and 2020) according to available labels (see below), and three label manifests. 96 | 97 | The original PDFs were OCR'd, tokenized, and turned into .parquet files, one for each PDF.
The .parquet files are each named with the document slug and contain all of that document's tokens and their geometry on the page. Geometry is given in 1/100ths of an inch. 98 | 99 | The .parquet files are formatted as "tokens plus geometry" like this: 100 | 101 | `473630-116252-0-13442821773323-_-pdf.parquet` contains 102 | 103 | ``` 104 | page,x0,y0,x1,y1,token 105 | 0,272.613,438.395,301.525,438.439,$275.00 106 | 0,410.146,455.811,437.376,455.865,Totals 107 | 0,525.84,454.145,530.288,454.189,6 108 | 0,556.892,454.145,592.476,454.189,"$1,170.00" 109 | 0,18.0,480.478,37.998,480.527,Time 110 | 0,40.5,480.478,66.51,480.527,Period 111 | ... 112 | ``` 113 | 114 | The document name (the `slug`) is a unique document identifier, ultimately from the source TSV. The page number runs from 0 to 1, and the bounding box is in the original PDF coordinate system. The actual token text is reproduced as `token`. 115 | 116 | These .parquet files still lack labels, however. Labels are provided in three "label manifests" for these three election years (2012, 2014 and 2020), each of which is a .csv or .tsv containing a column of file IDs (called slugs) and columns containing labels for each of the fields of interest for each document. Each year has a slightly different set of extracted fields, sometimes including additional extracted fields not used by the model in this repo. All three manifests are combined in data/3_year_manifest.csv. All three label manifests and the combined manifest are available in the `data` folder. If they are not present, they can be recovered from various sources as detailed below. 117 | 118 | Using the labels in 3_year_manifest.csv and the 20,000 unlabeled token files, labeled token files are produced in the folder `data/training`, which have the following form. These are the training data as provided to the model. 119 | 120 | ``` 121 | page x0 y0 x1 y1 token contract_num advertiser flight_from flight_to gross_amount tok_id length digitness is_dollar log_amount label 122 | 0 18 17.963 48.232 26.899 Contract 0 0.27 0 0 0 53 8 0 0 0 0 123 | 0 50.456 17.963 89.584 26.899 Agreement 0 0.33 0 0 0 115 9 0 0 0 0 124 | 0 474.001 17.963 505.137 26.899 1/15/20 0.4 0.26 0.38 0.88 0.22 0 8 0.75 0 0 0 125 | 0 414.781 65.213 445.917 74.149 1475302 1 0.26 0.4 0.27 0.67 0 7 1 1 14.204374 1 126 | 0 495.842 65.213 550.978 74.149 WOC12348242 0.33 0.26 0.32 0.32 0.19 663 11 0.72727275 0 0 0 127 | 0 183.909 90.193 298.949 101.363 www.gray.tv/advertising 0 0.58 0.06 0.06 0.06 1796 23 0 0 0 0 128 | 0 309.002 90.923 326.786 99.859 Mike 0 1 0 0 0 664 4 0 0 0 2 129 | 0 329.01 90.923 371.234 99.859 Bloomberg 0 1 0 0 0 821 9 0 0 0 2 130 | 0 373.458 90.923 393.474 99.859 2020, 0.33 1 0.31 0.46 0.67 0 5 0.8 0 0 2 131 | 0 395.698 90.923 407.258 99.859 Inc 0 1 0 0 0 166 3 0 0 0 2 132 | 0 491.041 90.683 522.177 99.619 12/31/19 0.27 0.74 0.88 0.5 0.22 0 8 0.75 0 0 0 133 | 0 308.251 103.463 338.483 112.399 Contract 0 0.24 0 0 0 53 8 0 0 0 0 134 | 0 340.707 103.463 361.603 112.399 Dates 0 0.23 0 0 0 18 5 0 0 0 0 135 | 0 407.251 103.463 438.371 112.399 Estimate 0 0.26 0 0 0 23 8 0 0 0 0 136 | 0 308.251 115.703 339.387 124.639 12/30/19 0.4 0.26 1 0.5 0.33 0 8 0.75 0 0 3 137 | 0 346.499 115.703 377.635 124.639 1/12/20 0.27 0.21 0.5 1 0.22 0 8 0.75 0 0 4 138 | ... 139 | ``` 140 | 141 | N.B. As it is written currently, the model only trains on the one thousand documents of 2020 data.
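For a quick sanity check of what the model actually consumes, the labeled files can be inspected directly with pandas. The snippet below is a small sketch: the path is a placeholder for whichever labeled document you pick from `data/training`, and the integer `label` column corresponds to the `TokenType` enum defined in `deepform/data/add_features.py` (0 means the token is not part of any target field; 1-5 mean contract number, advertiser, flight start/end dates, and gross amount).

```python
from pathlib import Path

import pandas as pd

# Mapping used by deepform.data.add_features.TokenType (0 = not part of any target field).
LABEL_NAMES = {
    0: "none",
    1: "contract_num",
    2: "advertiser",
    3: "flight_from",
    4: "flight_to",
    5: "gross_amount",
}

# Placeholder: grab any labeled document produced by `make data/doc_index.parquet`.
doc_path = next(Path("data/training").glob("*.parquet"))
doc = pd.read_parquet(doc_path)

# Show the tokens that were labeled as part of some target field,
# along with a few of their fuzzy-match scores against the manifest answers.
labeled = doc[doc.label > 0]
print(
    labeled[["token", "label", "contract_num", "advertiser", "gross_amount"]]
    .assign(field=labeled.label.map(LABEL_NAMES))
)
```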
142 | 143 | ### Where the labels come from 144 | #### 2012 Label Manifest 145 | In 2012, ProPublica ran the Free The Files project (you can [read how it worked](https://www.niemanlab.org/2012/12/crowdsourcing-campaign-spending-what-propublica-learned-from-free-the-files/)) and hundreds of volunteers hand-entered information for over 17,000 of these forms. That data drove a bunch of campaign finance [coverage](https://www.propublica.org/series/free-the-files) and is now [available](https://www.propublica.org/datastore/dataset/free-the-files-filing-data) from their data store. 146 | 147 | The label manifest for 2012 data was downloaded from ProPublica and is located at `data/2012_manifest.tsv` (renamed from ftf-all-filings.tsv, the filename it downloads as). If the manifest is not present, it can be recovered from [their website](https://www.propublica.org/datastore/dataset/free-the-files-filing-data). This file contains the crowdsourced answers for some of our targets (omitting flight dates) and the PDF URL. 148 | 149 | #### 2014 Label Manifest 150 | In 2014 Alex Byrnes [automated](https://github.com/alexbyrnes/FCC-Political-Ads) a similar extraction by hand-coding form layouts. This works for the dozen or so most common form types but ignores the hundreds of different PDF layouts in the long tail. 151 | 152 | The label manifest for 2014 data, acquired from Alex's GitHub, is `data/2014_manifest.tsv`. If the manifest is not present, it can be recovered from [his GitHub](https://github.com/alexbyrnes/FCC-Political-Ads) (renamed from 2014-orders.tsv, the filename it downloads as). This file contains the extracted answers for some of our targets (omitting 'gross amount'). 153 | 154 | 155 | #### 2020 Label Manifest 156 | 157 | ##### All 2020 PDFs 158 | PDFs for the 2020 political ads and associated metadata were uploaded to [Overview Docs](https://www.overviewdocs.com/documentsets/22569). To collect the PDFs, the file names were pulled from the [FCC API OPIF file search](https://publicfiles.fcc.gov/developer/) using the search terms: order, contract, invoice, and receipt. The search was run with filters for campaign year set to 2020 and source service code set to TV. 159 | 160 | The FCC API search also returns the source service code (entity type, i.e. TV, cable), entity ID, callsign, political file type (political ad or non-candidate issue ad), office type (presidential, senate, etc.), Nielsen DMA rank (TV market area), network affiliation, and the time stamps for when the ad was created and last modified. These were added to the Overview document set along with the search term, the URL for the FCC download, and the date of the search. 161 | 162 | For these PDFs, the following steps were followed to produce training data: 163 | 164 | - Convert each PDF to a series of images 165 | - Determine the angle of each page and rotate if needed 166 | - Use Tesseract to OCR each image 167 | - Upload the processed PDF to an S3 bucket and add its URL to Overview 168 | - Upload additional metadata on whether OCR was needed, the original angle of each page, and any errors that occurred during the OCR process. 169 | 170 | ##### A Subset for Training 171 | [A sample of 1000 documents](https://www.overviewdocs.com/documentsets/22186) was randomly chosen for hand labeling as 2020 training data. 172 | 173 | The label manifest for 2020 data is `data/2020_manifest.csv` (renamed from fcc-data-2020-sample-updated.csv, the filename it downloads as).
If the manifest is not present, it can be recovered from [this overview document set](https://www.overviewdocs.com/documentsets/22186). This file contains our manually entered answers for all five of our targets for the 1000 randomly chosen documents. 174 | 175 | 176 | ### Where the PDFs and token files come from 177 | #### Acquiring .parquet files directly 178 | 179 | The best way to run this project is to acquire the 20,000 .parquet files containing the tokens and geometry for each PDF in the training set. The token files are downloaded from our S3 bucket by running `make data/tokenized`. If you run `make train`, the program will automatically run `make data/tokenized`, as this is a dependency for `make train`. These .parquet files are then located in the folder data/tokenized. This is the easiest way to get this data. 180 | 181 | #### Acquiring Raw PDFs 182 | 183 | To find the original PDFs, it is always possible to return to the [FCC website](https://publicfiles.fcc.gov/) and download them directly using the proper filters (search terms: order, contract, invoice, and receipt; filters: campaign year = 2020, source service code = TV). The 2012, 2014 and 2020 data used by ProPublica, by Alex Byrnes, or by ourselves to create the three label manifests can also be found in different locations, as follows: 184 | 185 | ##### 2012 Training PDFs 186 | 187 | 90% of the original PDFs from the Free the Files Project are available on DocumentCloud and can be recovered by running 'curl' on url = 'https://documentcloud.org/documents/' + slug + '.pdf'. These PDFs can also be found in [this folder](https://drive.google.com/drive/folders/1bsV4A-8A9B7KZkzdbsBnCGKLMZftV2fQ?usp=sharing). If you download PDFs from one of these sources, place them in the folder `data/pdfs`. 188 | 189 | ##### 2014 Training PDFs 190 | 191 | [Alex Byrnes' GitHub](https://github.com/alexbyrnes/FCC-Political-Ads) directs users back to the [FCC website](https://publicfiles.fcc.gov/) to get his data. He does not host it separately. The PDFs are also available in [this Google Drive folder](https://drive.google.com/drive/folders/1aTuir0Y6WdD0P3SRUazo_82u7o8SnVf2). If you download PDFs from one of these sources, place them in the folder `data/pdfs`. 192 | 193 | ##### 2020 Training PDFs 194 | 195 | The one thousand 2020 PDFs we hand labeled are available on Overview Docs as [this dataset](https://www.overviewdocs.com/documentsets/22186). 196 | 197 | These PDFs can also be acquired from the FCC database by running `make data/pdfs`. This command will place all the PDFs associated with 2020 training data in the folder `data/pdfs`. 198 | 199 | #### Converting Raw PDFs to .parquet files 200 | 201 | If you have a set of PDF files located in `data/pdfs` and would like to tokenize them, you can use a rule in the Makefile which is typically commented out. Uncomment `data/tokenized: data/pdfs` and the associated lines below it, and comment out the other `data/tokenized` rule. This command will create the folder data/tokenized containing the .parquet files of tokens and geometry corresponding to each of the PDFs in `data/pdfs`. 202 | 203 | ## Training 204 | ### How the model works 205 | 206 | The easiest fields are contract number and total. The model is a fully connected three-layer network trained on a window of tokens from the data, typically 20-30 tokens. Each token is hashed to an integer mod 1000, then converted to 1-hot representation and embedded into 64 dimensions.
This embedding is combined with geometry information (bounding box and page number) and also some hand-crafted "hint" features, such as whether the token matches a regular expression for dollar amounts. For details, see [the talk](https://www.youtube.com/watch?v=uNN59kJQ7CA). 207 | 208 | We also incorporate custom "hint" features. For example, the total extractor uses an "amount" feature that is the log of the token value, if the token string is a number. 209 | 210 | 211 | ### Running in Docker 212 | 213 | - `make test` runs all the unit tests for the project 214 | - `make docker-shell` will spin up a container and drop you into a bash shell after mounting the `deepform` folder of code so that commands that you run there reflect the code as you are editing it. 215 | - `make train` runs `deepform/train.py` with the default configuration. **If it needs to, it will download and preprocess the data it needs to train on.** 216 | - `make test-train` runs the same training loop on the same data, but with very strongly reduced settings (just a few documents for a few steps) so that it can be used to check that it actually works. 217 | - `make sweep` runs a hyperparameter sweep with Weights & Biases, using the configuration in `sweep.yaml` 218 | 219 | Some of these commands require an `.env` file located at the root of the project directory. 220 | 221 | If you don't want to use Weights & Biases, you can turn it off by setting `use_wandb=0`. You'll still need an `.env` file, but it can be empty. 222 | 223 | ### Running Locally using Poetry 224 | 225 | For each of the above commands, rather than running the make command (which automatically runs in Docker), run the Python command embedded in that make target. E.g. rather than running `make test-train`, run `python -um deepform.train --len-train=100 --steps-per-epoch=3 --epochs=2 --log-level=DEBUG --use-wandb=0 --use-data-cache=0 --save-model=0 --doc-acc-max-sample-size=20 --render-results-size=3` 226 | 227 | ## Code quality and pre-commit hooks 228 | 229 | The code is currently automatically formatted with [black](https://black.readthedocs.io/en/stable/), uses [autoflake](https://pypi.org/project/autoflake/) to remove unused imports, [isort](https://timothycrosley.github.io/isort/) to sort them, and [flake8](https://flake8.pycqa.org/en/latest/) to check for PEP8 violations. These tools are configured in `pyproject.toml` and should Just Work™ -- you shouldn't have to worry about them at all once you install them. 230 | 231 | To make this as painless as possible, `.pre-commit-config.yaml` contains rules for automatically running these tools as part of `git commit`. To turn these git pre-commit hooks on, you need to run `pre-commit install` (after installing it and the above libraries with Poetry or pip). After that, whenever you run `git commit`, these tools will run and clean up your code so that "dirty" code never gets committed in the first place. 232 | 233 | GitHub runs a "python build" Action whenever you push new code to a branch (configured in [python-app.yml](https://github.com/project-deepform/deepform/blob/master/.github/workflows/python-app.yml)). This also runs `black`, `flake8`, and `pytest`, so it's best to just make sure things pass locally before pushing to GitHub. 234 | 235 | ## Looking Forward 236 | 237 | This is a difficult data set that is very relevant to journalism, and improvements in technique will be immediately useful to campaign finance reporting.
238 | 239 | Our next steps include additional pre-processing steps to rotate improperly scanned documents and to identify and separate concatenated documents. The default parameter settings we are using are fairly good but can likely be improved further. We have leads on additional training data, produced via hand-labeling in a couple of related projects, which we are hoping to incorporate. We believe there is potential here for some automated training data creation. Finally, we are not at present making use of the available 2012 and 2014 training data, and these data may be able to dramatically improve model accuracy. 240 | 241 | We would love to hear from you! Contact jstray on [Twitter](https://twitter.com/jonathanstray) or through his [blog](http://jonathanstray.com). 242 | -------------------------------------------------------------------------------- /config-defaults.yaml: -------------------------------------------------------------------------------- 1 | wandb_version: 1 2 | 3 | len_train: 4 | desc: number of documents to use (training + validation) 5 | value: 15000 6 | 7 | # training dataset settings required for benchmark submissions 8 | # do not change these if you'd like a pure comparison to the 9 | # other benchmark submissions 10 | val_split: 11 | value: 0.2 12 | random_seed: 13 | value: 23 14 | 15 | # sweeps suggest these are reasonable hyperparameter defaults 16 | window_len: 17 | desc: size of token sequences to train on (and network size!) 18 | value: 25 19 | 20 | # feature generation 21 | pad_windows: 22 | desc: zero pad beginning and end of doc token stream 23 | value: 1 24 | use_amount: 25 | desc: use token dollar value directly as feature 26 | value: 1 27 | use_page: 28 | desc: use token page number as feature 29 | value: 1 30 | use_geom: 31 | desc: use token geometry (bbox corner) as feature 32 | value: 1 33 | use_string: 34 | desc: use token string as feature 35 | value: 1 36 | use_hints: 37 | desc: use hard coded field names ("total") as features 38 | value: 1 39 | 40 | vocab_size: 41 | desc: identify (1-hot encode) this many common tokens 42 | value: 512 43 | vocab_embed_size: 44 | desc: number of outputs in the vocab embedding 45 | value: 16 46 | 47 | # graph feature generation and utilization 48 | use_adjacency_matrix: 49 | desc: whether to generate adjacency matrices and load them in documents 50 | value: 0 51 | 52 | target_thresh: 53 | desc: throw away token matches to PP crowdsourced data that aren't at least this good 54 | value: 0.8 55 | 56 | # network size 57 | num_layers: 58 | desc: number of layers in model, 2 or 3 59 | value: 3 60 | layer_1_size_factor: 61 | desc: layer 1 size = this factor * window_len * token_dims 62 | value: 4 63 | layer_2_size_factor: 64 | desc: layer 2 size = this factor * window_len * token_dims 65 | value: 2 66 | layer_3_size_factor: 67 | desc: layer 3 size = this factor * window_len * token_dims 68 | value: 1 69 | dropout: 70 | value: 0.2 71 | 72 | # training config 73 | epochs: 74 | value: 50 75 | steps_per_epoch: 76 | value: 50 77 | batch_size: 78 | desc: batch size in windows (not docs) 79 | value: 10000 80 | positive_fraction: 81 | desc: target match scores larger than this will become positive labels 82 | value: 0.5 83 | permute_tokens: 84 | desc: randomly re-order tokens in each training window 85 | value: 0 86 | 87 | penalize_missed: 88 | desc: how much more a missed 1 counts than a missed 0 in outputs 89 | value: 5 90 | 91 | learning_rate: 92 | value: 0.001 93 | 94 | # Affects prediction 95 |
predict_thresh: 96 | desc: predictions below this value count as predicting "None" 97 | value: 0.5 98 | 99 | # These do not affect the training but control various setup and reporting 100 | render_results_size: 101 | desc: log this many PDF images on last epoch 102 | value: 20 103 | use_data_cache: 104 | desc: use pickled saved training data (freezes options like padding, amount_feature) 105 | value: 1 106 | doc_acc_max_sample_size: 107 | desc: never sample more than this many documents 108 | value: 1000 109 | doc_acc_sample_size: 110 | desc: sample epoch+this documents to compute doc_val_acc (uses all docs on last epoch) 111 | value: 10 112 | save_model: 113 | desc: whether to save the trained model 114 | value: 1 115 | model_path: 116 | desc: path to save the model (if not set, autogenerate) 117 | value: "" 118 | model_artifact_name: 119 | desc: used to identify saved models in Weights & Biases 120 | value: deepform-model 121 | use_wandb: 122 | desc: report run to wandb and store annotations 123 | value: 1 124 | log_level: 125 | desc: minimum level to report in the logs 126 | value: INFO 127 | -------------------------------------------------------------------------------- /data/create-training-data.py: -------------------------------------------------------------------------------- 1 | # This takes the token file and does a number of things: 2 | # - rejects documents with too few tokens (need OCR) or no ground truth 3 | # - normalizes page numbers in 0..1 4 | # - provides fuzzy matching scores for each token vs ground truth tokens 5 | 6 | import csv 7 | import decimal 8 | 9 | import numpy as np 10 | import pandas as pd 11 | from fuzzywuzzy import fuzz 12 | 13 | from util import is_dollar_amount, normalize_dollars 14 | 15 | output_docs = 0 16 | 17 | # Data in filings that we want to find. 18 | # We output a column for each one of these, indicating how close the token is to 19 | # the correct answer. 20 | # For our first experiment, just extract gross_amount 21 | # Other possible targets include 'committee','agency','callsign' 22 | targets = ["gross_amount", "contract_number", "committee"] 23 | 24 | filings = pd.read_csv("../source/ftf-all-filings.tsv", sep="\t") 25 | 26 | incsv = pd.read_parquet("training.parquet") 27 | 28 | outcols = ["slug", "page", "x0", "y0", "x1", "y1", "token", "gross_amount"] + targets 29 | outcsv = csv.DictWriter(open("training.csv", mode="w"), fieldnames=outcols) 30 | outcsv.writeheader() 31 | 32 | 33 | # computes fuzzy distance from each token in the series to the target answer for 34 | # the document answer may be multiple tokens, in which case we take the max of 35 | # matches. 36 | def multi_token_target_match(answer, tokens, target, max_n, anstok): 37 | best_match = [0 for i in range(max_n)] 38 | best_idx = [0 for i in range(max_n)] 39 | # Two dimensional because we will have one array for each possible n-gram length. 40 | ratioslist = np.zeros((max_n, len(tokens))) 41 | # For each possible number of tokens in answertoken: 42 | for i in range(max_n): 43 | # For each n-gram of that length in the doc: 44 | for idx in range(0, len(tokens) - i): 45 | # Make it one token so we can compare. 46 | token_string = "".join(str(t) for t in tokens[idx : idx + i + 1]) 47 | # Compare and store the float in match. 48 | match = fuzz.ratio(anstok, token_string) / 100.0 49 | # Update the ratioslist matrix with this match value for the n-gram 50 | # length and index. 51 | ratioslist[i, idx] = match 52 | # Update our vector of best matches for each n-gram. 
53 | if match > best_match[i]: 54 | best_match[i] = match 55 | best_idx[i] = idx 56 | print("best_match array: " + str(best_match)) 57 | best_len = np.argmax(best_match) + 1 58 | best_match_idx = best_idx[best_len - 1] 59 | print("Best choice for number of tokens: " + str(best_len)) 60 | print( 61 | "Best Match Token Sequence: " 62 | + str(tokens[best_match_idx : best_match_idx + best_len]) 63 | ) 64 | 65 | scores = np.zeros(len(tokens)) 66 | 67 | # Make a list of all indices from ratioslist[np.argmax(best_match),:] which 68 | # have the best match. 69 | best_idx_list = [ 70 | i 71 | for i, value in enumerate(ratioslist[np.argmax(best_match), :]) 72 | if value == best_match[best_len - 1] 73 | ] 74 | print("Target Occurs at Indices: " + str(best_idx_list)) 75 | 76 | # For each of these indices in scores, set the following best_len tokens 77 | # equal to best_match. 78 | for a in best_idx_list: 79 | for i in range(best_len): 80 | scores[a + i] = best_match[best_len - 1] 81 | 82 | return scores 83 | 84 | 85 | def target_match(answer, tokens, target, max_n): 86 | print() 87 | print("target: " + target) 88 | print("answer: " + str(answer)) 89 | anstok = ( 90 | str(answer).lower().replace(" ", "") 91 | ) # Remove spaces and make the answer lower case 92 | tokens = [token.lower() for token in tokens] # lowercase all the tokens also 93 | 94 | if target == "gross_amount": 95 | 96 | scores = [] 97 | max_n = 1 98 | for token in tokens: 99 | if is_dollar_amount(anstok) and is_dollar_amount(token): 100 | try: 101 | scores.append( 102 | fuzz.ratio(normalize_dollars(anstok), normalize_dollars(token)) 103 | / 100.0 104 | ) 105 | except decimal.InvalidOperation: 106 | # not a number, maybe a date? 107 | scores.append(fuzz.ratio(anstok, token) / 100.0) 108 | else: 109 | scores.append(fuzz.ratio(anstok, token) / 100.0) 110 | 111 | else: 112 | scores = multi_token_target_match(answer, tokens, target, max_n, anstok) 113 | 114 | return scores 115 | 116 | 117 | def process_doc(slug, rows, max_n): 118 | print() 119 | print() 120 | print("--------------------------------") 121 | print(f"Processing {slug} with {len(rows)} tokens") 122 | global output_docs 123 | if len(rows) < 10: 124 | # probably needs OCR 125 | print(f"Skipping {slug} because it has only {len(rows)} tokens") 126 | return 127 | 128 | answers = filings.loc[filings["dc_slug"] == slug] 129 | if len(answers) != 1: 130 | print(f"Skipping {slug} because it matches {len(answers)} rows") 131 | return 132 | answers = answers.iloc[0] 133 | 134 | if answers[targets].isnull().any(): 135 | print( 136 | f"Skipping {slug} because it is missing answers for " 137 | f"{[t for t in targets if pd.isnull(answers[t])]}" 138 | ) 139 | return 140 | 141 | df = pd.DataFrame(rows) 142 | 143 | page = pd.to_numeric(df["page"]) 144 | maxpage = page.max() 145 | if maxpage: # avoid div/0 for one page docs 146 | df["page"] = page / maxpage # last page = 1.0 147 | 148 | for t in targets: 149 | df[t] = target_match( 150 | answers[t], df["token"].fillna(""), t, max_n 151 | ) # The value of the answer and an array of the tokens for that slug 152 | 153 | for _, row in df.iterrows(): 154 | outcsv.writerow(row.to_dict()) 155 | 156 | output_docs += 1 157 | 158 | 159 | # --- Main --- 160 | # Accumulate all rows with the same slug 161 | # active_rows = [] 162 | # active_slug = None 163 | # input_docs = 0 164 | # max_n = 5 165 | # for row in incsv: 166 | # if row["slug"] != active_slug: 167 | # if active_slug: 168 | # process_doc(active_slug, active_rows, max_n) 169 | # input_docs += 1 170 | 
# active_slug = row["slug"] 171 | # active_rows = [row] 172 | # else: 173 | # active_rows.append(row) 174 | 175 | # print(f"Input documents {input_docs}") 176 | # print(f"Output documents {output_docs}") 177 | 178 | 179 | # --- Main --- 180 | # Accumulate all rows with the same slug 181 | active_rows = [] 182 | # active_slug = None 183 | input_docs = 0 184 | max_n = 5 185 | # for row in incsv: 186 | # if row["slug"] != active_slug: 187 | # if active_slug: 188 | # process_doc(active_slug, active_rows) 189 | # input_docs += 1 190 | # active_slug = row["slug"] 191 | # active_rows = [row] 192 | # else: 193 | # active_rows.append(row) 194 | n = 0 195 | for slug, group in incsv.groupby("slug"): 196 | process_doc(slug, group, max_n) 197 | n += 1 198 | if n > 200: 199 | break 200 | # print(f"Input documents {input_docs}") 201 | # print(f"Output documents {output_docs}") 202 | -------------------------------------------------------------------------------- /deepform/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/project-deepform/deepform/e6e1ff5a78e49cbc3c0625e4373b5b26f669e79f/deepform/__init__.py -------------------------------------------------------------------------------- /deepform/artifacts.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import wandb 4 | 5 | from deepform.common import MODEL_DIR, WANDB_PROJECT 6 | 7 | if __name__ == "__main__": 8 | parser = argparse.ArgumentParser(description="download a model stored in W&B") 9 | parser.add_argument( 10 | "-v", 11 | "--version", 12 | dest="version", 13 | help="model version to download", 14 | default="latest", 15 | ) 16 | args = parser.parse_args() 17 | 18 | run = wandb.init( 19 | project="model-download", 20 | job_type="ps", 21 | allow_val_change=True, 22 | ) 23 | config = run.config 24 | model_name = config.model_artifact_name 25 | artifact_name = f"{WANDB_PROJECT}/{model_name}:{args.version}" 26 | artifact = run.use_artifact(artifact_name, type="model") 27 | artifact_alias = artifact.metadata.get("name") or "unknown" 28 | artifact.download(root=(MODEL_DIR / artifact_alias)) 29 | -------------------------------------------------------------------------------- /deepform/combine_manifests.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | from deepform.common import DATA_DIR 7 | 8 | if os.path.exists(DATA_DIR / "3_year_manifest.csv"): 9 | os.remove(DATA_DIR / "3_year_manifest.csv") 10 | 11 | 12 | df12 = pd.read_csv( 13 | DATA_DIR / "2012_manifest.tsv", sep="\t" 14 | ) # Formerly called ftf-all-filings.tsv 15 | df12.insert(0, "serial_num", np.nan) 16 | df12.insert(0, "flight_from", np.nan) 17 | df12.insert(0, "flight_to", np.nan) 18 | df12.insert(0, "issues", np.nan) 19 | df12_new = df12.filter( 20 | [ 21 | "dc_slug", 22 | "serial_num", 23 | "gross_amount", 24 | "committee", 25 | "flight_from", 26 | "flight_to", 27 | "url", 28 | "issues", 29 | ], 30 | axis=1, 31 | ) 32 | df12_new.insert(0, "year", "2012") 33 | df12_new.columns = [ 34 | "year", 35 | "file_id", 36 | "contract_num", 37 | "gross_amount", 38 | "advertiser", 39 | "flight_from", 40 | "flight_to", 41 | "url", 42 | "issues", 43 | ] 44 | 45 | df14 = pd.read_csv( 46 | DATA_DIR / "2014_manifest.tsv", sep="\t" 47 | ) # Formerly called 2014-orders.tsv 48 | df14.insert(0, "gross_amount", np.nan) 49 | df14.insert(0, "url", np.nan) 50 | 
df14.insert(0, "issues", np.nan) 51 | df14_new = df14.filter( 52 | [ 53 | "id", 54 | "order_revision", 55 | "gross_amount", 56 | "advertiser", 57 | "flight_from", 58 | "flight_to", 59 | "url", 60 | "issues", 61 | ], 62 | axis=1, 63 | ) 64 | df14_new.insert(0, "year", "2014") 65 | df14_new.columns = [ 66 | "year", 67 | "file_id", 68 | "contract_num", 69 | "gross_amount", 70 | "advertiser", 71 | "flight_from", 72 | "flight_to", 73 | "url", 74 | "issues", 75 | ] 76 | 77 | df20 = pd.read_csv( 78 | DATA_DIR / "2020_manifest.csv" 79 | ) # Formerly called fcc-data-2020-sample-updated.csv 80 | df20_new = df20.filter( 81 | [ 82 | "full_file_name", 83 | "serial_num", 84 | "gross_amount", 85 | "advertiser", 86 | "flight_from", 87 | "flight_to", 88 | "url", 89 | "Issues ('', Type, or Token)", 90 | ], 91 | axis=1, 92 | ) 93 | df20_new.insert(0, "year", "2020") 94 | df20_new.columns = [ 95 | "year", 96 | "file_id", 97 | "contract_num", 98 | "gross_amount", 99 | "advertiser", 100 | "flight_from", 101 | "flight_to", 102 | "url", 103 | "issues", 104 | ] 105 | 106 | df = pd.concat([df12_new, df14_new, df20_new]) 107 | 108 | # df.set_index(["year", "slug"]).count(level="year") 109 | 110 | df.to_csv(DATA_DIR / "3_year_manifest.csv", index=False) 111 | -------------------------------------------------------------------------------- /deepform/common.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | S3_BUCKET = "project-deepform" 4 | 5 | ROOT_DIR = Path(__file__).absolute().parents[1] 6 | DATA_DIR = ROOT_DIR / "data" 7 | LOG_DIR = ROOT_DIR / "logs" 8 | PDF_DIR = DATA_DIR / "pdfs" 9 | TOKEN_DIR = DATA_DIR / "tokenized" 10 | LABELED_DIR = DATA_DIR / "labeled" 11 | TRAINING_DIR = DATA_DIR / "training" 12 | TRAINING_INDEX = TRAINING_DIR.parent / "doc_index.parquet" 13 | MODEL_DIR = DATA_DIR / "models" 14 | 15 | WANDB_PROJECT = "deepform_v1" 16 | -------------------------------------------------------------------------------- /deepform/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/project-deepform/deepform/e6e1ff5a78e49cbc3c0625e4373b5b26f669e79f/deepform/data/__init__.py -------------------------------------------------------------------------------- /deepform/data/add_features.py: -------------------------------------------------------------------------------- 1 | """ 2 | Process a parquet of all training data to add labels and computed features. 3 | 4 | Final data is stored individually (per-document) to enable random access of 5 | small samples, with an index over all the documents. 
6 | """ 7 | 8 | import argparse 9 | from concurrent.futures import ThreadPoolExecutor, as_completed 10 | from enum import Enum, auto 11 | from pathlib import Path 12 | 13 | import numpy as np 14 | import pandas as pd 15 | import scipy.sparse as sparse 16 | from fuzzywuzzy import fuzz 17 | from tqdm import tqdm 18 | 19 | from deepform.common import DATA_DIR, TOKEN_DIR, TRAINING_DIR, TRAINING_INDEX 20 | from deepform.data.create_vocabulary import get_token_id 21 | from deepform.data.graph_geometry import document_edges 22 | from deepform.logger import logger 23 | from deepform.util import ( 24 | date_similarity, 25 | default_similarity, 26 | dollar_similarity, 27 | is_dollar_amount, 28 | log_dollar_amount, 29 | ) 30 | 31 | 32 | class TokenType(Enum): 33 | NONE = 0 34 | CONTRACT_NUM = auto() 35 | ADVERTISER = auto() 36 | FLIGHT_FROM = auto() 37 | FLIGHT_TO = auto() 38 | GROSS_AMOUNT = auto() 39 | 40 | 41 | LABEL_COLS = { 42 | # Each label column, and the match function that it uses. 43 | "contract_num": default_similarity, 44 | "advertiser": default_similarity, 45 | "flight_from": date_similarity, 46 | "flight_to": date_similarity, 47 | "gross_amount": dollar_similarity, 48 | } 49 | 50 | 51 | def extend_and_write_docs( 52 | source_dir, 53 | manifest, 54 | pq_index, 55 | out_path, 56 | max_token_count, 57 | use_adjacency_matrix=False, 58 | ): 59 | """Split data into individual documents, add features, and write to parquet.""" 60 | 61 | token_files = {p.stem: p for p in source_dir.glob("*.parquet")} 62 | 63 | jobqueue = [] 64 | for row in manifest.itertuples(): 65 | slug = row.file_id 66 | if slug not in token_files: 67 | logger.error(f"No token file for {slug}") 68 | continue 69 | labels = {} 70 | for label_col in LABEL_COLS: 71 | labels[label_col] = getattr(row, label_col) 72 | if not labels[label_col]: 73 | logger.warning(f"'{label_col}' for {slug} is empty") 74 | jobqueue.append( 75 | { 76 | "token_file": token_files[slug], 77 | "dest_file": out_path / f"{slug}.parquet", 78 | "graph_file": out_path / f"{slug}.graph", 79 | "labels": labels, 80 | "max_token_count": max_token_count, 81 | "use_adjacency_matrix": use_adjacency_matrix, 82 | } 83 | ) 84 | 85 | # Spin up a bunch of jobs to do the conversion 86 | with ThreadPoolExecutor() as executor: 87 | doc_jobs = [] 88 | for kwargs in jobqueue: 89 | doc_jobs.append(executor.submit(process_document_tokens, **kwargs)) 90 | 91 | logger.debug("Waiting for jobs to complete") 92 | progress = tqdm(as_completed(doc_jobs), total=len(doc_jobs)) 93 | doc_results = [j.result() for j in progress] 94 | 95 | logger.debug(f"Writing document index to {pq_index}...") 96 | doc_index = pd.DataFrame(doc_results).set_index("slug", drop=True) 97 | doc_index.to_parquet(pq_index) 98 | 99 | 100 | def pq_index_and_dir(pq_index, pq_path=None): 101 | """Get directory for sharded training data, creating if necessary.""" 102 | pq_index = Path(pq_index).resolve() 103 | if pq_path is None: 104 | pq_path = TRAINING_DIR 105 | else: 106 | pq_path = Path(pq_path) 107 | pq_index.parent.mkdir(parents=True, exist_ok=True) 108 | pq_path.mkdir(parents=True, exist_ok=True) 109 | return pq_index, pq_path 110 | 111 | 112 | def process_document_tokens( 113 | token_file, 114 | dest_file, 115 | graph_file, 116 | labels, 117 | max_token_count, 118 | use_adjacency_matrix=False, 119 | ): 120 | """Filter out short tokens, add computed features, and return index info.""" 121 | slug = token_file.stem 122 | tokens = pd.read_parquet(token_file).reset_index(drop=True) 123 | doc, adjacency, 
best_matches = compute_features( 124 | tokens, labels, max_token_count, use_adjacency_matrix=use_adjacency_matrix 125 | ) 126 | doc.to_parquet(dest_file, index=False) 127 | if adjacency is not None: 128 | write_adjacency(graph_file, adjacency) 129 | # Return the summary information about the document. 130 | return {"slug": slug, "length": len(doc), **labels, **best_matches} 131 | 132 | 133 | def compute_features(tokens, labels, max_token_count, use_adjacency_matrix=False): 134 | doc = label_tokens(tokens, labels, max_token_count) 135 | 136 | # Strip whitespace off all tokens. 137 | doc["token"] = doc.token.str.strip() 138 | 139 | # Remove tokens shorter than three characters. 140 | doc = doc[doc.token.str.len() >= 3] 141 | 142 | # Extend with the straightforward features. 143 | doc = add_base_features(doc) 144 | 145 | # Handle the features that need the whole document. 146 | doc["label"] = np.zeros(len(doc), dtype="u1") 147 | # The "label" column stores the TokenType that correctly labels this token. 148 | # By default this is 0, or "NONE". 149 | best_matches = {} 150 | for feature in LABEL_COLS: 151 | token_value = TokenType[feature.upper()].value 152 | max_score = doc[feature].max() 153 | best_matches[f"best_match_{feature}"] = max_score 154 | matches = token_value * np.isclose(doc[feature], max_score) 155 | doc["label"] = np.maximum(doc["label"], matches) 156 | 157 | adjacency = document_edges(doc) if use_adjacency_matrix else None 158 | return doc, adjacency, best_matches 159 | 160 | 161 | def write_adjacency(graph_file, adjacency): 162 | sparse.save_npz(f"{graph_file}.npz", adjacency) 163 | 164 | 165 | def read_adjacency(graph_file): 166 | return sparse.load_npz(f"{graph_file}.npz") 167 | 168 | 169 | def label_tokens(tokens, labels, max_token_count): 170 | for col_name, label_value in labels.items(): 171 | tokens[col_name] = 0.0 172 | match_fn = LABEL_COLS[col_name] 173 | 174 | if col_name == "advertiser": 175 | tokens[col_name] = label_multitoken( 176 | tokens.token.to_numpy(), label_value, max_token_count, match_fn 177 | ) 178 | else: 179 | tokens[col_name] = tokens.token.apply(match_fn, args=(label_value,)) 180 | 181 | return tokens 182 | 183 | 184 | def label_multitoken(tokens, value, token_count, match_fn=default_similarity): 185 | best_match_values = np.array([match_fn(value, x) for x in tokens]) 186 | for c in range(1, token_count): 187 | texts = [" ".join(tokens[i - c : i]) for i in range(c, tokens.size)] 188 | match_values = np.array([match_fn(value, x) for x in texts] + [0] * c) 189 | for p in range(c): 190 | best_match_values = np.maximum(best_match_values, np.roll(match_values, p)) 191 | return best_match_values 192 | 193 | 194 | def fraction_digits(s): 195 | """Return the fraction of a string that is composed of digits.""" 196 | return np.mean([c.isdigit() for c in s]) if isinstance(s, str) else 0.0 197 | 198 | 199 | def match_string(a, b): 200 | m = fuzz.ratio(a.lower(), b.lower()) / 100.0 201 | return m if m >= 0.9 else 0 202 | 203 | 204 | def add_base_features(token_df): 205 | """Extend a DataFrame with features that can be pre-computed.""" 206 | df = token_df.copy() 207 | df["tok_id"] = df["token"].apply(get_token_id).astype("u2") 208 | df["length"] = df["token"].str.len().astype("i2") 209 | df["digitness"] = df["token"].apply(fraction_digits).astype("f4") 210 | df["is_dollar"] = df["token"].apply(is_dollar_amount).astype("f4") 211 | df["log_amount"] = df["token"].apply(log_dollar_amount).fillna(0).astype("f4") 212 | 213 | return df 214 | 215 | 216 | if __name__ == 
"__main__": 217 | parser = argparse.ArgumentParser(description=__doc__) 218 | parser.add_argument( 219 | "manifest", 220 | help="CSV with labels for each document", 221 | default=DATA_DIR / "3_year_manifest.csv", 222 | ) 223 | parser.add_argument( 224 | "indir", 225 | nargs="?", 226 | default=TOKEN_DIR, 227 | help="directory of document tokens", 228 | ) 229 | parser.add_argument( 230 | "indexfile", 231 | nargs="?", 232 | default=TRAINING_INDEX, 233 | help="path to index of resulting parquet files", 234 | ) 235 | parser.add_argument( 236 | "outdir", 237 | nargs="?", 238 | default=TRAINING_DIR, 239 | help="directory of parquet files", 240 | ) 241 | parser.add_argument( 242 | "--max-token-count", 243 | type=int, 244 | default=5, 245 | help="maximum number of contiguous tokens to match against each label", 246 | ) 247 | parser.add_argument( 248 | "--compute-graph", dest="use_adjacency_matrix", action="store_true" 249 | ) 250 | parser.set_defaults(use_adjacency_matrix=False) 251 | 252 | parser.add_argument("--log-level", dest="log_level", default="INFO") 253 | args = parser.parse_args() 254 | logger.setLevel(args.log_level.upper()) 255 | 256 | logger.info(f"Reading {Path(args.manifest).resolve()}") 257 | manifest = pd.read_csv(args.manifest) 258 | 259 | indir, index, outdir = Path(args.indir), Path(args.indexfile), Path(args.outdir) 260 | index.parent.mkdir(parents=True, exist_ok=True) 261 | outdir.mkdir(parents=True, exist_ok=True) 262 | extend_and_write_docs( 263 | indir, 264 | manifest, 265 | index, 266 | outdir, 267 | args.max_token_count, 268 | use_adjacency_matrix=args.use_adjacency_matrix, 269 | ) 270 | -------------------------------------------------------------------------------- /deepform/data/create_vocabulary.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | from string import ascii_lowercase 3 | 4 | import pandas as pd 5 | 6 | from deepform.common import DATA_DIR, TOKEN_DIR 7 | 8 | VOCAB_FILE = DATA_DIR / "token_frequency.csv" 9 | 10 | 11 | def clean(token): 12 | """Convert to lowercase and strip out anything other than ascii letters.""" 13 | return "".join(c for c in token.casefold() if c in ascii_lowercase) 14 | 15 | 16 | def per_document_tokens(): 17 | """Generator that produces the unique set of tokens for each document.""" 18 | for doc in TOKEN_DIR.glob("*.parquet"): 19 | yield pd.read_parquet(doc, columns=["token"]).token.apply(clean).unique() 20 | 21 | 22 | def per_document_token_count(): 23 | counts = Counter() 24 | for tokens in per_document_tokens(): 25 | counts.update(tokens) 26 | return counts 27 | 28 | 29 | def create_frequency_file(): 30 | counts = per_document_token_count() 31 | counts_df = pd.DataFrame(counts.most_common(), columns=["token", "count"]) 32 | counts_df.to_csv(VOCAB_FILE) 33 | 34 | 35 | def token_frequencies(): 36 | if not VOCAB_FILE.is_file(): 37 | create_frequency_file() 38 | return pd.read_csv(VOCAB_FILE) 39 | 40 | 41 | class Vocabulary: 42 | def __init__(self): 43 | vocab = token_frequencies().token 44 | self.token_ids = {t: i + 1 for i, t in enumerate(vocab)} 45 | 46 | def __getitem__(self, token): 47 | # Unrecognized words are assigned to 0. 
48 | return self.token_ids.get(clean(token), 0) 49 | 50 | 51 | def get_token_id(token): 52 | global _vocabulary_singleton 53 | try: 54 | return _vocabulary_singleton[token] 55 | except NameError: 56 | _vocabulary_singleton = Vocabulary() 57 | return _vocabulary_singleton[token] 58 | 59 | 60 | if __name__ == "__main__": 61 | create_frequency_file() 62 | -------------------------------------------------------------------------------- /deepform/data/graph_geometry.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import numpy.ma as ma 3 | import scipy.sparse as sparse 4 | 5 | 6 | def document_edges(tokens, relative_tolerance=0.01): 7 | """""" 8 | N = len(tokens) 9 | 10 | # For now we compute alignment of text tokens based on their lower left corner. 11 | dX = np.subtract.outer(tokens["x0"].to_numpy(), tokens["x0"].to_numpy()) 12 | dY = np.subtract.outer(tokens["y1"].to_numpy(), tokens["y1"].to_numpy()) 13 | page_mask = np.not_equal.outer(tokens["page"].to_numpy(), tokens["page"].to_numpy()) 14 | 15 | D = np.abs(dX) + np.abs(dY) 16 | V_sim = dY / D 17 | H_sim = dX / D 18 | 19 | dX_h_aligned = ma.masked_where( 20 | np.logical_or( 21 | page_mask, 22 | np.logical_not(np.isclose(np.abs(H_sim), 1, rtol=relative_tolerance)), 23 | ), 24 | dX, 25 | ) 26 | dY_v_aligned = ma.masked_where( 27 | np.logical_or( 28 | page_mask, 29 | np.logical_not(np.isclose(np.abs(V_sim), 1, rtol=relative_tolerance)), 30 | ), 31 | dY, 32 | ) 33 | 34 | test_right = ma.masked_where(np.greater(dX_h_aligned, 0), dX_h_aligned) 35 | test_bottom = ma.masked_where(np.greater(dY_v_aligned, 0), dY_v_aligned) 36 | 37 | right_max = np.argmax(test_right, axis=0) 38 | bottom_max = np.argmax(test_bottom, axis=0) 39 | 40 | adjacency = sparse.lil_matrix((N, N), dtype=np.bool_) 41 | 42 | for i in range(len(tokens)): 43 | if dX_h_aligned[i, right_max[i]]: 44 | adjacency[i, right_max[i]] = True 45 | adjacency[right_max[i], i] = True 46 | if dY_v_aligned[i, bottom_max[i]]: 47 | adjacency[i, bottom_max[i]] = True 48 | adjacency[bottom_max[i], i] = True 49 | 50 | return adjacency.tocoo() 51 | -------------------------------------------------------------------------------- /deepform/data/tokenize_pdfs.py: -------------------------------------------------------------------------------- 1 | """Create token data for each of the pdfs (or directories of pdfs) passed in.""" 2 | 3 | 4 | import argparse 5 | from concurrent.futures import ThreadPoolExecutor 6 | from pathlib import Path 7 | 8 | import numpy as np 9 | import pandas as pd 10 | import pdfplumber 11 | from tqdm import tqdm 12 | 13 | from deepform.common import PDF_DIR, TOKEN_DIR 14 | from deepform.data.add_features import add_base_features 15 | from deepform.document import FEATURE_COLS, Document 16 | from deepform.logger import logger 17 | from deepform.pdfs import get_pdf_path 18 | 19 | 20 | def tokenize_pdf(pdf_path): 21 | """Return a DataFrame of document token data for a pdf at the input path.""" 22 | pages = [] 23 | for i, page in enumerate(pdfplumber.open(pdf_path).pages): 24 | df = pd.DataFrame(page.extract_words()) 25 | if df.empty: 26 | continue 27 | df["page"] = i 28 | df["page"] = df["page"].astype("i2") 29 | df["x0"] = df["x0"].astype("f4") 30 | df["y0"] = df["top"].astype("f4") 31 | df["x1"] = df["x1"].astype("f4") 32 | df["y1"] = df["bottom"].astype("f4") 33 | df["token"] = df["text"].astype("string") 34 | pages.append(df[["page", "x0", "y0", "x1", "y1", "token"]]) 35 | if not pages: 36 | raise EOFError(f"No tokens 
found in {pdf_path}") 37 | return pd.concat(pages).reset_index(drop=True) 38 | 39 | 40 | def create_token_doc(pdf_path, token_dir=TOKEN_DIR, overwrite=False): 41 | pdf_path, token_dir = Path(pdf_path), Path(token_dir) 42 | assert pdf_path.is_file() and pdf_path.suffix == ".pdf" 43 | 44 | slug = pdf_path.stem 45 | token_path = token_dir / f"{slug}.parquet" 46 | if token_path.is_file(): 47 | if overwrite: 48 | logger.warning(f"Overwriting {token_path}") 49 | else: 50 | return 51 | 52 | try: 53 | tokens = tokenize_pdf(pdf_path) 54 | except EOFError: 55 | logger.warning(f"pdfplumber found no tokens in '{pdf_path}'") 56 | return 57 | except Exception as e: 58 | logger.error(f"Unable to tokenize {pdf_path}: {e}") 59 | return 60 | 61 | token_dir.mkdir(parents=True, exist_ok=True) 62 | tokens.to_parquet(token_path) 63 | return token_path 64 | 65 | 66 | def pdf_paths(*paths): 67 | for path in paths: 68 | path = Path(path) 69 | if path.is_file(): 70 | if path.suffix != ".pdf": 71 | logger.warning(f"Skipping non-pdf '{path}'") 72 | continue 73 | yield path 74 | elif path.is_dir(): 75 | for file_path in path.glob("*.pdf"): 76 | yield file_path 77 | else: 78 | logger.warning(f"'{path}' is not a file or directory") 79 | 80 | 81 | def create_token_docs_from_pdfs(*paths, overwrite=False): 82 | 83 | with ThreadPoolExecutor() as executor: 84 | pdf_files = list(pdf_paths(*paths)) 85 | print(f"Tokenizing {len(pdf_files):,} pdfs...") 86 | results = list( 87 | tqdm(executor.map(create_token_doc, pdf_files), total=len(pdf_files)) 88 | ) 89 | 90 | tokenized = [p for p in results if p] 91 | print(f"Tokenized {len(tokenized)} documents.") 92 | return tokenized 93 | 94 | 95 | def create_token_docs_from_slugs(slugs, token_dir=TOKEN_DIR): 96 | def tokenize(slug): 97 | pdf_file = get_pdf_path(slug) 98 | return create_token_doc(pdf_file, token_dir=token_dir) 99 | 100 | with ThreadPoolExecutor() as executor: 101 | print(f"Acquiring and tokenizing {len(slugs):,} documents...") 102 | results = list(tqdm(executor.map(tokenize, slugs), total=len(slugs))) 103 | 104 | tokenized = [p for p in results if p] 105 | print(f"Tokenized {len(tokenized)} documents.") 106 | return tokenized 107 | 108 | 109 | def extract_doc(pdf_path, window_len): 110 | """Create a Document with features extracted from a pdf.""" 111 | pdf_path = Path(pdf_path) 112 | tokens = tokenize_pdf(pdf_path) 113 | # Remove tokens shorter than three characters. 114 | df = tokens[tokens["token"].str.len() >= 3] 115 | df = add_base_features(df) 116 | df["tok_id"] = np.minimum(511, df["tok_id"]) 117 | return Document( 118 | slug=pdf_path.stem, 119 | tokens=df, 120 | features=df[FEATURE_COLS].to_numpy(dtype=float), 121 | labels=np.zeros(len(df), dtype=bool), # Dummy. 122 | positive_windows=np.array(0), # Dummy. 
123 | window_len=window_len, 124 | label_values={}, 125 | ) 126 | 127 | 128 | if __name__ == "__main__": 129 | parser = argparse.ArgumentParser(description=__doc__) 130 | parser.add_argument( 131 | "-f", 132 | "--force", 133 | type=bool, 134 | default=False, 135 | help="overwrite existing token files", 136 | ) 137 | parser.add_argument( 138 | "pdf", 139 | nargs="?", 140 | default=PDF_DIR, 141 | help="pdf or directory of pdfs to process", 142 | ) 143 | parser.add_argument("--log-level", dest="log_level", default="ERROR") 144 | args = parser.parse_args() 145 | logger.setLevel(args.log_level.upper()) 146 | 147 | create_token_docs_from_pdfs(args.pdf, overwrite=args.force) 148 | -------------------------------------------------------------------------------- /deepform/db/.env: -------------------------------------------------------------------------------- 1 | MYSQL_ROOT_PASSWORD=changeme 2 | -------------------------------------------------------------------------------- /deepform/db/README.md: -------------------------------------------------------------------------------- 1 | # Database 2 | 3 | Tokenized data is stored in a Mariadb database. To run Mariadb locally: 4 | 5 | ## Setup 6 | 7 | Although we're running Mariadb in a Docker container, you'll probably want the MySQL command line utilities. If you don't already have these, you can install them with `brew install mysql` in OS X. 8 | 9 | To run the Docker container, run the following command optionally changing the password set in `.env`. 10 | 11 | ``` 12 | docker run --name mariadb -v data:/var/lib/mysql -v conf:/etc/mysql/conf.d --env-file .env -p=3306:3306 -d mariadb:10.5.1 13 | ``` 14 | 15 | The data loading scripts are useful for loading the example data into the database and assume execution from this directory. The scripts also assume the existence of the files `source/ftf-all-filings.tsv` and `data/training.csv` in this repository. 16 | 17 | ``` 18 | mysql -uroot -p --protocol tcp < scripts/create_schema.sql 19 | mysql -uroot -p --protocol tcp deepform < scripts/load_document_data.sql 20 | mysql -uroot -p --protocol tcp deepform < scripts/load_token_data.sql 21 | ``` 22 | 23 | ## Further notes 24 | 25 | When running a Mariadb database in Docker, you'll need to specify the protocol to use when interacting with the database like so: 26 | 27 | ``` 28 | mysql -uroot -p -e "SHOW CREATE DATABASE deepform;" --protocol tcp deepform 29 | ``` 30 | 31 | The `mysql` command defaults to using unix file sockets if no protocol is specified, and won't connect to the database. 
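
Once the schema and data are loaded, documents can also be read from Python via `deepform/db/source.py`. A minimal sketch, assuming the default credentials from `.env`:

```
from deepform.db.source import connection, input_docs

conn = connection("root", "changeme")  # password set in .env
for dc_slug, committee, gross_amount_usd, tokens in input_docs(conn, max_docs=2):
    print(dc_slug, committee, gross_amount_usd, len(tokens))
```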
32 | -------------------------------------------------------------------------------- /deepform/db/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/project-deepform/deepform/e6e1ff5a78e49cbc3c0625e4373b5b26f669e79f/deepform/db/__init__.py -------------------------------------------------------------------------------- /deepform/db/conf/config-file.cnf: -------------------------------------------------------------------------------- 1 | bind-address=0.0.0.0 2 | -------------------------------------------------------------------------------- /deepform/db/scripts/create_schema.sql: -------------------------------------------------------------------------------- 1 | CREATE DATABASE `deepform`; 2 | 3 | USE `deepform`; 4 | 5 | CREATE TABLE `document` ( 6 | `id` int(11) unsigned NOT NULL AUTO_INCREMENT, 7 | `dc_slug` varchar(256) DEFAULT NULL, 8 | `filing_type` varchar(256) DEFAULT NULL, 9 | `contract_number` int(11) DEFAULT NULL, 10 | `url` varchar(256) DEFAULT NULL, 11 | `committee` varchar(256) DEFAULT NULL, 12 | `agency` varchar(256) DEFAULT NULL, 13 | `callsign` varchar(10) DEFAULT NULL, 14 | `thumbnail_url` varchar(256) DEFAULT NULL, 15 | `market_id` int(11) DEFAULT NULL, 16 | `upload_date` datetime DEFAULT NULL, 17 | `gross_amount_usd` double DEFAULT NULL, 18 | PRIMARY KEY (`id`), 19 | UNIQUE KEY `uniq_dc_slug` (`dc_slug`) 20 | ) ENGINE=InnoDB AUTO_INCREMENT=68175 DEFAULT CHARSET=latin1; 21 | 22 | CREATE TABLE `token` ( 23 | `id` int(11) unsigned NOT NULL AUTO_INCREMENT, 24 | `dc_slug` varchar(256) DEFAULT NULL, 25 | `page` float DEFAULT NULL, 26 | `x0` double DEFAULT NULL, 27 | `y0` double DEFAULT NULL, 28 | `x1` double DEFAULT NULL, 29 | `y1` double DEFAULT NULL, 30 | `token` varchar(256) DEFAULT NULL, 31 | `gross_amount` double DEFAULT NULL, 32 | PRIMARY KEY (`id`), 33 | KEY `origin_document` (`dc_slug`) 34 | ) ENGINE=InnoDB AUTO_INCREMENT=14024491 DEFAULT CHARSET=latin1; 35 | -------------------------------------------------------------------------------- /deepform/db/scripts/load_document_data.sql: -------------------------------------------------------------------------------- 1 | LOAD DATA LOCAL INFILE '../source/ftf-all-filings.tsv' 2 | INTO TABLE document 3 | COLUMNS TERMINATED BY '\t' 4 | IGNORE 1 LINES 5 | (id, filing_type, contract_number, url, committee, agency, callsign, dc_slug, thumbnail_url, gross_amount_usd, market_id, upload_date); 6 | -------------------------------------------------------------------------------- /deepform/db/scripts/load_token_data.sql: -------------------------------------------------------------------------------- 1 | ALTER TABLE token DISABLE KEYS; 2 | BEGIN; 3 | LOAD DATA LOCAL INFILE '../data/training.csv' 4 | INTO TABLE token 5 | COLUMNS TERMINATED BY ',' 6 | IGNORE 1 LINES 7 | (dc_slug,page,x0,y0,x1,y1,token,gross_amount); 8 | COMMIT; 9 | ALTER TABLE token ENABLE KEYS; 10 | -------------------------------------------------------------------------------- /deepform/db/source.py: -------------------------------------------------------------------------------- 1 | import string 2 | 3 | import pandas as pd 4 | from sqlalchemy import create_engine 5 | 6 | charset = string.printable + "\t\n\x00" 7 | 8 | 9 | def connection(user, password, host="127.0.0.1", port=3306, dbname="deepform"): 10 | engine = create_engine( 11 | f"mysql+mysqldb://{user}:{password}@{host}:{port}/{dbname}", pool_recycle=3600 12 | ) 13 | return engine.connect() 14 | 15 | 16 | def clean_text(text): 17 | 
def clean_char(c): 18 | if c in charset: 19 | return c 20 | else: 21 | return "\x00" 22 | 23 | return [clean_char(x) for x in text] 24 | 25 | 26 | def input_generator(conn, max_docs=10, truncate_length=3000): 27 | documents = pd.read_sql( 28 | f"select * from document " 29 | f"where committee != '' order by rand() limit {max_docs};", 30 | conn, 31 | ) 32 | for document in documents.itertuples(): 33 | doc_id = document.dc_slug 34 | tokens = pd.read_sql(f"select * from token where dc_slug = '{doc_id}';", conn) 35 | text = " ".join([str(token) for token in tokens["token"]]) 36 | # yield clean_text(text), clean_text(document.committee) 37 | yield text, document.committee 38 | 39 | 40 | def input_docs(conn, max_docs=10, minimum_doc_length=30): 41 | try: 42 | emitted_docs = 0 43 | raw_conn = conn.engine.raw_connection() 44 | cursor = raw_conn.cursor() 45 | cursor.execute( 46 | "select dc_slug, committee, gross_amount_usd from document where committee \ 47 | != '' order by rand()" 48 | ) 49 | while emitted_docs < max_docs: 50 | doc = cursor.fetchone() 51 | if doc: 52 | dc_slug, committee, gross_amount_usd = (doc[0], doc[1], doc[2]) 53 | rows = pd.read_sql( 54 | f"select * from token where dc_slug = '{dc_slug}';", conn 55 | ) 56 | if len(rows) < minimum_doc_length: 57 | continue 58 | else: 59 | yield dc_slug, committee, gross_amount_usd, rows 60 | emitted_docs += 1 61 | else: 62 | break 63 | finally: 64 | conn.close() 65 | 66 | 67 | if __name__ == "__main__": 68 | conn = connection("root", "changeme") 69 | docs = input_docs(conn) 70 | for doc in docs: 71 | print(doc) 72 | print("*****") 73 | -------------------------------------------------------------------------------- /deepform/document.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from dataclasses import dataclass 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import scipy.sparse as sparse 8 | 9 | from deepform.data.add_features import TokenType, read_adjacency 10 | from deepform.features import fix_dtypes 11 | from deepform.util import any_match, pad_sparse_matrix 12 | 13 | FEATURE_COLS = [ 14 | "tok_id", 15 | "page", 16 | "x0", 17 | "y0", 18 | "length", 19 | "digitness", 20 | "is_dollar", 21 | "log_amount", 22 | ] 23 | NUM_FEATURES = len(FEATURE_COLS) 24 | 25 | TOKEN_COLS = [ 26 | "token", 27 | "x0", 28 | "y0", 29 | "x1", 30 | "y1", 31 | "page", 32 | # The following are "match %" for the known fields 33 | "contract_num", 34 | "advertiser", 35 | "flight_from", 36 | "flight_to", 37 | "gross_amount", 38 | ] 39 | 40 | 41 | # This sets which field the model is looking for. 42 | SINGLE_CLASS_PREDICTION = "gross_amount" 43 | 44 | 45 | @dataclass 46 | class Window: 47 | """A Window just holds views to the arrays held by a Document.""" 48 | 49 | tokens: pd.DataFrame 50 | features: np.ndarray 51 | labels: np.ndarray 52 | 53 | def __len__(self): 54 | return len(self.labels) 55 | 56 | 57 | @dataclass(frozen=True) 58 | class Document: 59 | slug: str 60 | # tokens, features, and labels are all aligned with the same indices. 61 | tokens: pd.DataFrame 62 | features: np.ndarray 63 | labels: np.ndarray 64 | # positive_windows is a list of which (starting) indices have a match. 
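    # That is, the starting indices i for which tokens[i : i + window_len] contains at least one labeled token.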
65 | positive_windows: np.ndarray 66 | window_len: int 67 | label_values: dict[str, str] 68 | adjacency_matrix: sparse.coo_matrix 69 | 70 | def random_window(self, require_positive=False): 71 | if require_positive and len(self.positive_windows): 72 | index = np.random.choice(self.positive_windows) 73 | else: 74 | index = np.random.randint(len(self)) 75 | return self[index] 76 | 77 | def __getitem__(self, n): 78 | """Return the `n`th window in the document.""" 79 | k = n + self.window_len 80 | return Window(self.tokens.iloc[n:k], self.features[n:k], self.labels[n:k]) 81 | 82 | def __len__(self): 83 | """Return the number of windows in the document. 84 | 85 | Note that unless window_len=1, this is less than the number of tokens. 86 | """ 87 | return len(self.labels) - self.window_len + 1 88 | 89 | def __iter__(self): 90 | """Iterate over all windows in the document in order.""" 91 | for i in range(len(self)): 92 | yield self[i] 93 | 94 | def predict_scores(self, model): 95 | """Use a model to predict labels for each of the document tokens.""" 96 | windowed_features = np.stack([window.features for window in self]) 97 | window_scores = model.predict(windowed_features) 98 | 99 | num_windows = len(self.labels) 100 | scores = np.zeros((num_windows, len(TokenType))) 101 | for i, window_score in enumerate(window_scores): 102 | scores[i : i + self.window_len, :] += window_score / self.window_len 103 | 104 | return scores 105 | 106 | def predict_answer(self, model, threshold): 107 | """Score each token and return all texts that exceed the threshold.""" 108 | # The first score column is how "irrelevant" a token is, so drop it. 109 | scores = self.predict_scores(model)[:, 1:] 110 | 111 | score_texts, individual_scores = [], [] 112 | for column in scores.T: 113 | text, score = best_token(column, self.tokens.token, threshold) 114 | score_texts.append(text) 115 | individual_scores.append(score) 116 | 117 | return score_texts, individual_scores, scores 118 | 119 | def show_predictions(self, pred_texts, pred_scores, scores): 120 | """Predict token scores and print them alongside the tokens and true labels.""" 121 | title = f"======={self.slug}=======" 122 | predicted = "field (predicted / actual ):\n" 123 | 124 | df = pd.DataFrame({"token": self.tokens.token.str.slice(0, 20)}) 125 | df["label"] = [TokenType(x).name if x else "" for x in self.labels] 126 | 127 | for i, item in enumerate(self.label_values.items()): 128 | name, value = item 129 | x = "✔️" if any_match(pred_texts[i], value) else "❌" 130 | predicted += f"\t{x}{name}: {pred_texts[i]} / {value} <{pred_scores[i]}>\n" 131 | df[name] = [f"{'*' if s > 0.5 else ''} {s:0.5f}" for s in scores[:, i]] 132 | 133 | df = df.iloc[self.window_len - 1 : 1 - self.window_len] 134 | return "\n".join([title, predicted, df.to_string()]) 135 | 136 | @staticmethod 137 | def from_parquet(slug, label_values, pq_path, graph_path, config): 138 | """Load precomputed features from a parquet file and apply a config.""" 139 | df = pd.read_parquet(pq_path) 140 | 141 | df["tok_id"] = ( 142 | np.minimum(df["tok_id"], config.vocab_size - 1) * config.use_string 143 | ) 144 | df["page"] *= config.use_page 145 | df["x0"] *= config.use_geom 146 | df["y0"] *= config.use_geom 147 | df["log_amount"] *= config.use_amount 148 | 149 | adjacency = read_adjacency(graph_path) if config.use_adjacency_matrix else None 150 | 151 | if config.pad_windows: 152 | df = pad_df(df, config.window_len - 1) 153 | if adjacency is not None: 154 | adjacency = pad_adjacency(adjacency, config.window_len - 1) 
155 | fix_dtypes(df) 156 | 157 | # Pre-compute which windows have the desired token. 158 | positive_windows = [] 159 | for i in range(len(df) - config.window_len): 160 | if df["label"].iloc[i : i + config.window_len].any(): 161 | positive_windows.append(i) 162 | 163 | # We're no longer requiring that there exists a correct answer. 164 | # assert len(positive_windows) > 0 165 | 166 | return Document( 167 | slug=slug, 168 | tokens=df[TOKEN_COLS], 169 | features=df[FEATURE_COLS].to_numpy(dtype=float), 170 | labels=df["label"].to_numpy(dtype=int), 171 | positive_windows=np.array(positive_windows), 172 | window_len=config.window_len, 173 | label_values=label_values, 174 | adjacency_matrix=adjacency, 175 | ) 176 | 177 | 178 | def pad_df(df, num_rows): 179 | """Add `num_rows` NaNs to the start and end of a DataFrame.""" 180 | if num_rows: 181 | zeros = pd.DataFrame(index=pd.RangeIndex(num_rows)) 182 | return pd.concat([zeros, df, zeros]).reset_index(drop=True) 183 | else: 184 | return df 185 | 186 | 187 | def pad_adjacency(adjacency, num_rows): 188 | """Add blank rows to the square adjacency matrix""" 189 | if num_rows: 190 | return pad_sparse_matrix(adjacency, num_rows, num_rows) 191 | else: 192 | return adjacency 193 | 194 | 195 | def actual_value(df, value_col, match_col): 196 | """Return the best value from `value_col`, as evaluated by `match_col`.""" 197 | index = df[match_col].argmax() 198 | return df.iloc[index][value_col] 199 | 200 | 201 | def best_token(scores, tokens, threshold): 202 | # All runs of tokens where each token meets the threshold. 203 | options = list(selected_tokens(scores, tokens, threshold)) 204 | if options: 205 | # Take the text with the highest score. 206 | score, text = list(sorted(options, key=lambda t: t[0] * len(t[1])))[-1] 207 | else: 208 | # No sequence meets the threshold, so choose the best single token. 
209 | text = tokens[np.argmax(scores)] 210 | score = np.max(scores) 211 | return text, score 212 | 213 | 214 | def selected_tokens(scores, tokens, threshold): 215 | """Yield all consecutive runs of tokens where each token exceeds the threshold.""" 216 | current_strings, current_score, count = [], 0, 0 217 | for s, t in zip(scores, tokens): 218 | if s > threshold: 219 | current_strings.append(t) 220 | current_score += s 221 | count += 1 222 | elif count > 0: 223 | yield current_score / count, " ".join(current_strings) 224 | current_strings, current_score, count = [], 0, 0 225 | if count > 0: 226 | yield current_score / count, " ".join(current_strings) 227 | -------------------------------------------------------------------------------- /deepform/document_store.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import random 3 | from dataclasses import dataclass 4 | from pathlib import Path 5 | 6 | import pandas as pd 7 | from joblib import dump, load 8 | from tqdm import tqdm 9 | 10 | from deepform.data.add_features import LABEL_COLS, pq_index_and_dir 11 | from deepform.document import Document 12 | from deepform.logger import logger 13 | 14 | 15 | @dataclass(frozen=True) 16 | class DocumentStore: 17 | documents: list 18 | 19 | def __len__(self): 20 | return len(self.documents) 21 | 22 | def __iter__(self): 23 | for doc in self.documents: 24 | yield doc 25 | 26 | def __getitem__(self, n): 27 | """Return the pre-processed tokens for a specified document.""" 28 | return self.documents[n] 29 | 30 | def random_document(self): 31 | return random.choice(self.documents) 32 | 33 | def sample(self, n=None): 34 | if n is None: 35 | n = len(self) 36 | return DocumentStore(random.sample(self.documents, k=n)) 37 | 38 | def split(self, val_percent=0.2): 39 | """Divide into two DocumentStores, e.g. a training and a validation set.""" 40 | docs_copy = copy.deepcopy(self.documents) 41 | random.shuffle(docs_copy) 42 | split_index = int(val_percent * len(self)) 43 | return DocumentStore(docs_copy[:split_index]), DocumentStore( 44 | docs_copy[split_index:] 45 | ) 46 | 47 | @staticmethod 48 | def open(index_file, config): 49 | """Load the documents referenced by `index_file` and apply `config`.""" 50 | index_file = Path(index_file) 51 | doc_index = pd.read_parquet(index_file) 52 | logger.info(f"{len(doc_index)} documents in index") 53 | 54 | if not config.pad_windows: 55 | # Filter out documents that are too short for the curent config. 56 | doc_index = doc_index[doc_index["length"] >= config.window_len] 57 | 58 | # Filter out documents that don't have a sufficiently high match. 59 | # doc_index = doc_index[doc_index["best_match"] >= config.target_thresh] 60 | logger.info(f"After applying config {len(doc_index)} documents are available") 61 | 62 | # Sample down to no more than the requested number of documents. 63 | num_docs = min(config.len_train, len(doc_index)) 64 | doc_index = doc_index.sample(n=num_docs) 65 | 66 | # Load each of the documents, finishing any necessary feature computation. 
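        # caching_doc_getter returns a closure that builds a Document from
        # "{slug}.parquet" (plus "{slug}.graph" when the adjacency matrix is
        # used) and, if config.use_data_cache is set, stores the result as a
        # joblib file so later runs can skip recomputing it.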
67 | slug_to_doc = caching_doc_getter(index_file, config) 68 | # docs = concurrent.thread_map(slug_to_doc, doc_index["slug"]) 69 | 70 | labels = doc_index[LABEL_COLS.keys()] 71 | docs = [ 72 | slug_to_doc(slug, labels.loc[slug]) 73 | for slug in tqdm(doc_index.index, desc="Creating docs") 74 | ] 75 | docs = [d for d in docs if d != None] # noqa: E711 76 | 77 | return DocumentStore(docs) 78 | 79 | 80 | def caching_doc_getter(index_file, config): 81 | _, pq_root = pq_index_and_dir(index_file) 82 | if config.use_data_cache: 83 | cache_root = pq_root.parent / "cache" / cache_master_key(config) 84 | cache_root.mkdir(parents=True, exist_ok=True) 85 | 86 | def slug_to_doc(slug, labels): 87 | pq_path = pq_root / f"{slug}.parquet" 88 | graph_path = pq_root / f"{slug}.graph" 89 | if config.use_data_cache: 90 | cache_path = cache_root / f"{slug}.joblib" 91 | try: 92 | with open(cache_path, "rb") as infile: 93 | return load(infile) 94 | except FileNotFoundError: 95 | logger.debug(f"Cache file {cache_path} not found") 96 | try: 97 | doc = Document.from_parquet(slug, labels, pq_path, graph_path, config) 98 | except AssertionError: 99 | logger.warning(f"No correct answers for {slug}, skipping") 100 | return None 101 | if config.use_data_cache: 102 | with open(cache_path, "wb") as outfile: 103 | dump(doc, outfile) 104 | logger.debug(f"Wrote document to cache file {cache_path}") 105 | return doc 106 | 107 | return slug_to_doc 108 | 109 | 110 | def cache_master_key(config): 111 | """Create a string determined by any cache-invalidating config elements.""" 112 | return ( 113 | "str{use_string}_" 114 | "vocab{vocab_size}_" 115 | "pg{use_page}_" 116 | "geom{use_geom}_" 117 | "amt{use_amount}_" 118 | "pad{pad_windows}_" 119 | "len{window_len}" 120 | ).format(**config) 121 | -------------------------------------------------------------------------------- /deepform/features.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | STRING_COLS = ["slug", "token"] 4 | INT_COLS = ["tok_id", "length", "label"] 5 | FLOAT_COLS = [ 6 | "page", 7 | "x0", 8 | "y0", 9 | "x1", 10 | "y1", 11 | "gross_amount", 12 | "match", 13 | "digitness", 14 | "log_amount", 15 | ] 16 | BOOL_COLS = ["is_dollar"] 17 | 18 | 19 | def fix_type(df, col, na_value, dtype, downcast=False): 20 | if col not in df.columns: 21 | return 22 | df[col] = df[col].fillna(na_value).astype(dtype) 23 | if downcast: 24 | try: 25 | df[col] = pd.to_numeric(df[col], downcast=dtype) 26 | except ValueError: 27 | print(f"Unable to downcast column {col} as {dtype}") 28 | print(df[col]) 29 | 30 | 31 | def fix_dtypes(df): 32 | # Use new-style Pandas string types. 
33 | for col in STRING_COLS: 34 | fix_type(df, col, na_value="", dtype="string") 35 | 36 | for col in BOOL_COLS: 37 | fix_type(df, col, na_value=0, dtype="bool") 38 | 39 | for col in INT_COLS: 40 | fix_type(df, col, na_value=0, dtype="int") 41 | 42 | for col in FLOAT_COLS: 43 | fix_type(df, col, na_value=0.0, dtype="float", downcast=True) 44 | -------------------------------------------------------------------------------- /deepform/infer.py: -------------------------------------------------------------------------------- 1 | """Use a model to infer predicted values for a document.""" 2 | 3 | 4 | import argparse 5 | from concurrent.futures import ThreadPoolExecutor 6 | 7 | import numpy as np 8 | import pandas as pd 9 | 10 | from deepform.common import DATA_DIR, TOKEN_DIR 11 | from deepform.data.add_features import TokenType, add_base_features, as_completed 12 | from deepform.data.tokenize_pdfs import extract_doc 13 | from deepform.document import FEATURE_COLS, Document, pad_df 14 | from deepform.features import fix_dtypes 15 | from deepform.model import load_model 16 | 17 | 18 | def infer_from_pdf(pdf_path, model=None, window_len=None): 19 | """Extract features from a PDF and run infrence on it.""" 20 | if not model: 21 | model, window_len = load_model() 22 | if not window_len: 23 | raise Exception("No window_len param provided or inferrable") 24 | 25 | doc = extract_doc(pdf_path, window_len) 26 | 27 | best_score_texts, individual_scores, _ = doc.predict_answer(model) 28 | 29 | # TODO: clean up the column name from the token type enum 30 | predictions = { 31 | str(column.name.lower()): {"prediction": text, "score": score} 32 | for text, score, column in zip( 33 | best_score_texts, individual_scores, np.array(TokenType)[1:] 34 | ) 35 | } 36 | 37 | return predictions 38 | 39 | 40 | def predict(token_file, model, window_len): 41 | slug = token_file.stem 42 | doc = tokens_to_doc(token_file, window_len) 43 | 44 | predict_texts, predict_scores, _ = doc.predict_answer(model, 0.5) 45 | fields = [tt.name.lower() for tt in TokenType if tt.value > 0] 46 | predictions = [] 47 | for field, text, score in zip(fields, predict_texts, predict_scores): 48 | predictions.append({"slug": slug, "field": field, "text": text, "score": score}) 49 | return pd.DataFrame(predictions) 50 | 51 | 52 | def predict_many(token_files, model_file): 53 | model, window_len = load_model(args.model) 54 | return pd.concat(predict(t, model, window_len) for t in token_files) 55 | 56 | 57 | def tokens_to_doc(token_file, window_len=25): 58 | """Create a Document with features extracted from a pdf.""" 59 | tokens = pd.read_parquet(token_file) 60 | # Remove tokens shorter than three characters. 61 | df = tokens[tokens["token"].str.len() >= 3] 62 | df = add_base_features(df) 63 | df["tok_id"] = np.minimum(511, df["tok_id"]) 64 | df = pad_df(df, window_len - 1) 65 | fix_dtypes(df) 66 | return Document( 67 | slug=token_file.stem, 68 | tokens=df, 69 | features=df[FEATURE_COLS].to_numpy(dtype=float), 70 | labels=np.zeros(len(df), dtype=bool), # Dummy. 71 | positive_windows=np.array(0), # Dummy. 
72 | window_len=window_len, 73 | label_values={}, 74 | ) 75 | 76 | 77 | if __name__ == "__main__": 78 | parser = argparse.ArgumentParser(description=__doc__) 79 | parser.add_argument( 80 | "-m", "--model", dest="model", help="model file to use in prediction" 81 | ) 82 | args = parser.parse_args() 83 | 84 | manifest = pd.read_csv(DATA_DIR / "fcc-data-2020-labeled-manifest.csv") 85 | slugs = set(manifest.file_id) 86 | token_files = [t for t in TOKEN_DIR.glob("*.parquet") if t.stem in slugs] 87 | token_files.sort() 88 | 89 | # Spin up a bunch of jobs to do the conversion 90 | with ThreadPoolExecutor() as executor: 91 | doc_jobs = [] 92 | for i in range(0, len(token_files), 100): 93 | batch = token_files[i : i + 100] 94 | doc_jobs.append(executor.submit(predict_many, batch, args.model)) 95 | 96 | doc_results = [] 97 | for p in as_completed(doc_jobs): 98 | result = p.result() 99 | doc_results.append(result) 100 | print(result.to_string()) 101 | 102 | results = pd.concat(doc_results).reset_index(drop=True) 103 | results.to_csv("predict_on_known.csv", index=False) 104 | -------------------------------------------------------------------------------- /deepform/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | logger = logging.getLogger(__name__) 4 | -------------------------------------------------------------------------------- /deepform/model.py: -------------------------------------------------------------------------------- 1 | import random 2 | from datetime import datetime 3 | from pathlib import Path 4 | 5 | import numpy as np 6 | import tensorflow as tf 7 | from tensorflow import keras 8 | from tensorflow.keras.layers import ( 9 | Dense, 10 | Dropout, 11 | Embedding, 12 | Flatten, 13 | Lambda, 14 | Reshape, 15 | Softmax, 16 | concatenate, 17 | ) 18 | from tensorflow.keras.models import Model 19 | 20 | from deepform.common import MODEL_DIR 21 | from deepform.data.add_features import TokenType 22 | from deepform.document import NUM_FEATURES 23 | from deepform.util import git_short_hash 24 | 25 | 26 | # control the fraction of windows that include a positive label. not efficient. 
27 | def one_window(dataset, config): 28 | require_positive = random.random() > config.positive_fraction 29 | window = dataset.random_document().random_window(require_positive) 30 | if config.permute_tokens: 31 | shuffle = np.random.permutation(config.window_len) 32 | window.features = window.features[shuffle] 33 | window.labels = window.labels[shuffle] 34 | return window 35 | 36 | 37 | def windowed_generator(dataset, config): 38 | # Create empty arrays to contain batch of features and labels# 39 | batch_features = np.zeros((config.batch_size, config.window_len, NUM_FEATURES)) 40 | batch_labels = np.zeros((config.batch_size, config.window_len)) 41 | 42 | while True: 43 | for i in range(config.batch_size): 44 | window = one_window(dataset, config) 45 | batch_features[i, :, :] = window.features 46 | batch_labels[i, :] = window.labels # tf.one_hot(window.labels, 2) 47 | yield batch_features, batch_labels 48 | 49 | 50 | # ---- Custom loss function is basically MSE but high penalty for missing a 1 label --- 51 | def missed_token_loss(one_penalty): 52 | def _missed_token_loss(y_true, y_pred): 53 | expected_zero = tf.cast(tf.math.equal(y_true, 0), tf.float32) 54 | s = y_pred * expected_zero 55 | zero_loss = tf.keras.backend.mean(tf.keras.backend.square(s)) 56 | expected_one = tf.cast(tf.math.equal(y_true, 1), tf.float32) 57 | t = one_penalty * (1 - y_pred) * expected_one 58 | one_loss = tf.keras.backend.mean(tf.keras.backend.square(t)) 59 | return zero_loss + one_loss 60 | 61 | return _missed_token_loss # closes over one_penalty 62 | 63 | 64 | # --- Specify network --- 65 | def create_model(config): 66 | indata = tf.keras.Input((config.window_len, NUM_FEATURES)) 67 | 68 | # split into the hash and the rest of the token features, embed hash as 69 | # one-hot, then merge 70 | def create_tok_hash(x): 71 | import tensorflow as tf 72 | 73 | return tf.squeeze(tf.slice(x, (0, 0, 0), (-1, -1, 1)), axis=2) 74 | 75 | def create_tok_features(x): 76 | import tensorflow as tf 77 | 78 | return tf.slice(x, (0, 0, 1), (-1, -1, -1)) 79 | 80 | tok_hash = Lambda(create_tok_hash)(indata) 81 | tok_features = Lambda(create_tok_features)(indata) 82 | embed = Embedding(config.vocab_size, config.vocab_embed_size)(tok_hash) 83 | merged = concatenate([embed, tok_features], axis=2) 84 | 85 | f = Flatten()(merged) 86 | d1 = Dense( 87 | int(config.window_len * NUM_FEATURES * config.layer_1_size_factor), 88 | activation="sigmoid", 89 | )(f) 90 | d2 = Dropout(config.dropout)(d1) 91 | d3 = Dense( 92 | int(config.window_len * NUM_FEATURES * config.layer_2_size_factor), 93 | activation="sigmoid", 94 | )(d2) 95 | d4 = Dropout(config.dropout)(d3) 96 | 97 | if config.num_layers == 3: 98 | d5 = Dense( 99 | int(config.window_len * NUM_FEATURES * config.layer_3_size_factor), 100 | activation="sigmoid", 101 | )(d4) 102 | last_layer = Dropout(config.dropout)(d5) 103 | else: 104 | last_layer = d4 105 | 106 | preout = Dense(config.window_len * len(TokenType), activation="linear")(last_layer) 107 | shaped = Reshape((config.window_len, len(TokenType)))(preout) 108 | outdata = Softmax(axis=-1)(shaped) 109 | model = Model(inputs=[indata], outputs=[outdata]) 110 | 111 | # _missed_token_loss = missed_token_loss(config.penalize_missed) 112 | 113 | model.compile( 114 | optimizer=tf.keras.optimizers.Adam(learning_rate=config.learning_rate), 115 | loss=tf.keras.losses.SparseCategoricalCrossentropy(), 116 | metrics=["acc"], 117 | ) 118 | 119 | return model 120 | 121 | 122 | def default_model_name(window_len): 123 | timestamp = 
datetime.now().strftime("%Y%m%d-%H%M%S") 124 | return MODEL_DIR / f"{timestamp}_{git_short_hash()}_{window_len}.model" 125 | 126 | 127 | def latest_model(): 128 | models = MODEL_DIR.glob("*.model") 129 | return max(models, key=lambda p: p.stat().st_ctime) 130 | 131 | 132 | def load_model(model_file=None): 133 | filepath = Path(model_file) if model_file else latest_model() 134 | window_len = int(filepath.stem.split("_")[-1]) 135 | model = keras.models.load_model( 136 | filepath, custom_objects={"_missed_token_loss": missed_token_loss(5)} 137 | ) 138 | return model, window_len 139 | 140 | 141 | def save_model(model, config): 142 | basename = ( 143 | Path(config.model_path) 144 | if config.model_path 145 | else default_model_name(config.window_len) 146 | ) 147 | basename.parent.mkdir(parents=True, exist_ok=True) 148 | model.save(basename) 149 | return basename 150 | -------------------------------------------------------------------------------- /deepform/pdfs.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import ThreadPoolExecutor 2 | from decimal import Decimal 3 | 4 | import boto3 5 | import numpy as np 6 | import pdfplumber 7 | import wandb 8 | from botocore import UNSIGNED 9 | from botocore.config import Config 10 | from botocore.exceptions import ClientError 11 | from tqdm import tqdm 12 | 13 | from deepform.common import PDF_DIR, S3_BUCKET 14 | from deepform.document import SINGLE_CLASS_PREDICTION 15 | from deepform.logger import logger 16 | from deepform.util import docrow_to_bbox, dollar_match, wandb_bbox 17 | 18 | 19 | def get_pdf_path(slug): 20 | """Return a path to the pdf with the given slug, downloading the file if necessary. 21 | 22 | If the pdf isn't in the local file system, download it from an external repository. 
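    Files are fetched anonymously from s3://project-deepform/pdfs/<slug>.pdf and cached under data/pdfs/ (PDF_DIR).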
23 | """ 24 | filename = slug + ("" if slug.endswith(".pdf") else ".pdf") 25 | location = PDF_DIR / filename 26 | if not location.is_file(): 27 | PDF_DIR.mkdir(parents=True, exist_ok=True) 28 | download_from_remote(location) 29 | return location 30 | 31 | 32 | def get_pdf_paths(slugs): 33 | with ThreadPoolExecutor() as executor: 34 | print(f"Getting {len(slugs):,} pdfs...") 35 | for path in tqdm(executor.map(get_pdf_path, slugs), total=len(slugs)): 36 | yield path 37 | 38 | 39 | def download_from_remote(local_path): 40 | """Copy a pdf from S3 into the local filesystem.""" 41 | filename = local_path.name 42 | s3_key = "pdfs/" + filename 43 | s3 = boto3.resource("s3", config=Config(signature_version=UNSIGNED)) 44 | try: 45 | s3.Bucket(S3_BUCKET).download_file(s3_key, str(local_path)) 46 | except ClientError: 47 | logger.error(f"Unable to retrieve {s3_key} from s3://{S3_BUCKET}") 48 | raise 49 | 50 | 51 | def log_wandb_pdfs(doc, doc_log, all_scores): 52 | fname = get_pdf_path(doc.slug) 53 | try: 54 | pdf = pdfplumber.open(fname) 55 | except Exception: 56 | # If the file's not there, that's fine -- we use available PDFs to 57 | # define what to see 58 | logger.warn(f"Cannot open pdf {fname}") 59 | return 60 | 61 | logger.info(f"Rendering output for {fname}") 62 | 63 | # map class labels for visualizing W&B bounding boxes 64 | # TODO: use a type and separate out ground truth 65 | class_ids_by_field = { 66 | "gross_amount": 0, 67 | "flight_to": 1, 68 | "flight_from": 2, 69 | "contract_num": 3, 70 | "advertiser": 4, 71 | "ground_truth": 5, 72 | } 73 | class_id_to_label = {int(v): k for k, v in class_ids_by_field.items()} 74 | 75 | # visualize the first page of the document for which we have ground truth labels 76 | pagenum = int(doc.tokens[doc.labels > 0].page.min()) 77 | page = pdf.pages[pagenum] 78 | im = page.to_image(resolution=300) 79 | 80 | # loop over all predictions 81 | pred_bboxes = [] 82 | for i, score in enumerate(doc_log["score"]): 83 | rel_score = all_scores[:, i] / score 84 | page_match = doc.tokens.page == pagenum 85 | curr_field = doc_log["field"][i] 86 | 87 | # we could remove this threshold and rely entirely 88 | # on the wandb bbox dynamic threshold 89 | for token in doc.tokens[page_match & (rel_score > 0.5)].itertuples(): 90 | pred_bboxes.append( 91 | wandb_bbox( 92 | token, 93 | score, 94 | class_ids_by_field[curr_field], 95 | im, 96 | ) 97 | ) 98 | # draw target tokens 99 | target_toks = doc.tokens[(doc.labels > 0) & (doc.tokens.page == 0)] 100 | true_bboxes = [wandb_bbox(t, 1, 5, im) for t in target_toks.itertuples()] 101 | 102 | boxes = { 103 | "predictions": { 104 | "box_data": pred_bboxes, 105 | "class_labels": class_id_to_label, 106 | }, 107 | "ground_truth": { 108 | "box_data": true_bboxes, 109 | "class_labels": class_id_to_label, 110 | }, 111 | } 112 | wandb.log({f"pdf/{fname.name}:{pagenum}": wandb.Image(im.annotated, boxes=boxes)}) 113 | 114 | 115 | def render_tokenized_pdf(doc): 116 | 117 | fname = get_pdf_path(doc.slug) 118 | try: 119 | pdf = pdfplumber.open(fname) 120 | except Exception: 121 | # If the file's not there, that's fine -- we use available PDFs to 122 | # define what to see 123 | print(f"Cannot open pdf {fname}") 124 | return 125 | 126 | page_images = [ 127 | {"image": page.to_image(resolution=300), "rects": [], "lines": []} 128 | for page in pdf.pages 129 | ] 130 | 131 | for token in doc.tokens.itertuples(): 132 | page_num = int(token.page) 133 | if page_num < len(page_images): 134 | page_images[page_num]["rects"].append(docrow_to_bbox(token)) 135 | 
136 | for indices in np.argwhere(doc.adjacency_matrix): 137 | first_index, second_index = indices 138 | if first_index != second_index: 139 | first_token = doc.tokens.iloc[first_index] 140 | second_token = doc.tokens.iloc[second_index] 141 | page = int(first_token.page) 142 | line = ( 143 | (Decimal(float(first_token.x0)), Decimal(float(first_token.y1))), 144 | (Decimal(float(second_token.x0)), Decimal(float(second_token.y1))), 145 | ) 146 | page_images[page_num]["lines"].append(line) 147 | 148 | for page in page_images: 149 | image, rects, lines = page["image"], page["rects"], page["lines"] 150 | image.draw_rects(rects, stroke="blue", stroke_width=3, fill=None) 151 | print(f"first lines = {lines[:5]}") 152 | image.draw_lines(lines, stroke="green", stroke_width=3) 153 | 154 | return [page["image"] for page in page_images] 155 | 156 | 157 | def render_annotated_pdf(doc, score, scores, predict_text, answer_text): 158 | 159 | fname = get_pdf_path(doc.slug) 160 | try: 161 | pdf = pdfplumber.open(fname) 162 | except Exception: 163 | # If the file's not there, that's fine -- we use available PDFs to 164 | # define what to see 165 | print(f"Cannot open pdf {fname}") 166 | return 167 | 168 | print(f"Rendering output for {fname}") 169 | 170 | # Get the correct answers: find the indices of the token(s) labelled 1 171 | target_idx = [idx for (idx, val) in enumerate(doc.labels) if val == 1] 172 | 173 | # Draw the machine output: get a score for each token 174 | page_images = [] 175 | for pagenum, page in enumerate(pdf.pages): 176 | im = page.to_image(resolution=300) 177 | 178 | # training data has 0..1 for page range (see create-training-data.py) 179 | num_pages = len(pdf.pages) 180 | if num_pages > 1: 181 | current_page = pagenum / float(num_pages - 1) 182 | else: 183 | current_page = 0.0 184 | 185 | # Draw guesses 186 | rel_score = scores / score 187 | page_match = np.isclose(doc.tokens["page"], current_page) 188 | for token in doc.tokens[page_match & (rel_score > 0.5)].itertuples(): 189 | if rel_score[token.Index] == 1: 190 | w = 5 191 | s = "magenta" 192 | elif rel_score[token.Index] >= 0.75: 193 | w = 3 194 | s = "red" 195 | else: 196 | w = 1 197 | s = "red" 198 | im.draw_rect(docrow_to_bbox(token), stroke=s, stroke_width=w, fill=None) 199 | 200 | # Draw target tokens 201 | target_toks = [ 202 | doc.tokens.iloc[i] 203 | for i in target_idx 204 | if np.isclose(doc.tokens.iloc[i]["page"], current_page) 205 | ] 206 | rects = [docrow_to_bbox(t) for t in target_toks] 207 | im.draw_rects(rects, stroke="blue", stroke_width=3, fill=None) 208 | page_images.append({"caption": f"page {pagenum}", "image": im.annotated}) 209 | 210 | # get best matching score of any token in the training data 211 | match = doc.tokens[SINGLE_CLASS_PREDICTION].max() 212 | caption = ( 213 | f"{doc.slug} guessed:{predict_text} answer:{answer_text} match:{match:.2f}" 214 | ) 215 | verdict = dollar_match(predict_text, answer_text) 216 | 217 | if dollar_match(predict_text, answer_text): 218 | caption = "CORRECT " + caption 219 | else: 220 | caption = "INCORRECT " + caption 221 | return verdict, caption, page_images 222 | 223 | 224 | def log_pdf(doc, score, scores, predict_text, answer_text): 225 | caption, page_images = render_annotated_pdf(doc, score, predict_text, answer_text) 226 | page_images = [ 227 | wandb.Image(page_image["image"], page_image["caption"]) 228 | for page_image in page_images 229 | ] 230 | wandb.log({caption: page_images}) 231 | -------------------------------------------------------------------------------- 
/deepform/train.py: -------------------------------------------------------------------------------- 1 | # Data extraction by deep learning, using a fully connected architecture over 2 | # token windows. Engineered to extract total amounts, using a few custom 3 | # features. 4 | # Achieves up to 90% accuracy. 5 | # 6 | # jstray 2019-6-12 7 | 8 | import argparse 9 | import os 10 | import re 11 | from collections import defaultdict 12 | from datetime import datetime 13 | 14 | import numpy as np 15 | import pandas as pd 16 | import tensorflow as tf 17 | import wandb 18 | from tensorflow import keras as K 19 | from wandb.keras import WandbCallback 20 | 21 | from deepform.common import LOG_DIR, TRAINING_INDEX, WANDB_PROJECT 22 | from deepform.data.add_features import LABEL_COLS 23 | from deepform.document_store import DocumentStore 24 | from deepform.logger import logger 25 | from deepform.model import create_model, save_model, windowed_generator 26 | from deepform.pdfs import log_wandb_pdfs 27 | from deepform.util import config_desc, date_match, dollar_match, loose_match 28 | 29 | 30 | # Calculate accuracy of answer extraction over num_to_test docs, print 31 | # diagnostics while we do so 32 | def compute_accuracy(model, config, dataset, num_to_test, print_results, log_path): 33 | n_print = config.render_results_size 34 | 35 | n_docs = min(num_to_test, len(dataset)) 36 | 37 | accuracies = defaultdict(int) 38 | 39 | for doc in sorted(dataset.sample(n_docs), key=lambda d: d.slug): 40 | slug = doc.slug 41 | answer_texts = doc.label_values 42 | 43 | predict_texts, predict_scores, all_scores = doc.predict_answer( 44 | model, config.predict_thresh 45 | ) 46 | answer_texts = [answer_texts[c] for c in LABEL_COLS.keys()] 47 | 48 | doc_output = doc.show_predictions(predict_texts, predict_scores, all_scores) 49 | # path = log_path / ("right" if match else "wrong") 50 | log_path.mkdir(parents=True, exist_ok=True) 51 | with open(log_path / f"{slug}.txt", "w") as predict_file: 52 | predict_file.write(doc_output) 53 | 54 | if print_results: 55 | print(f"file_id:{slug}") 56 | 57 | # track all logging information for this document 58 | doc_log = defaultdict(list) 59 | for i, (field, answer_text) in enumerate(doc.label_values.items()): 60 | predict_text = predict_texts[i] 61 | predict_score = predict_scores[i] 62 | doc_log["true_text"].append(answer_text) 63 | doc_log["pred_text"].append(predict_text) 64 | doc_log["score"].append(predict_score) 65 | doc_log["field"].append(field) 66 | 67 | match = ( 68 | (predict_score < config.predict_thresh and not answer_text) 69 | or loose_match(predict_text, answer_text) 70 | or (field == "gross_amount" and dollar_match(predict_text, answer_text)) 71 | or ( 72 | field in ("flight_from", "flight_to") 73 | and date_match(predict_text, answer_text) 74 | ) 75 | ) 76 | 77 | accuracies[field] += match 78 | 79 | prefix = "✔️" if match else "❌" 80 | guessed = f'guessed "{predict_text}" with score {predict_score:.3f}' 81 | correction = "" if match else f', was actually "{answer_text}"' 82 | doc_log["match"].append(match) 83 | if print_results: 84 | print(f"\t{prefix} {field}: {guessed}{correction}") 85 | if print_results and n_print > 0: 86 | log_wandb_pdfs( 87 | doc, doc_log, all_scores 88 | ) # TODO: get fields here more explicitly? 
89 | n_print -= 1 90 | return pd.Series(accuracies) / n_docs 91 | 92 | 93 | # ---- Custom callback to log document-level accuracy ---- 94 | class DocAccCallback(K.callbacks.Callback): 95 | def __init__(self, config, run_timestamp, dataset, logname): 96 | self.config = config 97 | self.dataset = dataset 98 | self.logname = logname 99 | self.log_path = LOG_DIR / "predictions" / run_timestamp 100 | 101 | def on_epoch_end(self, epoch, logs): 102 | if epoch >= self.config.epochs - 1: 103 | # last epoch, sample from all docs and print inference results 104 | print_results = self.logname == "doc_val_acc" 105 | test_size = len(self.dataset) 106 | else: 107 | # intermediate epoch, small sample and no logger 108 | print_results = False 109 | test_size = self.config.doc_acc_sample_size + epoch 110 | 111 | # Avoid sampling tens of thousands of documents on large training sets. 112 | test_size = min(test_size, self.config.doc_acc_max_sample_size) 113 | 114 | kind = "test" if self.logname == "doc_val_acc" else "train" 115 | 116 | acc = compute_accuracy( 117 | self.model, 118 | self.config, 119 | self.dataset, 120 | test_size, 121 | print_results, 122 | self.log_path / kind / f"{epoch:02d}", 123 | ) 124 | acc_str = re.sub(r"\s+", " ", acc.to_string()) 125 | print(f"This epoch {self.logname}: {acc_str}") 126 | 127 | # convert field names for benchmark logging 128 | wandb.log( 129 | acc.rename( 130 | {"gross_amount": "amount", "contract_num": "contractid"} 131 | ).to_dict() 132 | ) 133 | 134 | # compute average accuracy 135 | wandb.log({"avg_acc": acc.mean(), "epoch": epoch}) 136 | 137 | 138 | def main(config): 139 | config.name = config_desc(config) 140 | if config.use_wandb: 141 | run.save() 142 | 143 | # set random seed 144 | tf.random.set_seed(config.random_seed) 145 | # also set numpy seed to control train/val dataset split 146 | np.random.seed(config.random_seed) 147 | 148 | print("Configuration:") 149 | print("{\n\t" + ",\n\t".join(f"'{k}': {v}" for k, v in config.items()) + "\n}") 150 | 151 | run_ts = datetime.now().isoformat(timespec="seconds").replace(":", "") 152 | 153 | # all_data = load_training_data(config) 154 | all_documents = DocumentStore.open(index_file=TRAINING_INDEX, config=config) 155 | 156 | # split into validation and training sets 157 | validation_set, training_set = all_documents.split(val_percent=config.val_split) 158 | print(f"Training on {len(training_set)}, validating on {len(validation_set)}") 159 | 160 | model = create_model(config) 161 | print(model.summary()) 162 | 163 | callbacks = [WandbCallback()] if config.use_wandb else [] 164 | callbacks.append(K.callbacks.LambdaCallback(on_epoch_end=lambda *args: print())) 165 | callbacks.append(DocAccCallback(config, run_ts, training_set, "doc_train_acc")) 166 | callbacks.append(DocAccCallback(config, run_ts, validation_set, "doc_val_acc")) 167 | 168 | model.fit( 169 | windowed_generator(training_set, config), 170 | steps_per_epoch=config.steps_per_epoch, 171 | epochs=config.epochs, 172 | callbacks=callbacks, 173 | ) 174 | 175 | if config.save_model: 176 | model_filepath = save_model(model, config) 177 | alias = model_filepath.name 178 | artifact = wandb.Artifact( 179 | "deepform-model", type="model", metadata={"name": alias} 180 | ) 181 | artifact.add_dir( 182 | str(model_filepath) 183 | ) # TODO: check that this is necessary? What does wandb api expect here? 184 | run.log_artifact(artifact, aliases=["latest", alias]) 185 | 186 | 187 | if __name__ == "__main__": 188 | # First read in the initial configuration. 
189 | os.environ["WANDB_CONFIG_PATHS"] = "config-defaults.yaml" 190 | run = wandb.init( 191 | project=WANDB_PROJECT, 192 | job_type="train", 193 | allow_val_change=True, 194 | ) 195 | config = run.config 196 | # Then override it with any parameters passed along the command line. 197 | parser = argparse.ArgumentParser() 198 | 199 | # Anything in the config is fair game to be overridden by a command line flag. 200 | for key, value in config.items(): 201 | cli_flag = f"--{key}".replace("_", "-") 202 | parser.add_argument(cli_flag, dest=key, type=type(value), default=value) 203 | 204 | args = parser.parse_args() 205 | config.update(args, allow_val_change=True) 206 | 207 | if not config.use_wandb: 208 | os.environ["WANDB_SILENT"] = "true" 209 | os.environ["WANDB_MODE"] = "dryrun" 210 | wandb.log = lambda *args, **kwargs: None 211 | 212 | logger.setLevel(config.log_level) 213 | 214 | main(config) 215 | -------------------------------------------------------------------------------- /deepform/util.py: -------------------------------------------------------------------------------- 1 | import math 2 | import random 3 | import re 4 | import subprocess 5 | from collections import namedtuple 6 | from datetime import datetime 7 | from decimal import Decimal, InvalidOperation 8 | 9 | import scipy.sparse as sparse 10 | from fuzzywuzzy import fuzz 11 | 12 | from deepform.logger import logger 13 | 14 | BoundingBox = namedtuple("BoundingBox", ["x0", "y0", "x1", "y1"]) 15 | 16 | _whitespace = re.compile(r"\s") 17 | 18 | 19 | def simple_string(s): 20 | """Lowercase and remove whitespace from a string.""" 21 | return _whitespace.sub("", s.casefold()) if isinstance(s, str) else "" 22 | 23 | 24 | def num_digits(s): 25 | return sum(c.isdigit() for c in s) 26 | 27 | 28 | def loose_match(s1, s2): 29 | """Match two strings irrespective of case and whitespace.""" 30 | return simple_string(s1) == simple_string(s2) 31 | 32 | 33 | def default_similarity(lhs, rhs): 34 | return fuzz.ratio(simple_string(lhs), simple_string(rhs)) / 100 35 | 36 | 37 | def is_dollar_amount(s): 38 | try: 39 | return num_digits(s) > 0 and bool(re.match(r"^\$?\d*(,\d\d\d)*(\.\d\d)?$", s)) 40 | except TypeError: 41 | return False 42 | 43 | 44 | def dollar_amount(s): 45 | if is_dollar_amount(s): 46 | try: 47 | return float(s.replace("$", "").replace(",", "")) 48 | except ValueError: 49 | logger.error(f"'{s}' could not be converted to a dollar amount.") 50 | return None 51 | 52 | 53 | def dollar_similarity(lhs, rhs): 54 | lh_dollar, rh_dollar = normalize_dollars(lhs), normalize_dollars(rhs) 55 | if lh_dollar and rh_dollar: 56 | return fuzz.ratio(lh_dollar, rh_dollar) / 100 57 | return default_similarity(lhs, rhs) 58 | 59 | 60 | def log_dollar_amount(s): 61 | """Return the logarithm of 1 + a non-negative dollar amount.""" 62 | d = dollar_amount(s) 63 | return math.log(d + 1) if d and d > 0 else None 64 | 65 | 66 | def normalize_dollars(s) -> str: 67 | """Return a string of a number rounded to two digits (or None if not possible). 68 | 69 | Given a string like '$56,333.1' return the string '56333.10'. 70 | """ 71 | try: 72 | return str(round(Decimal(str(s).replace("$", "").replace(",", "")), 2)) 73 | except InvalidOperation: 74 | return None 75 | 76 | 77 | def dollar_match(predicted, actual): 78 | """Best-effort matching of dollar amounts, e.g. 
'$14,123.02' to '14123.02'.""" 79 | return ( 80 | is_dollar_amount(predicted) 81 | and is_dollar_amount(actual) 82 | and (normalize_dollars(predicted) == normalize_dollars(actual)) 83 | ) 84 | 85 | 86 | date_formats = { 87 | # If a string matches the regex key, it can be passed to strptime() 88 | # with the respective format string. Ordered from most to least common. 89 | re.compile(r"^[01]?\d/[0123]?\d/\d\d$"): "%m/%d/%y", 90 | re.compile(r"^[01]?\d/[0123]?\d/20\d\d$"): "%m/%d/%Y", 91 | re.compile(r"^[a-z]{3}\d?\d/\d\d$"): "%b%d/%y", 92 | re.compile(r"^[a-z]{3}\d?\d/20\d\d$"): "%b%d/%Y", 93 | re.compile(r"^[a-z]{4,9}\d?\d/\d\d$"): "%B%d/%y", 94 | re.compile(r"^[a-z]{4,9}\d?\d/20\d\d$"): "%B%d/%Y", 95 | } 96 | _time_punc = re.compile(r"[-,\\]") 97 | _no_year = re.compile(r"^[01]?\d/[0123]?\d$") 98 | 99 | 100 | def normalize_date(s): 101 | """Turn a string in a common date format into a date.""" 102 | try: 103 | if num_digits(s) == 0: 104 | return None 105 | # Turn dashes, commas and back slashes into forward slashes. 106 | s = _time_punc.sub("/", simple_string(s)) 107 | # Check the string against each possible date format. 108 | for date_regex, strp_format in date_formats.items(): 109 | if date_regex.match(s): 110 | return datetime.strptime(s, strp_format).date() 111 | if _no_year.match(s): 112 | # If no year is present, assume 2020. 113 | return datetime.strptime(s + "/20", "%m/%d/%y").date() 114 | except (TypeError, ValueError): 115 | return None 116 | 117 | 118 | def date_similarity(lhs, rhs): 119 | lh_date, rh_date = normalize_date(lhs), normalize_date(rhs) 120 | if lh_date and rh_date and lh_date == rh_date: 121 | return 1 122 | return default_similarity(lhs, rhs) 123 | 124 | 125 | def date_match(predicted, actual): 126 | """Best-effort matching of dates, e.g. '02-03-2020' to '2/3/20'.""" 127 | lhs, rhs = normalize_date(predicted), normalize_date(actual) 128 | return bool(lhs and rhs and lhs == rhs) 129 | 130 | 131 | def any_similarity(lhs, rhs): 132 | return max(dollar_similarity(lhs, rhs), date_similarity(lhs, rhs)) 133 | 134 | 135 | def any_match(lhs, rhs): 136 | return loose_match(lhs, rhs) or dollar_match(lhs, rhs) or date_match(lhs, rhs) 137 | 138 | 139 | def docrow_to_bbox(t, min_height=10): 140 | """Create the array pdfplumber expects for bounding boxes from an input namedtuple. 141 | 142 | If `min_height` is set, adjust the minimum size of the bounding boxes to fix the 143 | cases where pdfplumber has incorrectly underlined rather than boxed in the 144 | recognized text. 145 | """ 146 | dims = {k: Decimal(float(getattr(t, k))) for k in ["x0", "y0", "x1", "y1"]} 147 | if min_height: 148 | dims["y0"] = min(dims["y1"] - Decimal(min_height), dims["y0"]) 149 | return BoundingBox(**dims) 150 | 151 | 152 | def wandb_bbox(t, score, class_id, pdf_page, min_height=10): 153 | """Prototype logging bounding boxes to W&B. 
Currently W&B assumes a fixed 154 | single size for each image logged, so this requires resizing all logged documents 155 | to see correct bounding boxes""" 156 | dims = docrow_to_bbox(t, min_height) 157 | 158 | # reproject bounding box into pdf image 159 | x0, y0 = pdf_page._reproject((dims.x0, dims.y0)) 160 | x1, y1 = pdf_page._reproject((dims.x1, dims.y1)) 161 | 162 | box_data = { 163 | "position": { 164 | "minX": float(x0), 165 | "minY": float(y0), 166 | "maxX": float(x1), 167 | "maxY": float(y1), 168 | }, 169 | "class_id": class_id, 170 | "domain": "pixel", 171 | "scores": {"score": score}, 172 | "box_caption": "%.3f" % score, 173 | } 174 | return box_data 175 | 176 | 177 | def config_desc(config): 178 | """A one-line text string describing the configuration of a run.""" 179 | return ( 180 | "len:{len_train} " 181 | "win:{window_len} " 182 | "str:{use_string} " 183 | "page:{use_page} " 184 | "geom:{use_geom} " 185 | "amt:{use_amount} " 186 | "voc:{vocab_size} " 187 | "emb:{vocab_embed_size} " 188 | "steps:{steps_per_epoch}" 189 | ).format(**config) 190 | 191 | 192 | def sample(items, n=None, seed=None): 193 | """Get a sample of `n` items without replacement. 194 | 195 | If n is None, return the input after shuffling it. 196 | """ 197 | if seed: 198 | random.seed(seed) 199 | if n is None: 200 | n = len(items) 201 | return random.sample(items, k=n) 202 | 203 | 204 | def git_short_hash(): 205 | try: 206 | out = subprocess.check_output(["git", "rev-parse", "--short", "HEAD"]) 207 | return out.strip().decode("ascii") 208 | except (OSError, subprocess.CalledProcessError): 209 | return "UnknownGitRevision" 210 | 211 | 212 | def pad_sparse_matrix(m, pad_rows=0, pad_columns=0): 213 | (rows, _) = m.get_shape() 214 | column_padding = sparse.coo_matrix((rows, pad_columns)) 215 | padded_columns = sparse.hstack([column_padding, m, column_padding]) 216 | (_, columns) = padded_columns.get_shape() 217 | row_padding = sparse.coo_matrix((pad_rows, columns)) 218 | padded_rows = sparse.vstack([row_padding, padded_columns, row_padding]) 219 | return padded_rows.tocoo() 220 | -------------------------------------------------------------------------------- /init_sweep.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Source this script in order to invoke wandb sweep sweep.yaml and set the var WANDB_SWEEP_ID 3 | 4 | export SED_REGEX_EXTRACT='s/^.*Created sweep with ID: \([[:alnum:]]*\).*$/\1/p' 5 | init=$(wandb sweep sweep.yaml 2>&1 | sed -n "$SED_REGEX_EXTRACT") 6 | 7 | if [ -z "$init" ] 8 | then 9 | exit 1 10 | else 11 | echo $init 12 | export WANDB_SWEEP_ID="$init" 13 | wandb agent deepform/deepform/$WANDB_SWEEP_ID 14 | fi 15 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "deepform" 3 | version = "0.1.1" 4 | description = "extract information from TV station political advertising disclosure forms" 5 | authors = ["Jonathan Stray "] 6 | 7 | [tool.poetry.dependencies] 8 | boto3 = "^1.14.39" 9 | fuzzywuzzy = {extras = ["speedup"], version = "^0.18.0"} 10 | humanize = "^3.0.0" 11 | joblib = "^0.16.0" 12 | keras = "^2.4.3" 13 | numpy = "^1.18.5" 14 | pandas = "^1.1.2" 15 | pdfplumber = "^0.5.23" 16 | pyarrow = "^1.0.1" 17 | python = "^3.8.1" 18 | sqlalchemy = "^1.3.18" 19 | tensorflow = "^2.3.1" 20 | tqdm = "^4.48.2" 21 | wandb = "0.10.4" 22 | spektral = "^0.6.2" 23 | 24 | 
[tool.poetry.dev-dependencies] 25 | autoflake = "^1.3.1" 26 | babel = "^2.8.0" 27 | black = "^20.8b1" 28 | faker = "^4.1.1" 29 | flake8 = "^3.8.3" 30 | hypothesis = "^5.24.0" 31 | isort = "^5.5.4" 32 | matplotlib = "^3.3.0" 33 | pre-commit = "^2.6.0" 34 | pylint = "^2.5.3" 35 | pytest = "^6.1.0" 36 | jupyterlab = "^2.2.8" 37 | 38 | [tool.isort] 39 | profile = "black" 40 | src_paths = ["deepform", "test"] 41 | known_third_party = ["boto3", "botocore", "fuzzywuzzy", "joblib", "keras", "nltk", "numpy", "pandas", "pdfplumber", "pyarrow", "sqlalchemy", "tensorflow", "tqdm", "wandb"] 42 | 43 | [tool.black] 44 | line-length = 88 45 | target-version = ['py38'] 46 | 47 | [build-system] 48 | requires = ["poetry-core>=1.0.0"] 49 | build-backend = "poetry.core.masonry.api" 50 | -------------------------------------------------------------------------------- /source/README.md: -------------------------------------------------------------------------------- 1 | ftf-all-filings.tsv is the crowdsourced data entered by volunteers in 2012. 2 | The dc-slug can be used to get a URL for the original PDF; see download-pdfs.py 3 | 4 | Originally from https://www.propublica.org/datastore/dataset/free-the-files-filing-data 5 | -------------------------------------------------------------------------------- /sweep.yaml: -------------------------------------------------------------------------------- 1 | # sweep.yaml 2 | program: deepform/train.py 3 | project: deepform 4 | method: bayes 5 | metric: 6 | name: doc_val_acc #acc #val_acc 7 | goal: maximize 8 | parameters: 9 | steps_per_epoch: 10 | values: [10, 25, 50] 11 | #max: 50 12 | use_string: 13 | values: [0, 1] 14 | use_page: 15 | values: [0, 1] 16 | use_geom: 17 | values: [0, 1] 18 | use_amount: 19 | values: [0, 1] 20 | window_len: 21 | values: [10, 25, 50] 22 | epochs: 23 | values: [50, 100] 24 | len_train: 25 | values: [100, 200] 26 | vocab_size: 27 | values: [100, 500, 2000] 28 | vocab_embed_size: 29 | values: [16, 32, 64] 30 | early_terminate: 31 | #min_iter #specify the iteration for the first bracket 32 | type: hyperband 33 | s: 2 #specify the total number of brackets (required for max_iter) 34 | eta: 3 #specify the bracket multiplier schedule (default: 3) 35 | max_iter: 27 #specify the maximum number of iterations for the program 36 | -------------------------------------------------------------------------------- /tests/test_add_features.py: -------------------------------------------------------------------------------- 1 | from datetime import date, timedelta 2 | 3 | import pandas as pd 4 | from babel.numbers import format_currency 5 | 6 | from deepform.data.add_features import ( 7 | extend_and_write_docs, 8 | fraction_digits, 9 | pq_index_and_dir, 10 | ) 11 | from deepform.data.create_vocabulary import get_token_id 12 | from deepform.util import is_dollar_amount, log_dollar_amount 13 | 14 | COL_TYPES = { 15 | "page": "f4", # 32-bit float. 16 | "x0": "f4", 17 | "y0": "f4", 18 | "x1": "f4", 19 | "y1": "f4", 20 | "token": "string", # Pandas 1.x string type. 
21 | } 22 | 23 | 24 | def random_dollar_amount(faker): 25 | amount = round(faker.pyfloat(min_value=0, max_value=100000), 2) 26 | return format_currency(amount, "USD", locale="en_US") 27 | 28 | 29 | def random_date(faker, start_date=date(2020, 1, 1), end_date=date(2020, 12, 31)): 30 | days = (end_date - start_date).days 31 | day = faker.pyint(min_value=0, max_value=days) 32 | return start_date + timedelta(days=day) 33 | 34 | 35 | def random_training_data_row(faker): 36 | x0 = faker.pyfloat(min_value=-1, max_value=600) 37 | y0 = faker.pyfloat(min_value=-1, max_value=750) 38 | return { 39 | "page": faker.pyfloat(min_value=0, max_value=1), 40 | "x0": x0, 41 | "y0": y0, 42 | "x1": x0 + faker.pyfloat(min_value=-1, max_value=20), 43 | "y1": y0 + faker.pyfloat(min_value=-1, max_value=50), 44 | "token": faker.pystr(min_chars=1, max_chars=50), 45 | } 46 | 47 | 48 | def random_doc_data(faker): 49 | num_tokens = faker.pyint(min_value=1, max_value=500) 50 | df = pd.DataFrame([random_training_data_row(faker) for _ in range(num_tokens)]) 51 | return df.astype(COL_TYPES) 52 | 53 | 54 | def create_tokens_and_manifest(faker, src_path, num_docs=5): 55 | src_path.mkdir(parents=True, exist_ok=True) 56 | 57 | docs = {faker.slug(): random_doc_data(faker) for _ in range(num_docs)} 58 | manifest = [] 59 | 60 | for slug, doc in docs.items(): 61 | doc.to_parquet(src_path / f"{slug}.parquet", index=False) 62 | manifest.append( 63 | { 64 | "file_id": slug, 65 | "contract_num": faker.isbn10(), 66 | "advertiser": faker.company(), 67 | "flight_from": random_date(faker), 68 | "flight_to": random_date(faker), 69 | "gross_amount": random_dollar_amount(faker), 70 | } 71 | ) 72 | 73 | return pd.DataFrame(manifest) 74 | 75 | 76 | def test_add_features_to_labeled_parquet(faker, tmp_path): 77 | num_docs = 5 78 | src_path = tmp_path / "tokenized" 79 | manifest = create_tokens_and_manifest(faker, src_path, num_docs) 80 | 81 | idx_path = tmp_path / "doc_index.parquet" 82 | idx_path, pq_path = pq_index_and_dir(idx_path) 83 | 84 | # Run the conversion code. 85 | extend_and_write_docs(src_path, manifest, idx_path, pq_path, 1) 86 | 87 | # Check out the index. 88 | index = pd.read_parquet(idx_path) 89 | 90 | assert len(index) == num_docs 91 | assert set(manifest.file_id) == set(index.index) 92 | 93 | # Check out each individual document that was produced. 
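# As the assertions below imply, extend_and_write_docs is expected to drop very short tokens and to
# add the engineered per-token columns (tok_id, length, digitness, is_dollar, log_amount) along with
# a fuzzy match score per label field, while the index row keeps each document's best score per
# field, e.g. best_match_gross_amount.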
94 | for row in index.itertuples(): 95 | doc = pd.read_parquet(pq_path / f"{row.Index}.parquet") 96 | # Doc features 97 | assert doc.token.str.len().min() >= 3 98 | assert row.length == len(doc) 99 | assert row.best_match_gross_amount == doc.gross_amount.max() 100 | assert row.best_match_contract_num == doc.contract_num.max() 101 | 102 | # Row features 103 | assert (doc.tok_id == doc.token.apply(get_token_id)).all() 104 | assert (doc.length == doc.token.str.len()).all() 105 | assert (doc.digitness == doc.token.apply(fraction_digits)).all() 106 | assert (doc.is_dollar == doc.token.apply(is_dollar_amount)).all() 107 | assert (doc.log_amount == doc.token.apply(log_dollar_amount).fillna(0)).all() 108 | -------------------------------------------------------------------------------- /tests/test_graph_geometry.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | from deepform.data.graph_geometry import document_edges 5 | 6 | # ASCII Art of Test Example 7 | # 8 | # A --- B --- C 9 | # | W--|--X | 10 | # D -|- E -|- F 11 | # | Y -|- Z | 12 | # G --- H --- I 13 | 14 | A = {"token": "A", "x0": 1, "y1": 1, "page": 0} 15 | B = {"token": "B", "x0": 3, "y1": 1, "page": 0} 16 | C = {"token": "C", "x0": 5, "y1": 1, "page": 0} 17 | D = {"token": "D", "x0": 1, "y1": 3, "page": 0} 18 | E = {"token": "E", "x0": 3, "y1": 3, "page": 0} 19 | F = {"token": "F", "x0": 5, "y1": 3, "page": 0} 20 | G = {"token": "G", "x0": 1, "y1": 5, "page": 0} 21 | H = {"token": "H", "x0": 3, "y1": 5, "page": 0} 22 | I = {"token": "I", "x0": 5, "y1": 5, "page": 0} # noqa: E741 23 | W = {"token": "W", "x0": 2, "y1": 2, "page": 0} 24 | X = {"token": "X", "x0": 4, "y1": 2, "page": 0} 25 | Y = {"token": "Y", "x0": 2, "y1": 4, "page": 0} 26 | Z = {"token": "Z", "x0": 4, "y1": 4, "page": 0} 27 | 28 | tokens = pd.DataFrame.from_records([A, B, C, D, E, F, G, H, I, W, X, Y, Z]) 29 | 30 | # Manually construct the sparse matrix of edges for the above example. 
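# Row and column indices follow the record order used for `tokens` above: A..I occupy indices 0..8
# and W..Z occupy 9..12, so edges[0, 1] marks the A-B edge and edges[9, 10] marks the W-X edge.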
31 | edges = np.zeros((13, 13)) 32 | 33 | edges[0, 1] = True # A B 34 | edges[1, 2] = True # B C 35 | edges[3, 4] = True # D E 36 | edges[4, 5] = True # E F 37 | edges[6, 7] = True # G H 38 | edges[7, 8] = True # H I 39 | edges[0, 3] = True # A D 40 | edges[3, 6] = True # D G 41 | edges[1, 4] = True # B E 42 | edges[4, 7] = True # E H 43 | edges[2, 5] = True # C F 44 | edges[5, 8] = True # F I 45 | edges[9, 10] = True # W X 46 | edges[11, 12] = True # Y Z 47 | edges[9, 11] = True # W Y 48 | edges[10, 12] = True # X Z 49 | 50 | # Add in the symmetric relationships 51 | edges = edges + edges.T 52 | 53 | adjacency = document_edges(tokens).todense() 54 | expected = edges 55 | 56 | 57 | def test_9x9_adjacency(): 58 | adjacency9x9 = adjacency[0:9, 0:9] 59 | expected9x9 = expected[0:9, 0:9] 60 | assert (adjacency9x9 == expected9x9).all() 61 | 62 | 63 | def test_4x4_adjacency(): 64 | adjacency4x4 = adjacency[9:, 9:] 65 | expected4x4 = expected[9:, 9:] 66 | assert (adjacency4x4 == expected4x4).all() 67 | 68 | 69 | def test_disconnected(): 70 | disconnectedRight = adjacency[9:, 0:9] 71 | disconnectedBottom = adjacency[0:9, 9:] 72 | assert (disconnectedRight == 0).all() 73 | assert (disconnectedBottom == 0).all() 74 | 75 | 76 | def test_different_pages(): 77 | B_pg_2 = B.copy() 78 | B_pg_2["page"] = 1 79 | tokens_pages = pd.DataFrame.from_records([A, B_pg_2, C]) 80 | 81 | adjacency = document_edges(tokens_pages).todense() 82 | assert not adjacency[0, 1] 83 | assert adjacency[0, 2] 84 | -------------------------------------------------------------------------------- /tests/test_util.py: -------------------------------------------------------------------------------- 1 | from datetime import date 2 | from decimal import Decimal 3 | from math import isclose 4 | 5 | import hypothesis.strategies as st 6 | import scipy.sparse as sparse 7 | from hypothesis import example, given 8 | 9 | from deepform.util import ( 10 | BoundingBox, 11 | docrow_to_bbox, 12 | dollar_amount, 13 | is_dollar_amount, 14 | log_dollar_amount, 15 | normalize_date, 16 | normalize_dollars, 17 | pad_sparse_matrix, 18 | ) 19 | 20 | 21 | def test_is_dollar_amount(): 22 | assert is_dollar_amount("$10") 23 | assert is_dollar_amount("$15.00") 24 | assert is_dollar_amount("$2.03") 25 | assert is_dollar_amount("3") 26 | assert is_dollar_amount("04") 27 | assert is_dollar_amount("9,000") 28 | assert not is_dollar_amount("") 29 | assert not is_dollar_amount("$") 30 | assert not is_dollar_amount(",") 31 | assert not is_dollar_amount(".") 32 | assert not is_dollar_amount("$,") 33 | assert not is_dollar_amount("$.") 34 | assert not is_dollar_amount("C") 35 | assert not is_dollar_amount("$x") 36 | assert not is_dollar_amount("3 .17") 37 | 38 | 39 | def test_dollar_amount(): 40 | assert dollar_amount("$10") == 10 41 | assert dollar_amount("$15.00") == 15 42 | assert dollar_amount("$2.03") == 2.03 43 | assert dollar_amount("3") == 3 44 | assert dollar_amount("04") == 4 45 | assert dollar_amount("9,000") == 9000 46 | assert dollar_amount("") is None 47 | assert dollar_amount("C") is None 48 | assert dollar_amount("$x") is None 49 | assert dollar_amount("3 .17") is None 50 | 51 | 52 | @given(st.text()) 53 | @example("$.01") 54 | @example("$6.010.01") 55 | @example("$3,020,01") 56 | def test_dollar_amount_accepts_arbitrary_strings(s): 57 | if not is_dollar_amount(s): 58 | assert dollar_amount(s) is None 59 | else: 60 | assert normalize_dollars(s) is not None 61 | n = dollar_amount(s) 62 | assert normalize_dollars(str(n)) == normalize_dollars(s) 63 | 64 
| 65 | @given(st.text()) 66 | @example("0.02") 67 | @example("-1") 68 | @example("$-0.5") 69 | def test_log_dollar_amount_accepts_arbitrary_strings(s): 70 | if is_dollar_amount(s) and dollar_amount(s) > 0: 71 | assert log_dollar_amount(s) > 0 72 | else: 73 | assert log_dollar_amount(s) is None 74 | 75 | 76 | def test_normalize_dollars(): 77 | assert normalize_dollars("0") == "0.00" 78 | assert normalize_dollars("$10") == "10.00" 79 | assert normalize_dollars("$15.00") == "15.00" 80 | assert normalize_dollars("$2.03") == "2.03" 81 | assert normalize_dollars("3") == "3.00" 82 | assert normalize_dollars("04") == "4.00" 83 | assert normalize_dollars("9,000") == "9000.00" 84 | assert normalize_dollars("") is None 85 | assert normalize_dollars("C") is None 86 | assert normalize_dollars("$x") is None 87 | assert normalize_dollars("3 .17") is None 88 | 89 | 90 | def test_normalize_date(): 91 | assert normalize_date("03/12/20") == date(2020, 3, 12) 92 | assert normalize_date("3/4/19") == date(2019, 3, 4) 93 | assert normalize_date("6-1") == date(2020, 6, 1) 94 | assert normalize_date("4-28-21") == date(2021, 4, 28) 95 | assert normalize_date("Apr16/20") == date(2020, 4, 16) 96 | assert normalize_date("DEC30/19") == date(2019, 12, 30) 97 | assert normalize_date("February 12, 2020") == date(2020, 2, 12) 98 | assert normalize_date("11/20") == date(2020, 11, 20) 99 | assert normalize_date("22") is None 100 | assert normalize_date("") is None 101 | assert normalize_date(None) is None 102 | 103 | 104 | coord = st.floats(min_value=-10, max_value=800, allow_nan=False) 105 | height = st.floats(min_value=0, max_value=100) 106 | 107 | 108 | @given(x0=coord, y0=coord, x1=coord, y1=coord, mh=height) 109 | def test_docrow_to_bbox(x0, y0, x1, y1, mh): 110 | t = BoundingBox(x0=x0, x1=x1, y0=y0, y1=y1) 111 | bbox0 = docrow_to_bbox(t, min_height=None) 112 | bbox1 = docrow_to_bbox(t) 113 | bbox2 = docrow_to_bbox(t, min_height=mh) 114 | for box in (bbox0, bbox1, bbox2): 115 | assert box.x0 == Decimal(x0) 116 | assert box.x1 == Decimal(x1) 117 | assert box.y1 == Decimal(y1) 118 | assert bbox0.y0 == Decimal(y0) 119 | # Floating point arithmetic, yo. 120 | assert bbox1.y1 - bbox1.y0 >= 10 or isclose(bbox1.y1 - bbox1.y0, 10) 121 | assert bbox2.y1 - bbox2.y0 >= mh or isclose(bbox2.y1 - bbox2.y0, mh) 122 | 123 | 124 | def test_sparse_padding(): 125 | m = sparse.identity(3) 126 | padded = pad_sparse_matrix(m, 1, 1).todense() 127 | assert padded.shape == (5, 5) 128 | assert padded[0, 0] == 0 129 | assert padded[1, 1] == 1 130 | --------------------------------------------------------------------------------