├── .dockerignore ├── .editorconfig ├── .env.example ├── .gitattributes ├── .github ├── FUNDING.yml └── workflows │ ├── deploy.yml │ ├── lint.yml │ └── tests.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CHANGELOG.md ├── DEVELOPING.md ├── LICENSE ├── README.md ├── SECURITY.md ├── compose.yaml ├── docker ├── Dockerfile └── Makefile ├── doctor ├── __init__.py ├── assets │ └── producer-300x300.png ├── forms.py ├── lib │ ├── __init__.py │ ├── mojibake.py │ ├── text_extraction.py │ └── utils.py ├── settings.py ├── tasks.py ├── test_assets │ ├── 1.mp3 │ ├── 1.wma │ ├── 1_with_metadata.mp3 │ ├── ander_v._leo.mp3 │ ├── broken-mime.pdf │ ├── empty.pdf │ ├── image-pdf-2-thumbnail.png │ ├── image-pdf-2.pdf │ ├── image-pdf-thumbnail.png │ ├── image-pdf.pdf │ ├── long-image.tiff │ ├── missouri.pdf │ ├── ocr_pdf_variation.pdf │ ├── recap_documents │ │ ├── ca10_010110462922.pdf │ │ ├── ca1_00117684624.pdf │ │ ├── ca2_1-1.pdf │ │ ├── ca3_003112692106.pdf │ │ ├── ca4_17.pdf │ │ ├── ca5_00516242060.pdf │ │ ├── ca6_1-3.pdf │ │ ├── ca7_3.pdf │ │ ├── ca8_.pdf │ │ ├── ca9_19.pdf │ │ └── cafc_3.pdf │ ├── recap_extract │ │ ├── gov.uscourts.azd.1085839.3.0.pdf │ │ ├── gov.uscourts.cacd.652774.40.0.pdf │ │ └── gov.uscourts.cand.203070.27.0.pdf │ ├── vector-pdf.pdf │ ├── word-doc.doc │ ├── word-docx.docx │ ├── word-perfect.wpd │ └── x-ray │ │ ├── rectangles_no.pdf │ │ ├── rectangles_yes.pdf │ │ └── rectangles_yes_2.pdf ├── tests.py ├── urls.py ├── views.py └── wsgi.py ├── manage.py ├── pyproject.toml └── uv.lock /.dockerignore: -------------------------------------------------------------------------------- 1 | .git 2 | .venv 3 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | # http://editorconfig.org 2 | 3 | root = true 4 | 5 | [*] 6 | charset = utf-8 7 | indent_style = space 8 | indent_size = 4 9 | end_of_line = lf 10 | insert_final_newline = true 11 | 
trim_trailing_whitespace = true 12 | 13 | [*.py] 14 | max_line_length = 79 15 | 16 | [*.{js,html,json,css,yml,yaml}] 17 | indent_size = 2 18 | 19 | [*.md] 20 | trim_trailing_whitespace = false 21 | 22 | # The JSON files contain newlines inconsistently 23 | [*.json] 24 | insert_final_newline = ignore 25 | 26 | # Minified JavaScript files shouldn't be changed 27 | [**.min.js] 28 | indent_style = ignore 29 | insert_final_newline = ignore 30 | 31 | # Makefiles always use tabs for indentation 32 | [Makefile] 33 | indent_style = tab 34 | -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | DEBUG=on 2 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Prevent Windows git clients from goofing up files that must run on Linux 2 | * text eol=lf 3 | 4 | # Image files 5 | *.png binary 6 | *.jpg binary 7 | *.jpeg binary 8 | *.gif binary 9 | *.psd binary 10 | 11 | # Audio files 12 | *.wma binary 13 | *.mp3 binary 14 | 15 | # Compressed files 16 | *.jar binary 17 | *.exe binary 18 | *.bz2 binary 19 | *.gz binary 20 | *.zip binary 21 | 22 | # Fonts 23 | *.eot binary 24 | *.otf binary 25 | *.ttf binary 26 | *.woff binary 27 | *.woff2 binary 28 | 29 | # File formats 30 | *.ods binary 31 | *.pdf binary 32 | *.xls binary 33 | *.wpd binary 34 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: freelawproject 4 | custom: https://www.courtlistener.com/donate/?referrer=github-courtlistener 5 | -------------------------------------------------------------------------------- /.github/workflows/deploy.yml:
-------------------------------------------------------------------------------- 1 | name: Automate build and deploy 2 | on: 3 | pull_request: 4 | branches: [ "main" ] 5 | types: 6 | - closed 7 | 8 | env: 9 | AWS_REGION: us-west-2 10 | EKS_CLUSTER_NAME: courtlistener 11 | EKS_NAMESPACE: court-listener 12 | 13 | jobs: 14 | build: 15 | # Build only merged PRs 16 | if: (github.event_name == 'pull_request' && github.event.pull_request.merged == true) 17 | runs-on: ubuntu-latest 18 | steps: 19 | - uses: actions/checkout@v4 20 | - name: Login to Docker Hub 21 | uses: docker/login-action@v3 22 | with: 23 | username: ${{ secrets.DOCKERHUB_USERNAME }} 24 | password: ${{ secrets.DOCKERHUB_TOKEN }} 25 | - name: Build and Push 26 | run: | 27 | make push --file docker/Makefile -e VERSION=$(git rev-parse --short HEAD) 28 | 29 | deploy: 30 | needs: build 31 | runs-on: ubuntu-latest 32 | steps: 33 | - uses: actions/checkout@v4 34 | - name: Set shortcode 35 | id: vars 36 | run: echo "sha_short=$(git rev-parse --short HEAD)" >> "$GITHUB_OUTPUT" 37 | - name: Configure AWS credentials 38 | uses: aws-actions/configure-aws-credentials@v4 39 | with: 40 | aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} 41 | aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 42 | aws-region: ${{ env.AWS_REGION }} 43 | - name: Create Kubeconfig with AWS CLI 44 | run: aws eks update-kubeconfig --region ${{ env.AWS_REGION }} --name ${{ env.EKS_CLUSTER_NAME }} 45 | - name: Rollout cl-doctor 46 | run: kubectl set image -n ${{ env.EKS_NAMESPACE }} deployment/cl-doctor doctor=freelawproject/doctor:${{ steps.vars.outputs.sha_short }} 47 | - name: Watch cl-doctor rollout status 48 | run: kubectl rollout status -n ${{ env.EKS_NAMESPACE }} deployment/cl-doctor 49 | -------------------------------------------------------------------------------- /.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | name: Lint 2 | 3 | on: 4 | pull_request: 5 |
push: 6 | branches: 7 | - main 8 | 9 | jobs: 10 | pre-commit: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v4 14 | - uses: actions/setup-python@v5 15 | with: 16 | python-version: "3.10" 17 | - uses: pre-commit/action@v3.0.1 18 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | pull_request: 5 | push: 6 | branches: 7 | - main 8 | 9 | jobs: 10 | build: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v4 14 | 15 | - name: Create the .env settings file 16 | run: cp .env.example .env.dev 17 | 18 | - name: Update .env.dev file 19 | run: | 20 | echo 'DEBUG=on' >> .env.dev 21 | 22 | - name: Build Image 23 | run: docker compose up --build -d 24 | 25 | - name: Run tests 26 | run: docker compose exec doctor python -m unittest 27 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pipenv 85 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 86 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 87 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 88 | # install all needed dependencies. 89 | #Pipfile.lock 90 | 91 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 92 | __pypackages__/ 93 | 94 | # Celery stuff 95 | celerybeat-schedule 96 | celerybeat.pid 97 | 98 | # SageMath parsed files 99 | *.sage.py 100 | 101 | # Environments 102 | .venv 103 | env/ 104 | venv/ 105 | ENV/ 106 | env.bak/ 107 | venv.bak/ 108 | 109 | # Spyder project settings 110 | .spyderproject 111 | .spyproject 112 | 113 | # Rope project settings 114 | .ropeproject 115 | 116 | # mkdocs documentation 117 | /site 118 | 119 | # mypy 120 | .mypy_cache/ 121 | .dmypy.json 122 | dmypy.json 123 | 124 | # Pyre type checker 125 | .pyre/ 126 | 127 | # Ignore JetBrains files 128 | .idea 129 | 130 | # Env file 131 | .env.dev 132 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # See https://pre-commit.com for more information 2 | # See https://pre-commit.com/hooks.html for more hooks 3 | exclude: migrations 4 | repos: 5 | - repo: https://github.com/pre-commit/pre-commit-hooks 6 | rev: v4.5.0 7 | hooks: 8 | - id: check-added-large-files 9 | - id: check-ast 10 | - id: check-json 11 | - id: check-merge-conflict 12 | - id: check-toml 13 | - id: check-xml 14 | - id: check-yaml 15 | - id: debug-statements 16 | - id: detect-private-key 17 | - id: fix-byte-order-marker 18 | - id: fix-encoding-pragma 19 | args: [--remove] 20 | - id: trailing-whitespace 21 | args: [--markdown-linebreak-ext=md] 22 | 23 | - repo: https://github.com/astral-sh/ruff-pre-commit 24 | rev: v0.11.8 25 | hooks: 26 | - id: ruff 27 | args: [ --fix ] 28 | - id: ruff-format 29 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## Current 2 | 3 | **0.3.1 - 2023-01-17** 4 | 5 | Features: 6 | - Adds /convert/pdf/thumbnails/ service that returns a zip file with thumbnails from a PDF document. 
7 | 8 | ## Previous Versions 9 | 10 | **0.3.0 - 2022-09-30** 11 | 12 | Features: 13 | - Code cleanup and reformatting. 14 | - Documentation enhancements. 15 | - Removal of dead code. 16 | 17 | Changes: 18 | - Removes a number of URLs that were duplicative: 19 | - `/extract/pdf/text/`: Use `/extract/doc/text/` instead. They ran the same code under the covers, but their returns are slightly different. Generally, if you used something like `response.text` before, you should use `response.json()["content"]` now. 20 | - `/utils/file/mime/`: Use `/utils/mime-type/` instead. If you used `response.json()["mime"]` before, use `response.json()["mimetype"]` now. 21 | - These endpoints were never documented: 22 | - `/text/`, `/document/pdf-to-text/`, and `/extract-doc-content/`: Use `/extract/doc/text/` instead. 23 | - `/pg-count/` and `/document/page_count/`: Use `/utils/page-count/pdf/` instead. 24 | - `/mime-type/`: Use `/utils/mime-type/` instead. 25 | - `/image-to-pdf/`: Use `/convert/image/pdf/` instead. 26 | - `/images-to-pdf/`: Use `/convert/images/pdf/` instead. 27 | - `/thumbnail/`: Use `/convert/pdf/thumbnail/` instead. 28 | - `/convert-audio/`: Use `/convert/audio/mp3/` instead. 29 | - `/document/thumbnail/`: Use `/convert/pdf/thumbnail/` instead. 30 | - Tweaks the tests to use new container names that are less likely to conflict with existing containers. 31 | 32 | **0.2.16 - 2022-09-28** 33 | 34 | Features: 35 | - Adds /utils/document-number/pdf/ service that returns the PACER document number from a RECAP PDF document. 36 | 37 | **0.2.15 - 2022-07-27** 38 | 39 | Fixes: 40 | - Adds PyCryptodome in order to handle encrypted PDFs ([144](https://github.com/freelawproject/doctor/issues/144)) 41 | 42 | **0.2.14 - 2022-07-26** 43 | 44 | Features: 45 | - Adds sentry integration 46 | - Adds django-environ to allow environment variables for Django settings 47 | 48 | **0.2.13 - 2022-06-02** 49 | 50 | This release is focused on performance improvements and easier scaling. 
It: 51 | 52 | - Disables multi-threaded tesseract code. This makes it easier to scale doctor in a k8s environment due to at most one CPU being used per conversion. 53 | - Sets the number of gunicorn workers to 1 by default. This makes it so that scaling can be moved to k8s instead of gunicorn. 54 | - Tells tesseract not to look for white text on black backgrounds. This is just a simple performance tweak. 55 | - Upgrades to PyPDF2 version 2.0.0. 56 | 57 | **0.2.12 - 2022-05-19** 58 | 59 | Features: 60 | - Add an even better encoding for extract_from_html 61 | 62 | **0.2.11 - 2022-05-12** 63 | 64 | Features: 65 | - Add even better encoding for extract_from_html 66 | - Add better error message 67 | 68 | **0.2.10 - 2022-05-02** 69 | 70 | Features: 71 | - Adds better encoding for extract_from_html 72 | - Bump seal-rookery to 2.2.1 73 | - Update seal-rookery call 74 | 75 | **0.2.9 - 2022-04-19** 76 | 77 | Features: 78 | - Fix for mime type detection for weird PDF failures 79 | - Test for broken PDFs 80 | 81 | **0.2.8 - 2022-04-14** 82 | 83 | Features: 84 | - Drop m1 specific docker builds. 85 | - Return 406's when validation of forms fails 86 | - Add tests for incomplete post requests to the server. 87 | - Reduce build installs and build install time. 88 | 89 | **0.2.7 - 2022-04-12** 90 | 91 | Features: 92 | - Bump seal-rookery to speed up builds. 93 | - Add m1 build in Makefile. 94 | 95 | **0.2.6 - 2022-04-12** 96 | 97 | Fixes: 98 | - Add additional workers and worker resets to the gunicorn configuration. The 99 | default is now four workers, and additional ones can be created with the 100 | DOCTOR_WORKERS env.
101 | 102 | **0.2.5 - 2022-03-24** 103 | 104 | Features: 105 | - Add two new endpoints 106 | - Extensions from blob 107 | - Mime type from blob 108 | 109 | Changes: 110 | - Drop NGINX 111 | - Combine installation 112 | 113 | 114 | **0.2.4 - 2022-03-23** 115 | 116 | Features: 117 | - Refactor document/extract/ endpoint to return json and drop cookies 118 | 119 | Changes: 120 | - Fix dockerfile update-seals 121 | - Drop cookie support and use JSON responses when necessary 122 | - Update tests 123 | - Update heartbeat to match disclosure endpoint 124 | 125 | **0.2.3 - 2022-03-22** 126 | 127 | Features: 128 | - Update type of response object 129 | - Drop json response success = False if invalid form and just return Bad Request 130 | 131 | Changes: 132 | 133 | 134 | **0.2.2 - 2022-03-21** 135 | 136 | Features: 137 | - Split audio conversion into two steps: first convert to mp3 138 | and a second method to fetch audio duration.. 139 | 140 | Changes: 141 | - Update readme. 142 | - Bump version to 0.2.2 143 | - Update tests for new endpoint. 144 | 145 | 146 | **0.2.1 - 2022-03-18** 147 | 148 | Features: 149 | - Update nginx config for longer timeouts 150 | 151 | Changes: 152 | - Update nginx config for longer timeouts 153 | - Bump python version for linting 154 | - Fix typo in DEVELOPING.md 155 | 156 | **0.2.0 - 2022-03-16** 157 | 158 | Features: 159 | - Greatly improved documentation 160 | - Improved speed 161 | 162 | Changes: 163 | - Overhauled the entire codebase 164 | - Dropped seal-rookery image 165 | - Switched to Django and gunicorn from uWSGI and Flask 166 | - Completed api tests 167 | - Added Makefile for building and pushing 168 | - Updated NGINX config 169 | - Added DEVELOPING.md 170 | - Added composefile for testing with or without docker networking 171 | - Removed financial disclosures (coming soon as a separate project). 172 | - General improvements and cleanup. 173 | - Add support for multiple architectures. 
(linux/amd64,linux/arm64) 174 | - Added changelog 175 | 176 | 177 | **0.1.0 - 2021-11-08** 178 | 179 | 180 | **0.0.36 - 2021-05-11** 181 | 182 | 183 | **0.0.36 - 2021-03-17** 184 | -------------------------------------------------------------------------------- /DEVELOPING.md: -------------------------------------------------------------------------------- 1 | This is a microservice, so tests are designed to be run from a mock web 2 | application that calls to this service. 3 | 4 | ## Quick start 5 | 6 | To build the microservice and start it up, run: 7 | 8 | docker compose up --build -d 9 | 10 | To see logs: 11 | 12 | docker compose logs -f 13 | 14 | If you want to see debug logs, set `DEBUG` to `True` in `settings.py`. 15 | 16 | 17 | ## Testing 18 | 19 | Once the above compose file is running, you can use the `mock_web_app` 20 | container to run the tests against the `doctor` container: 21 | 22 | docker exec -it mock_web_app python3 -m unittest doctor.tests 23 | 24 | 25 | ## Building Images 26 | 27 | Generally, images are automatically built and pushed to the docker repo when 28 | PRs are merged. If it needs to happen manually, try this: 29 | 30 | `make image --file docker/Makefile` 31 | 32 | And pushed with: 33 | 34 | `make push --file docker/Makefile` 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 2-Clause License 2 | 3 | Copyright (c) 2020, Free Law Project 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2.
Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 20 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 22 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 23 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | Doctor 3 | ------------------------------------ 4 | 5 | Welcome to Doctor, Free Law Project's microservice for converting, extracting and modifying documents and audio files. 6 | 7 | At a high level, this service provides you with high-performance HTTP endpoints that can: 8 | 9 | - Extract text from various types of documents 10 | - Convert audio files from one format to another while stripping messy metadata 11 | - Create thumbnails of PDFs 12 | - Provide metadata about PDFs 13 | 14 | Under the hood, Doctor uses gunicorn to connect to a django service. The django service uses 15 | carefully configured implementations of `ffmpeg`, `pdftotext`, `tesseract`, `ghostscript`, and a 16 | number of other converters. 
17 | 18 | 19 | Quick Start 20 | ----------- 21 | 22 | Assuming you have docker installed run: 23 | 24 | docker run -d -p 5050:5050 freelawproject/doctor:latest 25 | 26 | This will expose the endpoints on port 5050 with one gunicorn worker. This is usually ideal because it allows you to horizontally scale Doctor using an orchestration system like Kubernetes. 27 | 28 | If you are not using a system that supports horizontal scaling, you may wish to have more gunicorn workers so that Doctor can handle more simultaneous tasks. To set that up, simply set the DOCTOR_WORKERS environment variable: 29 | 30 | docker run -d -p 5050:5050 -e DOCTOR_WORKERS=16 freelawproject/doctor:latest 31 | 32 | If you are doing OCR or audio conversion, scaling through a system like Kubernetes or by giving Doctor many workers becomes particularly important. If it does not have a worker available, your call to Doctor will probably time out. 33 | 34 | After the image is running, you should be able to test that you have a working environment by running 35 | 36 | curl http://localhost:5050 37 | 38 | which should return a text response: 39 | 40 | Heartbeat detected. 41 | 42 | 43 | ENDPOINTS 44 | ------------- 45 | 46 | ## Overview 47 | 48 | The service currently supports the following tools: 49 | 50 | 1. Extract text from PDF, RTF, DOC, DOCX, or WPD, HTML, TXT files. 51 | 1. OCR text from a scanned PDF. 52 | 1. Get page count for a PDF document. 53 | 1. Check for bad redactions in a PDF document. 54 | 1. Convert audio files from wma, ogg, wav to MP3. 55 | 1. Create a thumbnail of the first page of a PDF (for use in Open Graph tags) 56 | 1. Convert an image or images to a PDF. 57 | 1. Identify the mime type of a file. 58 | 59 | 60 | A brief description and curl command for each endpoint is provided below. 61 | 62 | ## Extractors 63 | 64 | ### Endpoint: /extract/doc/text/ 65 | 66 | Given a document, extract out the text and assorted metadata.
Supports the following document types: 67 | 68 | - `pdf` - Adobe portable document format files, via `pdftotext`. 69 | - `doc` - Word document files, via `antiword`. 70 | - `docx` - Open Office XML files, via `docx2txt`. 71 | - `html` - HTML files, via `lxml.html.clean.Cleaner`. Strips out dangerous tags and hoists their contents to their parent. Hoisted tags include: `a`, `body`, `font`, `noscript`, and `img`. 72 | - `txt` - Text files. This attempts to normalize all encoding questions to utf-8. First, we try cp1251, then utf-8, ignoring errors. 73 | - `wpd` - Word Perfect files, via `wpd2html` followed by cleaning the HTML as above. 74 | 75 | ```bash 76 | curl 'http://localhost:5050/extract/doc/text/' \ 77 | -X 'POST' \ 78 | -F "file=@doctor/test_assets/vector-pdf.pdf" 79 | ``` 80 | 81 | Parameters: 82 | 83 | - `ocr_available`: Whether doctor should use tesseract to provide OCR services for the document. OCR is always possible in doctor, but sometimes you won't want to use it, since it can be slow. If you want it disabled for this request, omit this optional parameter. To enable it, set ocr_available to `True`: 84 | 85 | ```bash 86 | curl 'http://localhost:5050/extract/doc/text/?ocr_available=True' \ 87 | -X 'POST' \ 88 | -F "file=@doctor/test_assets/image-pdf.pdf" 89 | ``` 90 | 91 | Magic: 92 | 93 | - The mimetype of the file will be determined by the name of the file you pass in. For example, if you pass in medical_assessment.pdf, the `pdf` extractor will be used. 94 | 95 | Valid requests will receive a JSON response with the following keys: 96 | 97 | - `content`: The utf-8 encoded text of the file 98 | - `err`: An error message, if one should occur. 99 | - `extension`: The sniffed extension of the file. 100 | - `extracted_by_ocr`: Whether OCR was needed and used during processing. 101 | - `page_count`: The number of pages, if it applies. 
102 | 103 | ### Endpoint: /extract/recap/text/ 104 | 105 | Given a RECAP pdf, extract out the text using PDF Plumber, OCR or a combination of the two. 106 | 107 | Parameters: 108 | 109 | - `strip_margin`: Whether doctor should crop the edges of the recap document during processing. With PDF plumber it will ignore traditional 1 inch margin. With an OCR it lowers the threshold for hiding OCR gibberish. To enable it, set strip_margin to `True`: 110 | 111 | ```bash 112 | curl 'http://localhost:5050/extract/recap/text/?strip_margin=True' \ 113 | -X 'POST' \ 114 | -F "file=@doctor/test_assets/recap_extract/gov.uscourts.cacd.652774.40.0.pdf" 115 | ``` 116 | 117 | Valid requests will receive a JSON response with the following keys: 118 | 119 | - `content`: The utf-8 encoded text of the file 120 | - `extracted_by_ocr`: Whether OCR was needed and used during processing. 121 | 122 | 123 | ## Utilities 124 | 125 | ### Endpoint: /utils/page-count/pdf/ 126 | 127 | This method takes a document and returns the page count. 128 | 129 | curl 'http://localhost:5050/utils/page-count/pdf/' \ 130 | -X 'POST' \ 131 | -F "file=@doctor/test_assets/image-pdf.pdf" 132 | 133 | This will return an HTTP response with page count. In the above example it would return __2__. 134 | 135 | ### Endpoint: /utils/check-redactions/pdf/ 136 | 137 | This method takes a document and returns the bounding boxes of bad 138 | redactions as well as any discovered text. 139 | 140 | curl 'http://localhost:5050/utils/check-redactions/pdf/' \ 141 | -X 'POST' \ 142 | -F "file=@doctor/test_assets/x-ray/rectangles_yes.pdf" 143 | 144 | returns a JSON response with bounding box(es) and text recovered.
145 | ``` 146 | { 147 | "error": false, 148 | "results": { 149 | "1": [ 150 | { 151 | "bbox": [ 152 | 412.54998779296875, 153 | 480.6099853515625, 154 | 437.8699951171875, 155 | 494.39996337890625 156 | ], 157 | "text": "“No”" 158 | }, 159 | { 160 | "bbox": [ 161 | 273.3500061035156, 162 | 315, 163 | 536.8599853515625, 164 | 328.79998779296875 165 | ], 166 | "text": "“Yes”, but did not disclose all relevant medical history" 167 | }, 168 | { 169 | "bbox": [ 170 | 141.22999572753906, 171 | 232.20001220703125, 172 | 166.54998779296875, 173 | 246 174 | ], 175 | "text": "“No”" 176 | } 177 | ] 178 | } 179 | } 180 | ``` 181 | 182 | The "error" field is set if there was an issue processing the PDF. 183 | 184 | If "results" is empty there were no bad redactions found otherwise it 185 | is a list of bounding box along with the text recovered. 186 | 187 | See: https://github.com/freelawproject/x-ray/#readme 188 | 189 | ### Endpoint: /utils/mime-type/ 190 | 191 | This method takes a document and returns the mime type. 192 | 193 | curl 'http://localhost:5050/utils/mime-type/?mime=False' \ 194 | -X 'POST' \ 195 | -F "file=@doctor/test_assets/image-pdf.pdf" 196 | 197 | returns as JSON response identifying the document type 198 | 199 | {"mimetype": "PDF document, version 1.3"} 200 | 201 | and 202 | 203 | curl 'http://localhost:5050/utils/mime-type/?mime=True' \ 204 | -X 'POST' \ 205 | -F "file=@doctor/test_assets/image-pdf.pdf" 206 | 207 | returns as JSON response identifying the document type 208 | 209 | {"mimetype": "application/pdf"} 210 | 211 | Another example 212 | 213 | curl 'http://localhost:5050/utils/mime-type/?mime=True' \ 214 | -X 'POST' \ 215 | -F "file=@doctor/test_assets/word-doc.doc" 216 | 217 | returns 218 | 219 | {"mimetype": "application/msword"} 220 | 221 | This method is useful for identifying the type of document, incorrect documents and weird documents. 
222 | 223 | ### Endpoint: /utils/add/text/pdf/ 224 | 225 | This method will take an image PDF and return the PDF with transparent text overlayed on the document. 226 | This allows users to copy and paste (more or less) from our OCRd text. 227 | 228 | curl 'http://localhost:5050/utils/add/text/pdf/' \ 229 | -X 'POST' \ 230 | -F "file=@doctor/test_assets/image-pdf.pdf" \ 231 | -o image-pdf-with-embedded-text.pdf 232 | 233 | ### Endpoint: /utils/audio/duration/ 234 | 235 | This endpoint returns the duration of an MP3 file. 236 | 237 | curl 'http://localhost:5050/utils/audio/duration/' \ 238 | -X 'POST' \ 239 | -F "file=@doctor/test_assets/1.mp3" 240 | 241 | ### Endpoint: /utils/document-number/pdf/ 242 | 243 | This method takes a document from the federal filing system and returns its document entry number. 244 | 245 | curl 'http://localhost:5050/utils/document-number/pdf/' \ 246 | -X 'POST' \ 247 | -F "file=@doctor/test_assets/recap_documents/ca2_1-1.pdf" 248 | 249 | This will return an HTTP response with the document number. In the above example it would return __1-1__. 250 | 251 | 252 | ## Converters 253 | 254 | ### Endpoint: /convert/image/pdf/ 255 | 256 | Given an image of indeterminate length, this endpoint will convert it to a pdf with reasonable page breaks. This is meant for extremely long images that represent multi-page documents, but can be used to convert a smaller image to a one-page PDF. 257 | 258 | curl 'http://localhost:5050/convert/image/pdf/' \ 259 | -X 'POST' \ 260 | -F "file=@doctor/test_assets/long-image.tiff" \ 261 | --output test-image-to-pdf.pdf 262 | 263 | Keep in mind that this curl will write the file to the current directory. 264 | 265 | ### Endpoint: /convert/images/pdf/ 266 | 267 | Given a list of urls for images, this endpoint will convert them to a pdf. This can be used to convert multiple images to a multi-page PDF. We use this to convert financial disclosure images to simple PDFs. 
268 | 269 | curl 'http://localhost:5050/convert/images/pdf/?sorted_urls=%5B%22https%3A%2F%2Fcom-courtlistener-storage.s3-us-west-2.amazonaws.com%2Ffinancial-disclosures%2F2011%2FA-E%2FArmstrong-SB%2520J3.%252009.%2520CAN_R_11%2FArmstrong-SB%2520J3.%252009.%2520CAN_R_11_Page_1.tiff%22%2C+%22https%3A%2F%2Fcom-courtlistener-storage.s3-us-west-2.amazonaws.com%2Ffinancial-disclosures%2F2011%2FA-E%2FArmstrong-SB%2520J3.%252009.%2520CAN_R_11%2FArmstrong-SB%2520J3.%252009.%2520CAN_R_11_Page_2.tiff%22%5D' \ 270 | -X POST \ 271 | -o image.pdf 272 | 273 | This returns the binary data of the pdf. 274 | 275 | 276 | ### Endpoint: /convert/pdf/thumbnail/ 277 | 278 | Thumbnail takes a pdf and returns a png thumbnail of the first page. 279 | 280 | curl 'http://localhost:5050/convert/pdf/thumbnail/' \ 281 | -X 'POST' \ 282 | -F "file=@doctor/test_assets/image-pdf.pdf" \ 283 | -o test-thumbnail.png 284 | 285 | This returns the binary data of the thumbnail. 286 | 287 | Keep in mind that this curl will also write the file to the current directory. 288 | 289 | ### Endpoint: /convert/pdf/thumbnails/ 290 | 291 | Given a PDF and a range of pages, this endpoint will return a zip file containing thumbnails 292 | for each page requested. This endpoint also takes an optional parameter called max_dimension, 293 | this property scales the long side of each thumbnail (width for landscape pages, height for 294 | portrait pages) to fit in the specified number of pixels. 295 | 296 | For example if you want thumbnails for the first four pages: 297 | 298 | curl 'http://localhost:5050/convert/pdf/thumbnails/' \ 299 | -X 'POST' \ 300 | -F "file=@doctor/test_assets/vector-pdf.pdf" \ 301 | -F 'pages="[1,2,3,4]"' \ 302 | -F 'max_dimension=350' \ 303 | -o thumbnails.zip 304 | 305 | This will return four thumbnails in a zip file. 306 | 307 | ### Endpoint: /convert/audio/mp3/ 308 | 309 | This endpoint takes an audio file and converts it to an MP3 file.
This is used to convert different audio formats 310 | from courts across the country and standardizes the format for our end users. 311 | 312 | This endpoint also adds the SEAL of the court to the MP3 file and updates the metadata to reflect our updates. 313 | 314 | curl 'http://localhost:5050/convert/audio/mp3/?audio_data=%7B%22court_full_name%22%3A+%22Testing+Supreme+Court%22%2C+%22court_short_name%22%3A+%22Testing+Supreme+Court%22%2C+%22court_pk%22%3A+%22test%22%2C+%22court_url%22%3A+%22http%3A%2F%2Fwww.example.com%2F%22%2C+%22docket_number%22%3A+%22docket+number+1+005%22%2C+%22date_argued%22%3A+%222020-01-01%22%2C+%22date_argued_year%22%3A+%222020%22%2C+%22case_name%22%3A+%22SEC+v.+Frank+J.+Custable%2C+Jr.%22%2C+%22case_name_full%22%3A+%22case+name+full%22%2C+%22case_name_short%22%3A+%22short%22%2C+%22download_url%22%3A+%22http%3A%2F%2Fmedia.ca7.uscourts.gov%2Fsound%2Fexternal%2Fgw.15-1442.15-1442_07_08_2015.mp3%22%7D' \ 315 | -X 'POST' \ 316 | -F "file=@doctor/test_assets/1.wma" 317 | 318 | This returns the audio file as a file response. 319 | 320 | ### Endpoint: /convert/audio/ogg/ 321 | 322 | This endpoint takes an audio file and converts it to an OGG file. The conversion process downsizes files by using 323 | a single audio channel and fixing the sampling rate to 8 kHz. 324 | 325 | This endpoint also optimizes the output for voice over IP applications. 326 | 327 | curl 'http://localhost:5050/convert/audio/ogg/' \ 328 | -X 'POST' \ 329 | -F "file=@doctor/test_assets/1.wma" 330 | 331 | This returns the audio file as a file response. 332 | 333 | 334 | ## Testing 335 | 336 | Testing is designed to be run with the `compose.yaml` file. To see more about testing 337 | checkout the DEVELOPING.md file. 338 | 339 | ## Sentry Logging 340 | 341 | For debugging purposes, it's possible to set your Sentry DSN to send events to Sentry. 342 | By default, no SENTRY_DSN is set and no events will be sent to Sentry. 
343 | To use Sentry set the SENTRY_DSN environment variable to your DSN. Using Docker you can set it with: 344 | 345 | docker run -d -p 5050:5050 -e SENTRY_DSN= freelawproject/doctor:latest 346 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | You can find our VDP here: https://free.law/vulnerability-disclosure-policy/ 2 | -------------------------------------------------------------------------------- /compose.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | doctor: 3 | container_name: doctor 4 | build: 5 | dockerfile: docker/Dockerfile 6 | context: . 7 | args: 8 | options: --reload 9 | image: freelawproject/doctor:latest 10 | ports: 11 | - 5050:5050 12 | volumes: 13 | - .:/opt/app 14 | env_file: 15 | - .env.dev 16 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | # Note: Force M1 to emulate amd64 2 | FROM --platform=linux/amd64 python:3.10 3 | 4 | # Install uv 5 | # https://docs.astral.sh/uv/guides/integration/docker/#installing-uv 6 | COPY --from=ghcr.io/astral-sh/uv:0.7 /uv /uvx /bin/ 7 | 8 | # Install apt dependencies 9 | # caching: https://docs.docker.com/build/cache/optimize/#use-cache-mounts 10 | RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ 11 | --mount=type=cache,target=/var/lib/apt,sharing=locked \ 12 | apt-get update --quiet=2 && \ 13 | apt-get install -y --no-install-recommends apt-utils && \ 14 | apt-get install -y \ 15 | build-essential \ 16 | curl \ 17 | libjpeg-dev \ 18 | libleptonica-dev \ 19 | libtesseract-dev \ 20 | libz-dev \ 21 | poppler-utils \ 22 | qpdf \ 23 | tesseract-ocr \ 24 | && \ 25 | apt-get install \ 26 | --no-install-recommends \ 27 | --assume-yes \ 28 | --quiet=2 \ 29 | `# Document extraction and 
OCR tools` \ 30 | antiword \ 31 | docx2txt \ 32 | ghostscript \ 33 | libwpd-tools \ 34 | `# Audio extraction/manipulation tools` \ 35 | ffmpeg \ 36 | libmagic1 \ 37 | `# Image & OCR tools` \ 38 | imagemagick \ 39 | `# Other dependencies` \ 40 | libffi-dev \ 41 | libxml2-dev \ 42 | libxslt-dev 43 | 44 | # set environment variables 45 | ENV PYTHONDONTWRITEBYTECODE=1 \ 46 | PYTHONUNBUFFERED=1 \ 47 | # Disable tesseract multithreading for more scalable performance and 48 | # faster overall performance 49 | OMP_THREAD_LIMIT=1 50 | 51 | WORKDIR /code 52 | 53 | # Install Python dependencies 54 | COPY pyproject.toml uv.lock . 55 | # https://docs.astral.sh/uv/guides/integration/docker/#caching 56 | ENV UV_COMPILE_BYTECODE=1 \ 57 | UV_LINK_MODE=copy \ 58 | UV_PROJECT_ENVIRONMENT=/venv \ 59 | PATH="/venv/bin:$PATH" 60 | RUN --mount=type=cache,target=/root/.cache/uv \ 61 | uv sync 62 | 63 | COPY . . 64 | 65 | EXPOSE 5050 66 | 67 | ARG options 68 | ENV OPTIONS $options 69 | 70 | CMD gunicorn $OPTIONS doctor.wsgi:application \ 71 | --workers ${DOCTOR_WORKERS:-1} \ 72 | --max-requests 1000 \ 73 | --max-requests-jitter 100 \ 74 | --timeout 5400 \ 75 | --bind 0.0.0.0:5050 76 | -------------------------------------------------------------------------------- /docker/Makefile: -------------------------------------------------------------------------------- 1 | # Run with make push --file docker/Makefile -e VERSION=$(git rev-parse --short HEAD) 2 | # Note that makefiles differentiate between tabs and spaces in a weird way! 3 | 4 | # Ensure VERSION is set. 5 | ifndef VERSION 6 | $(error VERSION variable is not set. Use -e VERSION=XYZ to proceed.) 
7 | endif 8 | 9 | DOCKER_REPOSITORY ?= freelawproject/doctor 10 | 11 | DOCKER ?= docker 12 | export DOCKER 13 | 14 | .PHONY: all image push multiarch_push multiarch_image 15 | 16 | UNAME := $(shell uname -m) 17 | 18 | all: image 19 | 20 | image: 21 | $(DOCKER) build -t $(DOCKER_REPOSITORY):$(VERSION) -t $(DOCKER_REPOSITORY):latest --file docker/Dockerfile . 22 | 23 | push: image 24 | $(info Checking if valid architecture) 25 | @if [ $(UNAME) = "x86_64" ]; then \ 26 | echo "Architecture is OK. Pushing.";\ 27 | $(DOCKER) push $(DOCKER_REPOSITORY):$(VERSION);\ 28 | $(DOCKER) push $(DOCKER_REPOSITORY):latest;\ 29 | else \ 30 | echo "Only arm64 machines can push single-architecture builds. If you want to \ 31 | push a build, try 'make multiarch_push', which builds for both arm64 and amd64. This \ 32 | protects against arm64 builds being accidentally deployed to the server (which uses arm64).";\ 33 | fi 34 | 35 | multiarch_image: 36 | export DOCKER_CLI_EXPERIMENTAL=enabled 37 | $(DOCKER) buildx rm 38 | $(DOCKER) buildx create --use --name flp-builder 39 | $(DOCKER) buildx build --platform linux/amd64,linux/arm64 -t $(DOCKER_REPOSITORY):latest -t $(DOCKER_REPOSITORY):$(VERSION) --file docker/Dockerfile . 40 | 41 | multiarch_push: multiarch_image 42 | $(DOCKER) buildx build --push --platform linux/amd64,linux/arm64 -t $(DOCKER_REPOSITORY):latest -t $(DOCKER_REPOSITORY):$(VERSION) --file docker/Dockerfile . 43 | 44 | x86_push: 45 | export DOCKER_CLI_EXPERIMENTAL=enabled 46 | $(DOCKER) buildx rm 47 | $(DOCKER) buildx create --use --name flp-builder 48 | $(DOCKER) buildx build --push --platform linux/amd64 -t $(DOCKER_REPOSITORY):latest -t $(DOCKER_REPOSITORY):$(VERSION) --file docker/Dockerfile . 
49 | -------------------------------------------------------------------------------- /doctor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freelawproject/doctor/94c2ec669d033b0f54b6477b0e07882189e6e474/doctor/__init__.py -------------------------------------------------------------------------------- /doctor/assets/producer-300x300.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freelawproject/doctor/94c2ec669d033b0f54b6477b0e07882189e6e474/doctor/assets/producer-300x300.png -------------------------------------------------------------------------------- /doctor/forms.py: -------------------------------------------------------------------------------- 1 | import json 2 | import tempfile 3 | import uuid 4 | 5 | from django import forms 6 | from django.core.exceptions import ValidationError 7 | from django.core.validators import FileExtensionValidator 8 | 9 | 10 | class BaseAudioFile(forms.Form): 11 | file = forms.FileField(label="document", required=True) 12 | 13 | 14 | class BaseFileForm(forms.Form): 15 | """""" 16 | 17 | file = forms.FileField(label="document", required=True) 18 | 19 | def temp_save_file(self, fp): 20 | with open(fp, "wb") as f: 21 | for chunk in self.cleaned_data["file"].chunks(): 22 | f.write(chunk) 23 | 24 | def clean_file(self): 25 | file = self.cleaned_data.get("file", False) 26 | if not file: 27 | raise ValidationError("File is missing.") 28 | self.cleaned_data["extension"] = file.name.split(".")[-1] 29 | self.cleaned_data["original_filename"] = file.name 30 | self.prep_file() 31 | return file 32 | 33 | def prep_file(self): 34 | with tempfile.NamedTemporaryFile( 35 | delete=False, suffix=f".{self.cleaned_data['extension']}" 36 | ) as fp: 37 | self.cleaned_data["tmp_dir"] = tempfile.TemporaryDirectory() 38 | self.cleaned_data["fp"] = fp.name 39 | self.temp_save_file(fp.name) 40 | 41 | 42 | 
class AudioForm(BaseAudioFile): 43 | """""" 44 | 45 | audio_data = forms.JSONField(label="audio-data", required=False) 46 | 47 | def clean(self): 48 | self.cleaned_data["fp"] = f"/tmp/audio_{uuid.uuid4().hex}" 49 | if self.cleaned_data.get("file", None): 50 | filename = self.cleaned_data["file"].name 51 | self.cleaned_data["extension"] = filename.split(".")[-1] 52 | return self.cleaned_data 53 | 54 | 55 | class ImagePdfForm(forms.Form): 56 | sorted_urls = forms.CharField(required=True, label="sorted-urls") 57 | 58 | def clean(self): 59 | self.cleaned_data["sorted_urls"] = json.loads( 60 | self.cleaned_data["sorted_urls"] 61 | ) 62 | return self.cleaned_data 63 | 64 | 65 | class MimeForm(forms.Form): 66 | file = forms.FileField(label="document", required=False) 67 | mime = forms.BooleanField(label="mime", required=False) 68 | 69 | def clean(self): 70 | file = self.cleaned_data.get("file", False) 71 | if not file: 72 | raise ValidationError("File is missing.") 73 | 74 | self.cleaned_data["filename"] = "unknown" 75 | 76 | 77 | class ThumbnailForm(forms.Form): 78 | file = forms.FileField( 79 | label="document", 80 | required=True, 81 | validators=[FileExtensionValidator(["pdf"])], 82 | ) 83 | max_dimension = forms.IntegerField(label="max-dimension", required=False) 84 | pages = forms.Field(label="pages", required=False) 85 | 86 | def clean(self): 87 | """""" 88 | if self.cleaned_data.get("pages"): 89 | self.cleaned_data["pages"] = json.loads(self.cleaned_data["pages"]) 90 | 91 | if not self.cleaned_data["max_dimension"]: 92 | self.cleaned_data["max_dimension"] = 350 93 | return self.cleaned_data 94 | 95 | 96 | class DocumentForm(BaseFileForm): 97 | ocr_available = forms.BooleanField(label="ocr-available", required=False) 98 | mime = forms.BooleanField(label="mime", required=False) 99 | strip_margin = forms.BooleanField(label="strip-margin", required=False) 100 | 101 | def clean(self): 102 | self.clean_file() 103 | return self.cleaned_data 104 | 
-------------------------------------------------------------------------------- /doctor/lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freelawproject/doctor/94c2ec669d033b0f54b6477b0e07882189e6e474/doctor/lib/__init__.py -------------------------------------------------------------------------------- /doctor/lib/mojibake.py: -------------------------------------------------------------------------------- 1 | from django.utils.encoding import smart_str 2 | 3 | 4 | def fix_mojibake(text): 5 | """Given corrupt text from pdffactory, converts it to sane text.""" 6 | 7 | letter_map = { 8 | "¿": "a", 9 | "¾": "b", 10 | "½": "c", 11 | "¼": "d", 12 | "»": "e", 13 | "º": "f", 14 | "¹": "g", 15 | "¸": "h", 16 | "·": "i", 17 | "¶": "j", 18 | "μ": "k", 19 | "´": "l", 20 | "³": "m", 21 | "²": "n", 22 | "±": "o", 23 | "°": "p", 24 | "¯": "q", 25 | "®": "r", 26 | "-": "s", 27 | "¬": "t", 28 | "«": "u", 29 | "ª": "v", 30 | "©": "w", 31 | "¨": "x", 32 | "§": "y", 33 | "¦": "z", 34 | "ß": "A", 35 | "Þ": "B", 36 | "Ý": "C", 37 | "Ü": "D", 38 | "Û": "E", 39 | "Ú": "F", 40 | "Ù": "G", 41 | "Ø": "H", 42 | "×": "I", 43 | "Ö": "J", 44 | "Õ": "K", 45 | "Ô": "L", 46 | "Ó": "M", 47 | "Ò": "N", 48 | "Ñ": "O", 49 | "Ð": "P", 50 | "Î": "R", 51 | "Í": "S", 52 | "Ì": "T", 53 | "Ë": "U", 54 | "Ê": "V", 55 | "É": "W", 56 | "": "X", # Missing 57 | "Ç": "Y", 58 | "Æ": "Z", 59 | "ð": "0", 60 | "ï": "1", 61 | "î": "2", 62 | "í": "3", 63 | "ì": "4", 64 | "ë": "5", 65 | "ê": "6", 66 | "é": "7", 67 | "è": "8", 68 | "ç": "9", 69 | "ò": ".", 70 | "ô": ",", 71 | "æ": ":", 72 | "å": ";", 73 | "Ž": "'", 74 | "•": "'", # s/b double quote, but identical to single. 75 | "Œ": "'", # s/b double quote, but identical to single. 
76 | "ó": "-", # dash 77 | "Š": "-", # n-dash 78 | "‰": "--", # em-dash 79 | "ú": "&", 80 | "ö": "*", 81 | "ñ": "/", 82 | "÷": ")", 83 | "ø": "(", 84 | "Å": "[", 85 | "Ã": "]", 86 | "‹": "•", 87 | } 88 | 89 | plaintext = "" 90 | for letter in text: 91 | try: 92 | plaintext += letter_map[letter] 93 | except KeyError: 94 | try: 95 | plaintext += smart_str(letter) 96 | except UnicodeEncodeError: 97 | continue 98 | 99 | return plaintext 100 | -------------------------------------------------------------------------------- /doctor/lib/text_extraction.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import pandas as pd 4 | import pdfplumber 5 | import pytesseract 6 | from pdfplumber.ctm import CTM 7 | from PIL import Image 8 | from pytesseract import Output 9 | 10 | 11 | def is_skewed(obj: dict) -> bool: 12 | """Check if a PDF plumber dict is skewed 13 | 14 | CTM stands for current transformation matrix. 15 | Pdf plumber has a method to calculate the angle of text which we use here 16 | 17 | Traditionally this is only seen in circular stamps which confuses the 18 | content, or in perpendicular text of the ninth circuit courts which also 19 | confuses the text. 
20 | 21 | :param obj: dictionary from pdfplumber for each word 22 | :return: if the text should be returned 23 | """ 24 | if (matrix := obj.get("matrix")) is None: 25 | return True 26 | 27 | # Remove Skew 28 | my_char_ctm = CTM(*matrix) 29 | return my_char_ctm.skew_x == 0 30 | 31 | 32 | def get_page_text(page: pdfplumber.PDF.pages, strip_margin: bool) -> str: 33 | """Extract page text 34 | 35 | Using pdf plumber extract out the text of the document that is not 36 | skewed (ie a stamp of approval) and extract out text removing blue text 37 | 38 | Strip margin refers only to top and bottom margin here 39 | 40 | :param page: PdfPlumber page 41 | :param strip_margin: a flag to crop out the margin of a document and skewed content 42 | :return: Text from the pdf plumber page 43 | """ 44 | _, _, width, height = page.bbox 45 | if strip_margin and (height > width): 46 | # Crop margins and remove skewed text 47 | pixels_per_inch = width / 8.5 48 | bbox = ( 49 | 0, 50 | pixels_per_inch * 1, # 1 inch down from top 51 | width, # 52 | pixels_per_inch * 10, # 10 inches from top (1 inch from bottom) 53 | ) 54 | page_text = ( 55 | page.crop(bbox) 56 | .filter(is_skewed) 57 | .extract_text( 58 | layout=True, 59 | keep_blank_chars=True, 60 | y_tolerance=5, 61 | y_density=25, 62 | ) 63 | ) 64 | else: 65 | page_text = page.extract_text( 66 | layout=True, keep_blank_chars=True, y_tolerance=5, y_density=25 67 | ) 68 | page_text = remove_excess_whitespace(page_text) 69 | return page_text 70 | 71 | 72 | def has_images(page: pdfplumber.pdf.Page) -> bool: 73 | """Does the page have images that are large enough to contain text 74 | 75 | :param page: pdf plumber page 76 | :return: True if page contains images of a certain size 77 | """ 78 | return any( 79 | image 80 | for image in page.images 81 | if image["width"] > 10 and image["height"] > 10 82 | ) 83 | 84 | 85 | def has_text_annotations(page: pdfplumber.pdf.Page) -> bool: 86 | """Does the page have annotations which could contain text 87 | 
88 | :param page: pdf plumber 89 | :return: if page has annotations 90 | """ 91 | if page.annots: 92 | anno_types = [ 93 | str(annot.get("data").get("Subtype")) for annot in page.annots 94 | ] 95 | if "/'FreeText'" in anno_types or "/'Widget'" in anno_types: 96 | return True 97 | return False 98 | 99 | 100 | def adjust_caption_lines(page_text: str) -> str: 101 | """Adjust the alignment of ) or : or § used to align content 102 | 103 | § is used in texas courts 104 | : is used in NY courts 105 | ) is used in many courts 106 | 107 | :param page_text: The text of the first page 108 | :return: The page text 109 | """ 110 | for separator in [r")", "§", ":"]: 111 | pattern = rf"(.* +{re.escape(separator)} .*\n)" 112 | matches = list(re.finditer(pattern, page_text)) 113 | central_matches = [ 114 | match.group().rindex(separator) 115 | for match in matches 116 | if 30 <= match.group().rindex(separator) <= 70 117 | ] 118 | if len(central_matches) < 3: 119 | continue # Skip this separator if less than 3 matches found 120 | # Determine the longest position of the separator 121 | longest = max(central_matches) 122 | page = [] 123 | for row in page_text.splitlines(): 124 | index = row.find(f" {separator}") 125 | addition = (longest - index) * " " 126 | row = row.replace(f" {separator}", f"{addition}{separator}") 127 | page.append(row) 128 | return "\n".join(page) 129 | return page_text 130 | 131 | 132 | def page_needs_ocr(page: pdfplumber.pdf.Page, page_text: str) -> bool: 133 | """Does the page need OCR 134 | 135 | :param page:Pdf Plumber Page 136 | :param page_text: context extracted from page 137 | :return: does page need OCR 138 | """ 139 | return ( 140 | page_text.strip() == "" 141 | or "(cid:" in page_text 142 | or has_text_annotations(page) 143 | or has_images(page) 144 | or len(page.curves) > 10 145 | ) 146 | 147 | 148 | def convert_pdf_page_to_image( 149 | page: pdfplumber.pdf.Page, strip_margin: bool 150 | ) -> Image: 151 | """Convert page to image and crop margin if 
applicable 152 | 153 | :param page: the pdf page 154 | :param strip_margin: whether to crop the margin 155 | :return: The cropped page image 156 | """ 157 | img = page.to_image(resolution=300) 158 | _, _, w, h = page.bbox 159 | width = w * img.scale 160 | 161 | if strip_margin: 162 | pixels_per_inch = width / 8.5 163 | bbox = ( 164 | pixels_per_inch * 0.5, # .5" from left edge 165 | pixels_per_inch * 0.5, # .5" down from top 166 | pixels_per_inch * 8, # 8" from left edge (.5" from right) 167 | pixels_per_inch * 10.5, # 10.5" from top (.5" from bottom) 168 | ) 169 | image = img.original.crop(bbox) 170 | else: 171 | image = img.original 172 | return image 173 | 174 | 175 | def ocr_image_to_data(image: Image) -> list[pd.DataFrame]: 176 | """Perform OCR on an image to extract data 177 | 178 | Convert the image of the pdf page to OCR data 179 | :param image: Pil Image 180 | :return: A list of DataFrames, each containing OCR data for a block of text 181 | """ 182 | 183 | # Detailed Parameters for `pytesseract.image_to_data`: 184 | # - config: str 185 | # Additional Tesseract configuration options. 186 | # - `-c preserve_interword_spaces=1`: Preserve spaces between words as they appear in the image. 187 | # - `-c tessedit_do_invert=0`: Do not invert the image colors. 188 | # - `--psm 6`: Page segmentation mode 6, which assumes a single uniform block of text. 189 | # - `-l eng`: Use the English language for OCR. 190 | # - output_type: pytesseract.Output.DICT 191 | # Specifies that the output should be a dictionary of OCR data. 
192 | # 193 | # Reference: 194 | # Tesseract OCR documentation: https://github.com/tesseract-ocr/tesseract/blob/master/doc/tesseract.1.asc 195 | 196 | data_dict = pytesseract.image_to_data( 197 | image, 198 | config="-c preserve_interword_spaces=1x1 -c tessedit_do_invert=0 --psm 6 -l eng", 199 | output_type=Output.DICT, 200 | ) 201 | df = pd.DataFrame(data_dict) 202 | filtered_data = df[(df.conf != -1)] 203 | block_ids = ( 204 | filtered_data.groupby("block_num") 205 | .first() 206 | .sort_values("top") 207 | .index.tolist() 208 | ) 209 | blocks = [ 210 | filtered_data[filtered_data["block_num"] == block] 211 | for block in block_ids 212 | ] 213 | return blocks 214 | 215 | 216 | def extract_with_ocr(page: pdfplumber.pdf.Page, strip_margin: bool) -> str: 217 | """Extract the page using OCR 218 | 219 | :param page:Pdf Plumber Page 220 | :param strip_margin: If we should trim the margins 221 | :return: The extracted content for the page 222 | """ 223 | 224 | image = convert_pdf_page_to_image(page, strip_margin) 225 | data = ocr_image_to_data(image) 226 | content = "" 227 | prev = {} 228 | for words in data: 229 | for _, word in words.iterrows(): 230 | content = insert_whitespace(content, word, prev) 231 | content += get_word(word, image.size[0], strip_margin) 232 | prev = word 233 | content = cleanup_content(content, page.page_number) 234 | return content 235 | 236 | 237 | def insert_whitespace(content: str, word: dict, prev: dict) -> str: 238 | """Insert whitespace after or before word 239 | 240 | :param content: The text extracted so far 241 | :param word: The OCR extraction object 242 | :param prev: The previous word object extracted 243 | :return: The content with the whitespace appended 244 | """ 245 | is_new_line = prev.get("line_num", 0) != word["line_num"] 246 | is_new_par = prev.get("par_num", 0) != word["par_num"] 247 | prev_end = prev.get("left", 1) + prev.get("width", 1) 248 | 249 | # Add vertical whitespace 250 | if is_new_line or is_new_par: 251 | 
vertical_gap = word["top"] - ( 252 | prev.get("top", 0) + prev.get("height", 0) 253 | ) 254 | content += "\n\n" if vertical_gap > 100 else "\n" 255 | prev_end = 0 256 | 257 | # add horizontal whitespace 258 | content += " " * int((word["left"] - prev_end) / 25) 259 | return content 260 | 261 | 262 | def get_word(word_dict: dict, width: float, strip_margin: bool) -> str: 263 | """Append word to content 264 | 265 | This function determines if a word should be added to the page content 266 | and adds the word. 267 | 268 | :param word_dict: the word object from tesseract 269 | :param width: The width of the document 270 | :param strip_margin: should we strip the margin 271 | :return: The text with space 272 | """ 273 | pixels_per_inch = width / 8.5 274 | if strip_margin: 275 | left_margin = 1 * pixels_per_inch # 276 | right_margin = 7.5 * pixels_per_inch 277 | else: 278 | left_margin = 0.5 * pixels_per_inch 279 | right_margin = 8.0 * pixels_per_inch 280 | 281 | # tesseract provides confidence values for its OCR outputs. We use those 282 | # confidence values to determine if something is a good OCR output, a 283 | # likely artifact and should be excluded or is bad ocr but not an artifact. 284 | 285 | word = word_dict["text"] 286 | conf = word_dict["conf"] 287 | 288 | no_confidence = 0 289 | very_low_confidence = 5 290 | low_confidence = 40 291 | short_word_len = 3 292 | long_word_len = 20 293 | if ( 294 | word_dict["left"] + word_dict["width"] < left_margin 295 | and conf < low_confidence 296 | ): 297 | # If a word has confidence below 40, a number that usually equates to 3 to 5 298 | # standard deviations from confidences found in other words is entirely in the 299 | # margin of the page - its likely an artifact as well. 
300 | word = " " * len(word) 301 | elif (conf == no_confidence and len(word) <= short_word_len) or word_dict[ 302 | "left" 303 | ] == 0: 304 | # If a word has a zero confidence or starts on the left most edge of the paper 305 | # we return it as an empty string. It is likely an artifact. 306 | word = " " * len(word) 307 | elif conf < very_low_confidence and ( 308 | len(word) <= short_word_len or len(word) > long_word_len 309 | ): 310 | # If a confidence is below 5 - for a very short word - or for a very long word 311 | # its likely part of the document but we have no idea so we return a square 312 | # box to indicate that. This is often caused by stamps or lines in case captions 313 | word = "□" * len(word) 314 | elif conf < low_confidence and word_dict["left"] > right_margin: 315 | # Finally if a low confidence word starts in the right margin - its likely a 316 | # bad OCR that is multiple standard deviations away so we return the word as 317 | # empty squares. 318 | word = "□" * len(word) 319 | 320 | return f"{word} " 321 | 322 | 323 | def cleanup_content(content: str, page_number: int) -> str: 324 | """Reduce legal document line clutter 325 | 326 | This function performs several operations to clean up the text extracted from legal documents: 327 | 328 | 1. On the first page, it smooths out vertical lines if they are detected. 329 | 2. It removes pipes ('|') that might start a line repeatedly. 330 | 3. It removes artifacts that appear at the end of a line of text, specifically single characters 331 | following at least 10 whitespace characters, reducing right margin edge artifacts. 332 | 4. It removes excess left margin whitespace to improve readability and formatting. 
333 | 334 | Example: 335 | If the pipes below represent the page edge (not characters): 336 | | we can remove the 337 | | the left whitespace 338 | | and shift this entire 339 | | page over four characters 340 | | which keeps formatting and 341 | | makes the text easier to 342 | | read and process with the API. 343 | 344 | :param content: the page content extracted 345 | :param page_number: the page number 346 | :return: the cleaned up text 347 | """ 348 | # remove floating pipes 349 | pattern = r"\s{4,}\| $" 350 | # Substitute the matched pipe with an empty string 351 | content = re.sub(pattern, "", content, flags=re.MULTILINE) 352 | 353 | # remove floating artifacts from the right side 354 | pattern = r"\s{10,}[a-zA-Z0-9|] $" 355 | content = re.sub(pattern, "", content, flags=re.MULTILINE) 356 | 357 | # shift text left if possible and remove excess start and end whitespace 358 | content = remove_excess_whitespace(content) 359 | if page_number == 1: 360 | content = adjust_caption_lines(content) 361 | 362 | return f"{content}\n" 363 | 364 | 365 | def remove_excess_whitespace(document: str) -> str: 366 | """Remove excess whitespace from OCR 367 | 368 | This function removes empty lines of text at the start and end of a document 369 | and shifts the page left if possible 370 | 371 | :param document: text of the document 372 | :return: Document with excess whitespace removed 373 | """ 374 | m = re.findall(r"(^ +)", document, re.MULTILINE) 375 | if m: 376 | shift_left = len(min(m)) 377 | pattern = f"(^ {{{shift_left}}})" 378 | document = re.sub(pattern, "", document, flags=re.MULTILINE) 379 | document = re.sub(r"^ +$", "", document, flags=re.MULTILINE) 380 | return document.strip("\n") 381 | -------------------------------------------------------------------------------- /doctor/lib/utils.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import io 3 | import logging 4 | import os 5 | import re 6 | import 
subprocess 7 | import warnings 8 | from collections import namedtuple 9 | from decimal import Decimal 10 | from pathlib import Path 11 | from typing import Any 12 | 13 | import six 14 | from PyPDF2 import PdfMerger 15 | from reportlab.pdfgen import canvas 16 | 17 | 18 | class DoctorUnicodeDecodeError(UnicodeDecodeError): 19 | def __init__(self, obj, *args): 20 | self.obj = obj 21 | UnicodeDecodeError.__init__(self, *args) 22 | 23 | def __str__(self): 24 | original = UnicodeDecodeError.__str__(self) 25 | return f"{original}. You passed in {self.obj!r} ({type(self.obj)})" 26 | 27 | 28 | def force_bytes(s, encoding="utf-8", strings_only=False, errors="strict"): 29 | """ 30 | Similar to smart_bytes, except that lazy instances are resolved to 31 | strings, rather than kept as lazy objects. 32 | 33 | If strings_only is True, don't convert (some) non-string-like objects. 34 | """ 35 | # Handle the common case first for performance reasons. 36 | if isinstance(s, bytes): 37 | if encoding == "utf-8": 38 | return s 39 | else: 40 | return s.decode("utf-8", errors).encode(encoding, errors) 41 | if strings_only and is_protected_type(s): 42 | return s 43 | if isinstance(s, six.memoryview): 44 | return bytes(s) 45 | if isinstance(s, Promise): 46 | return six.text_type(s).encode(encoding, errors) 47 | if not isinstance(s, six.string_types): 48 | try: 49 | if six.PY3: 50 | return six.text_type(s).encode(encoding) 51 | else: 52 | return bytes(s) 53 | except UnicodeEncodeError: 54 | if isinstance(s, Exception): 55 | # An Exception subclass containing non-ASCII data that doesn't 56 | # know how to print itself properly. We shouldn't raise a 57 | # further exception. 
58 | return b" ".join( 59 | force_bytes(arg, encoding, strings_only, errors) 60 | for arg in s 61 | ) 62 | return six.text_type(s).encode(encoding, errors) 63 | else: 64 | return s.encode(encoding, errors) 65 | 66 | 67 | def force_text(s, encoding="utf-8", strings_only=False, errors="strict"): 68 | """ 69 | Similar to smart_text, except that lazy instances are resolved to 70 | strings, rather than kept as lazy objects. 71 | 72 | If strings_only is True, don't convert (some) non-string-like objects. 73 | """ 74 | # Handle the common case first for performance reasons. 75 | if issubclass(type(s), six.text_type): 76 | return s 77 | if strings_only and is_protected_type(s): 78 | return s 79 | try: 80 | if not issubclass(type(s), six.string_types): 81 | if six.PY3: 82 | if isinstance(s, bytes): 83 | s = six.text_type(s, encoding, errors) 84 | else: 85 | s = six.text_type(s) 86 | elif hasattr(s, "__unicode__"): 87 | s = six.text_type(s) 88 | else: 89 | s = six.text_type(bytes(s), encoding, errors) 90 | else: 91 | # Note: We use .decode() here, instead of six.text_type(s, encoding, 92 | # errors), so that if s is a SafeBytes, it ends up being a 93 | # SafeText at the end. 94 | s = s.decode(encoding, errors) 95 | except UnicodeDecodeError as e: 96 | if not isinstance(s, Exception): 97 | raise DoctorUnicodeDecodeError(s, *e.args) 98 | else: 99 | # If we get to here, the caller has passed in an Exception 100 | # subclass populated with non-ASCII bytestring data without a 101 | # working unicode method. Try to handle this without raising a 102 | # further exception by individually forcing the exception args 103 | # to unicode. 104 | s = " ".join( 105 | force_text(arg, encoding, strings_only, errors) for arg in s 106 | ) 107 | return s 108 | 109 | 110 | def smart_text(s, encoding="utf-8", strings_only=False, errors="strict"): 111 | """ 112 | Returns a text object representing 's' -- unicode on Python 2 and str on 113 | Python 3. Treats bytestrings using the 'encoding' codec. 
114 | 115 | If strings_only is True, don't convert (some) non-string-like objects. 116 | """ 117 | if isinstance(s, Promise): 118 | # The input is the result of a gettext_lazy() call. 119 | return s 120 | return force_text(s, encoding, strings_only, errors) 121 | 122 | 123 | class Promise: 124 | """ 125 | This is just a base class for the proxy class created in 126 | the closure of the lazy function. It can be used to recognize 127 | promises in code. 128 | """ 129 | 130 | pass 131 | 132 | 133 | _PROTECTED_TYPES = six.integer_types + ( 134 | type(None), 135 | float, 136 | Decimal, 137 | datetime.datetime, 138 | datetime.date, 139 | datetime.time, 140 | ) 141 | 142 | 143 | def is_protected_type(obj): 144 | """Determine if the object instance is of a protected type. 145 | 146 | Objects of protected types are preserved as-is when passed to 147 | force_text(strings_only=True). 148 | """ 149 | return isinstance(obj, _PROTECTED_TYPES) 150 | 151 | 152 | def audio_encoder(data): 153 | return namedtuple("AudioFile", data.keys())(*data.values()) 154 | 155 | 156 | def ignore_warnings(test_func): 157 | def do_test(self, *args, **kwargs): 158 | with warnings.catch_warnings(): 159 | warnings.simplefilter("ignore", ResourceWarning) 160 | warnings.simplefilter("ignore", DeprecationWarning) 161 | test_func(self, *args, **kwargs) 162 | 163 | return do_test 164 | 165 | 166 | def make_png_thumbnail_for_instance(filepath, max_dimension): 167 | """Abstract function for making a thumbnail for a PDF 168 | 169 | See helper functions below for how to use this in a simple way. 
170 | 171 | :param filepath: The attr where the PDF is located on the item 172 | :param max_dimension: The longest you want any edge to be 173 | :param response: Flask response object 174 | """ 175 | command = [ 176 | "pdftoppm", 177 | "-singlefile", 178 | "-f", 179 | "1", 180 | "-scale-to", 181 | str(max_dimension), 182 | filepath, 183 | "-png", 184 | ] 185 | p = subprocess.Popen( 186 | command, close_fds=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE 187 | ) 188 | stdout, stderr = p.communicate() 189 | return stdout, stderr.decode("utf-8"), str(p.returncode) 190 | 191 | 192 | def make_png_thumbnails(filepath, max_dimension, pages, directory): 193 | """Abstract function for making a thumbnail for a PDF 194 | 195 | See helper functions below for how to use this in a simple way. 196 | 197 | :param filepath: The attr where the PDF is located on the item 198 | :param max_dimension: The longest you want any edge to be 199 | :param response: Flask response object 200 | """ 201 | for page in pages: 202 | command = [ 203 | "pdftoppm", 204 | "-singlefile", 205 | "-f", 206 | str(page), 207 | "-scale-to", 208 | str(max_dimension), 209 | filepath, 210 | "-png", 211 | f"{directory.name}/thumb-{page}", 212 | ] 213 | p = subprocess.Popen( 214 | command, 215 | close_fds=True, 216 | stdout=subprocess.PIPE, 217 | stderr=subprocess.PIPE, 218 | ) 219 | p.communicate() 220 | 221 | 222 | def pdf_bytes_from_image_array(image_list, output_path) -> None: 223 | """Make a pdf given an array of Image files 224 | 225 | :param image_list: List of images 226 | :type image_list: list 227 | :return: pdf_data 228 | :type pdf_data: PDF as bytes 229 | """ 230 | image_list[0].save( 231 | output_path, 232 | "PDF", 233 | resolution=100.0, 234 | save_all=True, 235 | append_images=image_list[1:], 236 | ) 237 | del image_list 238 | 239 | 240 | def strip_metadata_from_path(file_path): 241 | """Convert PDF file into PDF and remove metadata from it 242 | 243 | Stripping the metadata allows us to hash 
def strip_metadata_from_bytes(pdf_bytes):
    """Convert PDF bytes into PDF and remove metadata from it.

    Stripping the metadata allows us to hash otherwise-identical PDFs.

    :param pdf_bytes: PDF as binary content
    :return: PDF bytes with /CreationDate and /ModDate blanked.
    """
    pdf_merger = PdfMerger()
    pdf_merger.append(io.BytesIO(pdf_bytes))
    pdf_merger.add_metadata({"/CreationDate": "", "/ModDate": ""})
    byte_writer = io.BytesIO()
    pdf_merger.write(byte_writer)
    return force_bytes(byte_writer.getvalue())


def cleanup_form(form):
    """Remove the uploaded temp file recorded in a form's cleaned data.

    :param form: A validated form whose cleaned_data["fp"] is a file path.
    :return: None
    """
    os.remove(form.cleaned_data["fp"])


def make_file(filename, dir=None):
    """Load a test asset and return it as an upload-style mapping.

    Bug fix: the path previously ignored ``filename`` entirely, so every
    call read the same hard-coded file; the name is now interpolated.

    :param filename: Name of the file inside doctor/test_assets.
    :param dir: Unused; kept for backward compatibility.
    :return: Dict of {"file": (original filename, file bytes)}.
    """
    filepath = f"{Path.cwd()}/doctor/test_assets/{filename}"
    with open(filepath, "rb") as f:
        return {"file": (filename, f.read())}


def make_buffer(filename, dir=None):
    """Load a test asset with a generic "filename" upload name.

    Bug fix: the path previously ignored ``filename``; it is now
    interpolated. The literal "filename" string in the returned tuple is
    intentional — it simulates an upload with no meaningful name.

    :param filename: Name of the file inside doctor/test_assets.
    :param dir: Unused; kept for backward compatibility.
    :return: Dict of {"file": ("filename", file bytes)}.
    """
    filepath = f"{Path.cwd()}/doctor/test_assets/{filename}"
    with open(filepath, "rb") as f:
        return {"file": ("filename", f.read())}


def pdf_has_images(path: str) -> bool:
    """Check raw PDF for embedded images.

    We need to check if a PDF contains any images. If a PDF contains
    images it likely has content that needs to be scanned.

    :param path: Location of PDF to process.
    :return: Does the PDF contain images?
    """
    with open(path, "rb") as pdf_file:
        pdf_bytes = pdf_file.read()
    # A raw byte scan for the /Image marker; cheaper than parsing the PDF.
    return bool(re.search(rb"/Image ?", pdf_bytes))
298 | :type: bool 299 | """ 300 | with open(path, "rb") as pdf_file: 301 | pdf_bytes = pdf_file.read() 302 | return bool(re.search(rb"/Image ?", pdf_bytes)) 303 | 304 | 305 | def ocr_needed(path: str, content: str) -> bool: 306 | """Check if OCR is needed on a PDF 307 | 308 | Check if images are in PDF or content is empty. 309 | 310 | :param path: The path to the PDF 311 | :param content: The content extracted from the PDF. 312 | :return: Whether OCR should be run on the document. 313 | """ 314 | return content.strip() == "" or pdf_has_images(path) 315 | 316 | 317 | def make_page_with_text(page, data, h, w): 318 | """Make a page with text 319 | 320 | :param page: 321 | :param data: 322 | :param h: 323 | :param w: 324 | :return: 325 | """ 326 | packet = io.BytesIO() 327 | can = canvas.Canvas(packet, pagesize=(w, h)) 328 | # Set to a standard size and font for now. 329 | can.setFont("Helvetica", 9) 330 | # Make the text transparent 331 | can.setFillAlpha(0) 332 | for i in range(len(data["level"])): 333 | try: 334 | letter, (x, y, _, hh), pg = ( 335 | data["text"][i], 336 | ( 337 | data["left"][i], 338 | data["top"][i], 339 | data["width"][i], 340 | data["height"][i], 341 | ), 342 | data["page_num"][i], 343 | ) 344 | except Exception: 345 | continue 346 | # Adjust the text to an 8.5 by 11 inch page 347 | sub = ((11 * 72) / h) * int(hh) 348 | x = ((8.5 * 72) / w) * int(x) 349 | y = ((11 * 72) / h) * int(y) 350 | yy = (11 * 72) - y 351 | if int(page) == int(pg): 352 | can.drawString(x, yy - sub, letter) 353 | can.showPage() 354 | can.save() 355 | packet.seek(0) 356 | return packet 357 | 358 | 359 | def log_sentry_event( 360 | logger: logging.Logger, 361 | level: int, 362 | message: str, 363 | extra: dict[str, Any] | None = None, 364 | **kwargs: Any, 365 | ) -> None: 366 | """ 367 | Logs a message using a specified logger, level, message, and optional extra data. 368 | 369 | :param logger: The logger instance to use (e.g., logging.getLogger(__name__)). 
"""
Django settings for doctor project.

Generated by 'django-admin startproject' using Django 4.0.3.

For more information on this file, see
https://docs.djangoproject.com/en/4.0/topics/settings/

For the full list of settings and their values, see
https://docs.djangoproject.com/en/4.0/ref/settings/
"""

from pathlib import Path

import environ
import sentry_sdk
from sentry_sdk.integrations.django import DjangoIntegration

# FileAwareEnv also resolves FOO_FILE-style variables to file contents
# (useful for Docker secrets).
env = environ.FileAwareEnv()

BASE_DIR = Path(__file__).resolve().parent.parent
DEBUG = env.bool("DEBUG", default=False)
# NOTE(review): hard-coded key — presumably acceptable because this service
# defines no INSTALLED_APPS/sessions/auth, but confirm nothing signed with
# it is trusted elsewhere.
SECRET_KEY = "this-is-a-not-so-secret-key"
ALLOWED_HOSTS = ["doctor", "0.0.0.0", "localhost"]
INSTALLED_APPS = []
ROOT_URLCONF = "doctor.urls"
WSGI_APPLICATION = "doctor.wsgi.application"


# Sentry error reporting is enabled only when a DSN is provided via env.
SENTRY_DSN = env("SENTRY_DSN", default="")
if SENTRY_DSN:
    sentry_sdk.init(
        dsn=SENTRY_DSN,
        integrations=[
            DjangoIntegration(),
        ],
        # Ctrl-C is operator action, not an error worth reporting.
        ignore_errors=[KeyboardInterrupt],
    )
def strip_metadata_from_bytes(pdf_bytes):
    """Convert PDF bytes into PDF and remove metadata from it.

    Stripping the metadata allows us to hash otherwise-identical PDFs.

    NOTE(review): this duplicates doctor.lib.utils.strip_metadata_from_bytes;
    consider importing from there instead of maintaining both copies.

    :param pdf_bytes: PDF as binary content
    :return: PDF bytes with /CreationDate and /ModDate blanked.
    """
    pdf_merger = PdfMerger()
    pdf_merger.append(io.BytesIO(pdf_bytes))
    pdf_merger.add_metadata({"/CreationDate": "", "/ModDate": ""})
    byte_writer = io.BytesIO()
    pdf_merger.write(byte_writer)
    return force_bytes(byte_writer.getvalue())


def pdf_bytes_from_images(image_list: "list[Image]"):
    """Make a PDF from a list of PIL images.

    The annotation is quoted so it is not eagerly evaluated at import time
    (it references PIL's Image type).

    :param image_list: List of PIL images; the first anchors the save and
        the rest are appended as extra pages.
    :return: The generated PDF as bytes.
    """
    with io.BytesIO() as output:
        image_list[0].save(
            output,
            "PDF",
            resolution=100.0,
            save_all=True,
            append_images=image_list[1:],
        )
        pdf_data = output.getvalue()

    return pdf_data


def make_pdftotext_process(path):
    """Extract text from a PDF via the ``pdftotext`` CLI.

    :param path: File location
    :return: Tuple of (decoded UTF-8 text, stderr — always None since
        stderr is sent to DEVNULL, return code).
    """
    process = subprocess.Popen(
        ["pdftotext", "-layout", "-enc", "UTF-8", path, "-"],
        shell=False,
        stdout=subprocess.PIPE,
        stderr=subprocess.DEVNULL,
    )
    content, err = process.communicate()
    return content.decode(), err, process.returncode
def rasterize_pdf(path, destination):
    """Convert the PDF into a multipage Tiff file.

    Uses ghostscript for processing; borrows heavily from OCRmyPDF's
    ghostscript wrapper:
    https://github.com/jbarlow83/OCRmyPDF/blob/636d1903b35fed6b07a01af53769fea81f388b82/ocrmypdf/ghostscript.py#L11

    :param path: Location of the source PDF.
    :param destination: Where the TIFF output should be written.
    :return: Tuple of (stdout, stderr, return code) from ghostscript.
    """
    # gs docs, see: http://ghostscript.com/doc/7.07/Use.htm
    # gs devices, see: http://ghostscript.com/doc/current/Devices.htm
    #
    # LZW compression is a trade off: conversion takes twice as long, but
    # output is about 1-2% of the uncompressed size and Tesseract uses
    # about 30% of the RAM when processing it. See:
    # https://github.com/tesseract-ocr/tesseract/issues/431#issuecomment-250549208
    gs_command = [
        "gs",
        "-dQUIET",  # Suppress printing routine info
        "-dSAFER",  # Lock down the filesystem to only files on command line
        "-dBATCH",  # Exit after finishing file; don't wait for more commands
        "-dNOPAUSE",  # Don't pause after each page
        "-sDEVICE=tiffgray",
        "-sCompression=lzw",
        "-r300x300",  # 300 DPI
        "-o",
        destination,
        path,
    ]

    proc = subprocess.Popen(
        gs_command,
        close_fds=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True,
    )
    out, err = proc.communicate()
    return out, err, proc.returncode
def get_xray(path):
    """Get bad redactions from a PDF.

    Dead-code fix: the original listed specific exception types and then
    repeated the identical handler for ``Exception``, making the narrow
    clause (and the trailing "not reached" comment) redundant; both are
    collapsed into one handler with the same observable behavior.

    :param path: A path to the file
    :return: Dictionary of bounding boxes, or {"error": True, ...} when
        inspection fails for any reason.
    """
    try:
        return xray.inspect(path)
    except Exception:
        return {"error": True, "msg": "Exception"}


def get_page_count(path, extension):
    """Get the number of pages, if appropriate mimetype.

    :param path: A path to a binary (pdf, wpd, doc, txt, html, etc.)
    :param extension: The extension of the binary.
    :return: The number of pages if possible; 0 for unreadable PDFs; else
        None.
    """
    if extension == "pdf":
        try:
            reader = PdfReader(path)
            return len(reader.pages)
        except (
            OSError,
            ValueError,
            TypeError,
            KeyError,
            AssertionError,
            PdfReadError,
        ):
            # IOError: File doesn't exist. My bad.
            # ValueError: Didn't get an int for the page count. Their bad.
            # TypeError: NumberObject has no attribute '__getitem__'. Ugh.
            # KeyError, AssertionError: assert xrefstream["/Type"] == "/XRef". WTF?
            # PdfReadError: Something else. I have no words.
            return 0

    elif extension == "wpd":
        # Best solution appears to be to dig into the binary format
        pass
    elif extension == "doc":
        # Best solution appears to be to dig into the XML of the file
        # itself: http://stackoverflow.com/a/12972502/64911
        pass
    return None
def extract_from_pdf(
    path: str,
    original_filename: str,
    ocr_available: bool = False,
) -> Any:
    """Extract text from PDFs.

    Start with pdftotext. If OCR is enabled and the content is empty or
    the PDF contains images, use tesseract. This pattern occurs because
    PDFs can be image-based, text-based, or a mix of the two; checking
    for images makes sure we OCR mixed-type PDFs.

    For text-based PDFs we fix corrupt (mojibake) extractions from ca9.

    :param path: The path to the PDF
    :param original_filename: The original file name of the PDF file
        (unused in the body; kept for interface compatibility).
    :param ocr_available: Whether we should do OCR stuff
    :return: Tuple of (content, error or None, return code, whether OCR
        produced the content).
    """
    content, err, returncode = make_pdftotext_process(path)
    extracted_by_ocr = False
    if err is not None:
        err = err.decode()

    if not ocr_available:
        if "e" not in content:
            # No "e" anywhere in the text is the signal used here for
            # ca9's corrupt encoding. Fix it.
            content = fix_mojibake(content)
    else:
        if ocr_needed(path, content):
            success, ocr_content = extract_by_ocr(path)
            if success:
                # Check content length and take the longer of the two.
                if len(ocr_content) > len(content):
                    content = ocr_content
                extracted_by_ocr = True
            elif content == "" or not success:
                content = "Unable to extract document content."

    return content, err, returncode, extracted_by_ocr
def extract_by_ocr(path: str) -> tuple[bool, str]:
    """Extract the contents of a PDF using OCR.

    Annotation fix: ``-> (bool, str)`` was not a valid type expression
    (just a tuple literal); it is now ``tuple[bool, str]``, matching the
    syntax already used elsewhere in this file.

    :param path: Location of the PDF to OCR.
    :return: Tuple of (success flag, extracted text or failure message).
    """
    fail_msg = (
        "Unable to extract the content from this file. Please try "
        "reading the original."
    )
    with NamedTemporaryFile(prefix="ocr_", suffix=".tiff", buffering=0) as tmp:
        out, err, returncode = rasterize_pdf(path, tmp.name)
        if returncode != 0:
            return False, fail_msg

        txt = convert_file_to_txt(tmp.name)
        txt = cleanup_ocr_text(txt)

    return True, txt


def cleanup_ocr_text(txt: str) -> str:
    """Do some basic cleanup to make OCR text better.

    Err on the side of safety. Don't make fixes that could cause other issues.

    :param txt: The txt output from the OCR engine.
    :return: Txt output, cleaned up.
    """
    simple_replacements = (
        ("Fi|ed", "Filed"),
        (" Il ", " II "),
    )
    for bad, good in simple_replacements:
        txt = txt.replace(bad, good)
    return txt


def convert_file_to_txt(path: str) -> str:
    """Run tesseract over an image file and return the recognized text.

    :param path: Location of the image (e.g. a rasterized TIFF).
    :return: Decoded text from tesseract's stdout.
    """
    tesseract_command = [
        "tesseract",
        path,
        "stdout",
        "-l",
        "eng",
        "-c",
        "tessedit_do_invert=0",  # Assume a white background for speed
    ]
    p = subprocess.Popen(
        tesseract_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
    )
    return p.communicate()[0].decode()


def convert_tiff_to_pdf_bytes(single_tiff_image: "Image") -> "ByteString":
    """Split a long TIFF into page-sized images and bundle them as a PDF.

    Annotations are quoted so PIL's Image type is not evaluated eagerly.

    :param single_tiff_image: One long tiff file
    :return: PDF Bytes
    """
    width, height = single_tiff_image.size
    image_list = []
    # 1046/792 ≈ the page aspect ratio used here — presumably US-letter
    # plus padding (11/8.5 would be 1056/792); confirm before changing.
    i, page_width, page_height = 0, width, (1046 * (float(width) / 792))
    while i < (height / page_height):
        single_page = single_tiff_image.crop(
            (0, (i * page_height), page_width, (i + 1) * page_height)
        )
        image_list.append(single_page)
        i += 1

    return pdf_bytes_from_images(image_list)
def extract_from_doc(path):
    """Extract text from MS Doc files.

    Uses the antiword CLI to pull the text out.

    :param path: Location of the .doc file.
    :return: Tuple of (decoded text, stderr, return code).
    """
    proc = subprocess.Popen(
        ["antiword", path, "-i", "1"],
        shell=False,
        stdout=subprocess.PIPE,
        stderr=subprocess.DEVNULL,
    )
    out, err = proc.communicate()
    return out.decode("utf-8"), err, proc.returncode


def extract_from_docx(path):
    """Extract text from docx files.

    Uses the docx2txt CLI to pull out the text. Pretty simple.

    :param path: Location of the .docx file.
    :return: Tuple of (decoded text, stderr, return code).
    """
    proc = subprocess.Popen(
        ["docx2txt", path, "-"],
        shell=False,
        stdout=subprocess.PIPE,
        stderr=subprocess.DEVNULL,
    )
    out, err = proc.communicate()
    return out.decode("utf-8"), err, proc.returncode


def extract_from_html(path: str) -> tuple[str, str, int]:
    """Extract from an HTML file by attempting various encodings.

    A simple wrapper to go get content, and send it along.

    :param path: The file path to the HTML file.
    :return: A tuple containing:
        - The extracted and cleaned text content (str), or "" on failure.
        - An error message (str), or "" on success.
        - A return code (int): 0 on success, 1 on failure.
    """
    for encoding in ("utf-8", "ISO8859", "cp1252", "latin-1"):
        try:
            with open(path, encoding=encoding) as f:
                content = f.read()
            content = get_clean_body_content(content)
            content = force_text(content, encoding=encoding)
        except (UnicodeDecodeError, DoctorUnicodeDecodeError):
            continue
        return content, "", 0
    # Every encoding failed; report the failure.
    return "", "Could not encode content properly", 1
def get_clean_body_content(content: str) -> str:
    """Parse out the body from an html string, clean it up, and send it along.

    :param content: The HTML content as a string
    :return: The cleaned HTML body content as a string
    """
    return Cleaner(
        style=True, remove_tags=["a", "body", "font", "noscript", "img"]
    ).clean_html(content)


def extract_from_txt(filepath):
    """Extract text from plain text files: a fool's errand.

    Plain text files lack encoding information, so we have to guess. Most
    txt files we encounter were produced by converting wpd or doc files
    to txt on a Microsoft box, so cp1252 is the first guess; utf-8
    (ignoring errors) is the fallback, and failing both we let libmagic
    sniff the encoding from the raw bytes.

    May we hope for a better world.

    :param filepath: Location of the text file.
    :return: Tuple of (content, error message or None, 0 or 1).
    """
    err, error_code = None, 0
    try:
        with open(filepath) as f:
            raw = f.read()
        try:
            # Alas, cp1252 is probably still more popular than utf-8.
            content = smart_text(raw, encoding="cp1252")
        except DoctorUnicodeDecodeError:
            content = smart_text(raw, encoding="utf-8", errors="ignore")
    except Exception:
        try:
            with open(filepath, "rb") as f:
                blob = f.read()
            sniffed = magic.Magic(mime_encoding=True).from_buffer(blob)
            with open(filepath, encoding=sniffed) as f:
                raw = f.read()
            content = smart_text(raw, encoding=sniffed, errors="ignore")
        except Exception:
            err = "An error occurred extracting txt file."
            content = ""
            error_code = 1
    return content, err, error_code
def extract_from_wpd(path: str) -> tuple[str, bytes, int]:
    """Extract text from a Word Perfect file.

    Yes, courts still use these, so we extract their text using wpd2html.
    Once that's done, we pull out the body of the HTML, and do some minor
    cleanup on it.

    :param path: The file path to the Word Perfect (.wpd) file.
    :return: A tuple containing:
        - The extracted and cleaned text content (str)
        - The standard error output from the wpd2html subprocess (bytes)
        - The return code of the wpd2html subprocess (int)
    """
    process = subprocess.Popen(
        ["wpd2html", path],
        shell=False,
        stdout=subprocess.PIPE,
        stderr=subprocess.DEVNULL,
    )
    content_bytes, err = process.communicate()
    content_str = content_bytes.decode("utf-8")
    content = get_clean_body_content(content_str)

    return content, err, process.returncode


def download_images(sorted_urls) -> list:
    """Download images concurrently and return their raw bytes.

    Fixes: the original used the deprecated get_event_loop()/
    run_until_complete pattern (and fetched the loop twice); asyncio.run
    creates and tears down the loop correctly. Its docstring also claimed
    a list of PIL images is returned — it is actually the raw response
    bodies (bytes); callers convert as needed.

    :param sorted_urls: List of sorted URLs for split financial disclosure
    :return: List of downloaded bytes, in input order.
    """

    async def fetch_all(urls):
        loop = asyncio.get_running_loop()
        # requests is blocking, so each GET runs in the default executor.
        futures = [
            loop.run_in_executor(None, requests.get, url) for url in urls
        ]
        responses = await asyncio.gather(*futures)
        return [response.content for response in responses]

    return asyncio.run(fetch_all(sorted_urls))
def convert_to_mp3(output_path: AnyStr, media: Any) -> AnyStr:
    """Convert audio piped via stdin to mp3 at ``output_path`` with ffmpeg.

    Fixes: the return annotation said None although the path is returned,
    and the two parameter descriptions were swapped.

    :param output_path: Temporary filepath for ffmpeg's mp3 output.
    :param media: File-like object whose bytes are fed to ffmpeg's stdin.
    :return: ``output_path``, once the conversion has finished.
    """
    av_command = [
        "ffmpeg",
        "-i",
        "/dev/stdin",
        "-ar",
        "22050",
        "-ab",
        "48k",
        "-f",
        "mp3",
        output_path,
    ]

    ffmpeg_cmd = subprocess.Popen(
        av_command, stdin=subprocess.PIPE, stdout=subprocess.PIPE, shell=False
    )
    ffmpeg_cmd.communicate(media.read())
    return output_path


def convert_to_ogg(output_path: AnyStr, media: Any) -> AnyStr:
    """Convert audio data to the ogg format (.ogg).

    Uses ffmpeg to convert the audio data provided in ``media`` to ogg
    with the following specifications:

    * Single audio channel (``-ac 1``)
    * 8 kbps audio bitrate (``-b:a 8k``)
    * Optimized for voice over IP applications (``-application voip``)

    (Fixes: ``-b:a`` is a bitrate, not a "sampling rate" as previously
    documented; return annotation corrected as in convert_to_mp3.)

    :param output_path: Temporary filepath for ffmpeg's ogg output.
    :param media: File-like object whose bytes are fed to ffmpeg's stdin.
    :return: ``output_path``, once the conversion has finished.
    """
    av_command = [
        "ffmpeg",
        "-i",
        "/dev/stdin",
        "-vn",
        "-map_metadata",
        "-1",
        "-ac",
        "1",
        "-c:a",
        "libopus",
        "-b:a",
        "8k",
        "-application",
        "voip",
        "-f",
        "ogg",
        output_path,
    ]

    ffmpeg_cmd = subprocess.Popen(
        av_command, stdin=subprocess.PIPE, stdout=subprocess.PIPE, shell=False
    )
    ffmpeg_cmd.communicate(media.read())
    return output_path
def set_mp3_meta_data(
    audio_data: dict, mp3_path: AnyStr
) -> eyed3.core.AudioFile:
    """Set the metadata in audio_data to an mp3 at path.

    Reads court/case fields from ``audio_data`` (title, album, artist,
    URLs, date argued, docket number) and embeds them, plus cover images,
    into the mp3's ID3 tag.

    :param audio_data: The new metadata to embed in the mp3.
    :param mp3_path: The path to the mp3 to be converted.
    :return: Eyed3 audio file object
    """

    # Load the file, delete the old tags and create a new one.
    audio_file = eyed3.load(mp3_path)
    # Undocumented API from eyed3.plugins.classic.ClassicPlugin#handleRemoves
    id3.Tag.remove(
        audio_file.tag.file_info.name,
        id3.ID3_ANY_VERSION,
        preserve_file_time=False,
    )
    audio_file.initTag()
    audio_file.tag.title = best_case_name(audio_data)
    date_argued = audio_data["date_argued"]
    docket_number = audio_data["docket_number"]
    audio_file.tag.album = (
        f"{audio_data['court_full_name']}, {audio_data['date_argued_year']}"
    )
    audio_file.tag.artist = audio_data["court_full_name"]
    audio_file.tag.artist_url = audio_data["court_url"]
    audio_file.tag.audio_source_url = audio_data["download_url"]

    audio_file.tag.comments.set(
        f"Argued: {date_argued}. Docket number: {docket_number}"
    )
    audio_file.tag.genre = "Speech"
    audio_file.tag.publisher = "Free Law Project"
    audio_file.tag.publisher_url = "https://free.law"
    audio_file.tag.recording_date = date_argued

    # Add images to the mp3. If it has a seal, use that for the Front Cover
    # and use the FLP logo for the Publisher Logo. If it lacks a seal, use the
    # Publisher logo for both the front cover and the Publisher logo.
    url = seal(court=audio_data["court_pk"], size=ImageSizes.MEDIUM)

    flp_image_frames = [
        3,  # "Front Cover". Complete list at eyed3/id3/frames.py
        14,  # "Publisher logo".
    ]

    if url:
        # A seal exists: fetch it and attach it as the Front Cover (3)...
        seal_content = requests.get(url, timeout=30).content
        audio_file.tag.images.set(
            3,
            seal_content,
            "image/png",
            f"Seal for {audio_data['court_short_name']}",
        )
        # ...then drop 3 so the loop below only fills the remaining frame.
        flp_image_frames.remove(3)

    for frame in flp_image_frames:
        cover_art_fp = os.path.join(assets_dir, "producer-300x300.png")
        with open(cover_art_fp, "rb") as cover_art:
            audio_file.tag.images.set(
                frame,
                cover_art.read(),
                "image/png",
                "Created for the public domain by Free Law Project",
            )

    audio_file.tag.save()
    return audio_file
def convert_to_base64(tmp_path: AnyStr) -> AnyStr:
    """Read a file and return its contents base64-encoded as a string.

    This allows us to safely return the file in json to CL.

    :param tmp_path: Path of the file to encode.
    :return: File contents encoded in base64, as a string.
    """
    with open(tmp_path, "rb") as fp:
        encoded = base64.b64encode(fp.read())
    return encoded.decode()


def best_case_name(audio_dict: dict) -> AnyStr:
    """Take an object and return the highest quality case name possible.

    In general, this means returning the fields in an order like:

    - case_name
    - case_name_full
    - case_name_short

    :param audio_dict: Mapping that may carry any of the three name keys.
    :return: The first truthy name in preference order, else "".
    """
    for key in ("case_name", "case_name_full"):
        if audio_dict.get(key):
            return audio_dict[key]
    return audio_dict.get("case_name_short", "")
def get_header_stamp(obj: dict) -> bool:
    """pdfplumber filter to extract the PDF header stamp.

    (Docstring typos fixed: "juridictions", "True if the found it".)

    :param obj: The page object to evaluate.
    :return: True if the object belongs to the header stamp, otherwise False.
    """
    # This option works for most jurisdictions except for ca5.
    if "LiberationSans" in obj.get("fontname", ""):
        return True
    # Exception for ca5: keep anything near the top of the page.
    return obj["y0"] > 750


def clean_document_number(document_number: str) -> str:
    """Remove '#' plus leading and trailing whitespace from a document number.

    :param document_number: The document number to clean
    :return: The cleaned document number.
    """
    return document_number.strip().replace("#", "")


def get_document_number_from_pdf(path: str) -> str:
    """Get PACER document number from PDF.

    :param path: The path to the PDF
    :return: The PACER document number, or "" when no match is found.
    """
    with pdfplumber.open(path) as f:
        header_stamp = f.pages[0].filter(get_header_stamp).extract_text()

    # Regex alternatives to extract the document number from the stamp.
    regex = r"Document:(.[0-9.\-.\#]+)|Document(.[0-9.\-.\#]+)|Doc:(.[0-9.\-.\#]+)|DktEntry:(.[0-9.\-.\#]+)"
    document_number_matches = re.findall(regex, header_stamp)

    # If no matches, return an empty string.
    if not document_number_matches:
        return ""
    # findall yields one tuple per match with "" for the alternatives
    # that did not fire; keep the populated group.
    document_number = [dn for dn in document_number_matches[0] if dn]
    return clean_document_number(document_number[0])
def extract_recap_pdf(
    filepath: str,
    strip_margin: bool = False,
) -> tuple[str, bool]:
    """Extract text from a RECAP PDF.

    Pages that already carry a text layer are read directly; pages that
    need OCR are run through the OCR helper instead.

    :param filepath: The path to the PDF
    :param strip_margin: Whether to remove 1 inch margin from text extraction
    :return: A tuple containing the text and a boolean indicating ocr usage
    """
    pieces = []
    used_ocr = False
    with pdfplumber.open(filepath) as pdf:
        for page in pdf.pages:
            text = get_page_text(page, strip_margin=strip_margin)
            if page_needs_ocr(page, text):
                used_ocr = True
                text = extract_with_ocr(page, strip_margin=strip_margin)
            pieces.append(f"\n{text}")
    content = remove_excess_whitespace("".join(pieces))
    return content, used_ocr
-------------------------------------------------------------------------------- /doctor/test_assets/ander_v._leo.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freelawproject/doctor/94c2ec669d033b0f54b6477b0e07882189e6e474/doctor/test_assets/ander_v._leo.mp3 -------------------------------------------------------------------------------- /doctor/test_assets/broken-mime.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freelawproject/doctor/94c2ec669d033b0f54b6477b0e07882189e6e474/doctor/test_assets/broken-mime.pdf -------------------------------------------------------------------------------- /doctor/test_assets/empty.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freelawproject/doctor/94c2ec669d033b0f54b6477b0e07882189e6e474/doctor/test_assets/empty.pdf -------------------------------------------------------------------------------- /doctor/test_assets/image-pdf-2-thumbnail.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freelawproject/doctor/94c2ec669d033b0f54b6477b0e07882189e6e474/doctor/test_assets/image-pdf-2-thumbnail.png -------------------------------------------------------------------------------- /doctor/test_assets/image-pdf-2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freelawproject/doctor/94c2ec669d033b0f54b6477b0e07882189e6e474/doctor/test_assets/image-pdf-2.pdf -------------------------------------------------------------------------------- /doctor/test_assets/image-pdf-thumbnail.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/freelawproject/doctor/94c2ec669d033b0f54b6477b0e07882189e6e474/doctor/test_assets/image-pdf-thumbnail.png -------------------------------------------------------------------------------- /doctor/test_assets/image-pdf.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freelawproject/doctor/94c2ec669d033b0f54b6477b0e07882189e6e474/doctor/test_assets/image-pdf.pdf -------------------------------------------------------------------------------- /doctor/test_assets/long-image.tiff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freelawproject/doctor/94c2ec669d033b0f54b6477b0e07882189e6e474/doctor/test_assets/long-image.tiff -------------------------------------------------------------------------------- /doctor/test_assets/missouri.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freelawproject/doctor/94c2ec669d033b0f54b6477b0e07882189e6e474/doctor/test_assets/missouri.pdf -------------------------------------------------------------------------------- /doctor/test_assets/ocr_pdf_variation.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freelawproject/doctor/94c2ec669d033b0f54b6477b0e07882189e6e474/doctor/test_assets/ocr_pdf_variation.pdf -------------------------------------------------------------------------------- /doctor/test_assets/recap_documents/ca10_010110462922.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freelawproject/doctor/94c2ec669d033b0f54b6477b0e07882189e6e474/doctor/test_assets/recap_documents/ca10_010110462922.pdf -------------------------------------------------------------------------------- /doctor/test_assets/recap_documents/ca1_00117684624.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/freelawproject/doctor/94c2ec669d033b0f54b6477b0e07882189e6e474/doctor/test_assets/recap_documents/ca1_00117684624.pdf -------------------------------------------------------------------------------- /doctor/test_assets/recap_documents/ca2_1-1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freelawproject/doctor/94c2ec669d033b0f54b6477b0e07882189e6e474/doctor/test_assets/recap_documents/ca2_1-1.pdf -------------------------------------------------------------------------------- /doctor/test_assets/recap_documents/ca3_003112692106.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freelawproject/doctor/94c2ec669d033b0f54b6477b0e07882189e6e474/doctor/test_assets/recap_documents/ca3_003112692106.pdf -------------------------------------------------------------------------------- /doctor/test_assets/recap_documents/ca4_17.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freelawproject/doctor/94c2ec669d033b0f54b6477b0e07882189e6e474/doctor/test_assets/recap_documents/ca4_17.pdf -------------------------------------------------------------------------------- /doctor/test_assets/recap_documents/ca5_00516242060.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freelawproject/doctor/94c2ec669d033b0f54b6477b0e07882189e6e474/doctor/test_assets/recap_documents/ca5_00516242060.pdf -------------------------------------------------------------------------------- /doctor/test_assets/recap_documents/ca6_1-3.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/freelawproject/doctor/94c2ec669d033b0f54b6477b0e07882189e6e474/doctor/test_assets/recap_documents/ca6_1-3.pdf -------------------------------------------------------------------------------- /doctor/test_assets/recap_documents/ca7_3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freelawproject/doctor/94c2ec669d033b0f54b6477b0e07882189e6e474/doctor/test_assets/recap_documents/ca7_3.pdf -------------------------------------------------------------------------------- /doctor/test_assets/recap_documents/ca8_.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freelawproject/doctor/94c2ec669d033b0f54b6477b0e07882189e6e474/doctor/test_assets/recap_documents/ca8_.pdf -------------------------------------------------------------------------------- /doctor/test_assets/recap_documents/ca9_19.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freelawproject/doctor/94c2ec669d033b0f54b6477b0e07882189e6e474/doctor/test_assets/recap_documents/ca9_19.pdf -------------------------------------------------------------------------------- /doctor/test_assets/recap_documents/cafc_3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freelawproject/doctor/94c2ec669d033b0f54b6477b0e07882189e6e474/doctor/test_assets/recap_documents/cafc_3.pdf -------------------------------------------------------------------------------- /doctor/test_assets/recap_extract/gov.uscourts.azd.1085839.3.0.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freelawproject/doctor/94c2ec669d033b0f54b6477b0e07882189e6e474/doctor/test_assets/recap_extract/gov.uscourts.azd.1085839.3.0.pdf 
-------------------------------------------------------------------------------- /doctor/test_assets/recap_extract/gov.uscourts.cacd.652774.40.0.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freelawproject/doctor/94c2ec669d033b0f54b6477b0e07882189e6e474/doctor/test_assets/recap_extract/gov.uscourts.cacd.652774.40.0.pdf -------------------------------------------------------------------------------- /doctor/test_assets/recap_extract/gov.uscourts.cand.203070.27.0.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freelawproject/doctor/94c2ec669d033b0f54b6477b0e07882189e6e474/doctor/test_assets/recap_extract/gov.uscourts.cand.203070.27.0.pdf -------------------------------------------------------------------------------- /doctor/test_assets/vector-pdf.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freelawproject/doctor/94c2ec669d033b0f54b6477b0e07882189e6e474/doctor/test_assets/vector-pdf.pdf -------------------------------------------------------------------------------- /doctor/test_assets/word-doc.doc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freelawproject/doctor/94c2ec669d033b0f54b6477b0e07882189e6e474/doctor/test_assets/word-doc.doc -------------------------------------------------------------------------------- /doctor/test_assets/word-docx.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freelawproject/doctor/94c2ec669d033b0f54b6477b0e07882189e6e474/doctor/test_assets/word-docx.docx -------------------------------------------------------------------------------- /doctor/test_assets/word-perfect.wpd: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/freelawproject/doctor/94c2ec669d033b0f54b6477b0e07882189e6e474/doctor/test_assets/word-perfect.wpd -------------------------------------------------------------------------------- /doctor/test_assets/x-ray/rectangles_no.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freelawproject/doctor/94c2ec669d033b0f54b6477b0e07882189e6e474/doctor/test_assets/x-ray/rectangles_no.pdf -------------------------------------------------------------------------------- /doctor/test_assets/x-ray/rectangles_yes.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freelawproject/doctor/94c2ec669d033b0f54b6477b0e07882189e6e474/doctor/test_assets/x-ray/rectangles_yes.pdf -------------------------------------------------------------------------------- /doctor/test_assets/x-ray/rectangles_yes_2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freelawproject/doctor/94c2ec669d033b0f54b6477b0e07882189e6e474/doctor/test_assets/x-ray/rectangles_yes_2.pdf -------------------------------------------------------------------------------- /doctor/tests.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import json 3 | import os 4 | import re 5 | import unittest 6 | from pathlib import Path 7 | from tempfile import NamedTemporaryFile 8 | from unittest.mock import patch 9 | from zipfile import ZipFile 10 | 11 | import eyed3 12 | import requests 13 | 14 | from doctor.lib.text_extraction import ( 15 | adjust_caption_lines, 16 | cleanup_content, 17 | get_word, 18 | insert_whitespace, 19 | remove_excess_whitespace, 20 | ) 21 | from doctor.lib.utils import make_buffer, make_file 22 | 23 | asset_path = f"{Path.cwd()}/doctor/test_assets" 24 | 25 | 26 | class HeartbeatTests(unittest.TestCase): 27 | def 
test_heartbeat(self): 28 | """Can we curl the heartbeat endpoint?""" 29 | response = requests.get("http://doctor:5050/") 30 | self.assertEqual( 31 | response.text, "Heartbeat detected.", msg="Heartbeat failed" 32 | ) 33 | 34 | 35 | class RECAPExtractionTests(unittest.TestCase): 36 | def test_recap_extraction(self): 37 | """Can we extract from the new recap text endpoint""" 38 | files = make_file( 39 | filename="recap_extract/gov.uscourts.cand.203070.27.0.pdf" 40 | ) 41 | params = {"strip_margin": False} 42 | response = requests.post( 43 | "http://doctor:5050/extract/recap/text/", 44 | files=files, 45 | params=params, 46 | ) 47 | first_line = response.json()["content"].splitlines()[0].strip() 48 | self.assertEqual(200, response.status_code, msg="Wrong status code") 49 | self.assertTrue( 50 | response.json()["extracted_by_ocr"], msg="Not extracted correctly" 51 | ) 52 | self.assertEqual( 53 | "aséakOS- 08-0220 A25BA BAD Gooonene 2627 Filed! OL/2B/DE0IP ageahefi2of 2", 54 | first_line, 55 | msg="Wrong Text", 56 | ) 57 | 58 | def test_recap_extraction_with_strip_margin(self): 59 | """Can we extract from the new recap text endpoint with strip margin?""" 60 | files = make_file( 61 | filename="recap_extract/gov.uscourts.cand.203070.27.0.pdf" 62 | ) 63 | params = {"strip_margin": True} 64 | response = requests.post( 65 | "http://doctor:5050/extract/recap/text/", 66 | files=files, 67 | params=params, 68 | ) 69 | first_line = response.json()["content"].splitlines()[0].strip() 70 | self.assertEqual(200, response.status_code, msg="Wrong status code") 71 | self.assertEqual( 72 | "1 || DONALD W. CARLSON [Bar No. 
79258]", 73 | first_line, 74 | msg="Wrong Text", 75 | ) 76 | 77 | def test_recap_strip_marign_with_multiple_shaped_pdfs(self): 78 | """Can we extract atypical shape pdf with strip margin?""" 79 | 80 | files = make_file( 81 | filename="recap_extract/gov.uscourts.azd.1085839.3.0.pdf" 82 | ) 83 | params = {"strip_margin": True} 84 | response = requests.post( 85 | "http://doctor:5050/extract/recap/text/", 86 | files=files, 87 | params=params, 88 | ) 89 | first_line = response.json()["content"].splitlines()[0].strip() 90 | self.assertEqual(200, response.status_code, msg="Wrong status code") 91 | self.assertEqual( 92 | "1 WO", 93 | first_line, 94 | msg="Wrong Text", 95 | ) 96 | 97 | def test_strip_margin_without_ocr(self): 98 | """Can we extract from the new recap text endpoint with strip margin?""" 99 | files = make_file( 100 | filename="recap_extract/gov.uscourts.cacd.652774.40.0.pdf" 101 | ) 102 | params = {"strip_margin": True} 103 | response = requests.post( 104 | "http://doctor:5050/extract/recap/text/", 105 | files=files, 106 | params=params, 107 | ) 108 | first_line = response.json()["content"].splitlines()[0].strip() 109 | self.assertEqual(200, response.status_code, msg="Wrong status code") 110 | self.assertEqual("1", first_line, msg="Wrong Text") 111 | 112 | 113 | class ExtractionTests(unittest.TestCase): 114 | def test_pdf_to_text(self): 115 | """""" 116 | files = make_file(filename="vector-pdf.pdf") 117 | data = {"ocr_available": True} 118 | response = requests.post( 119 | "http://doctor:5050/extract/doc/text/", files=files, data=data 120 | ) 121 | text = response.json()["content"][:100].replace("\n", "").strip() 122 | self.assertEqual(200, response.status_code, msg="Wrong status code") 123 | self.assertEqual( 124 | text, 125 | "(Slip Opinion) OCTOBER TERM, 2012 1", 126 | msg=text, 127 | ) 128 | 129 | def test_content_extraction(self): 130 | """""" 131 | files = make_file(filename="vector-pdf.pdf") 132 | data = {"ocr_available": False} 133 | response = 
requests.post( 134 | "http://doctor:5050/extract/doc/text/", files=files, data=data 135 | ) 136 | self.assertTrue(response.ok, msg="Content extraction failed") 137 | self.assertEqual( 138 | response.json()["content"][:100].replace("\n", "").strip(), 139 | "(Slip Opinion) OCTOBER TERM, 2012 1", 140 | msg="Failed to extract content from .pdf file", 141 | ) 142 | self.assertFalse( 143 | response.json()["extracted_by_ocr"], 144 | msg="Failed to extract by OCR", 145 | ) 146 | self.assertEqual( 147 | response.json()["page_count"], 148 | 30, 149 | msg="Failed to extract by OCR", 150 | ) 151 | 152 | def test_pdf_ocr_extraction(self): 153 | files = make_file(filename="image-pdf.pdf") 154 | params = {"ocr_available": True} 155 | response = requests.post( 156 | "http://doctor:5050/extract/doc/text/", 157 | files=files, 158 | params=params, 159 | ) 160 | self.assertTrue(response.ok, msg="Content extraction failed") 161 | content = response.json()["content"][:100].replace("\n", "").strip() 162 | self.assertEqual( 163 | content, 164 | "(Slip Opinion) OCTOBER TERM, 2012 1SyllabusNOTE: Where it is feasible, a syllabus (headnote) wil", 165 | msg="Failed to extract content from image .pdf file", 166 | ) 167 | self.assertTrue( 168 | response.json()["extracted_by_ocr"], 169 | msg="Failed to extract by OCR", 170 | ) 171 | 172 | def test_pdf_v2_ocr_extraction(self): 173 | files = make_file(filename="ocr_pdf_variation.pdf") 174 | params = {"ocr_available": True} 175 | response = requests.post( 176 | "http://doctor:5050/extract/doc/text/", 177 | files=files, 178 | params=params, 179 | ) 180 | self.assertTrue(response.ok, msg="Content extraction failed") 181 | content = response.json()["content"][:100].replace("\n", "").strip() 182 | self.assertIn( 183 | "UNITED", 184 | content, 185 | msg="Failed to extract content from ocr_pdf_variation .pdf file", 186 | ) 187 | self.assertTrue( 188 | response.json()["extracted_by_ocr"], 189 | msg="Failed to extract by OCR", 190 | ) 191 | 192 | def 
test_docx_format(self): 193 | files = make_file(filename="word-docx.docx") 194 | params = {"ocr_available": False} 195 | response = requests.post( 196 | "http://doctor:5050/extract/doc/text/", 197 | files=files, 198 | params=params, 199 | ) 200 | self.assertTrue(response.ok, msg="Content extraction failed") 201 | self.assertEqual( 202 | response.json()["content"][:200].replace("\n", "").strip(), 203 | "ex- Cpl, Current Discharge and Applicant's RequestApplication R", 204 | msg="Failed to extract content from .docx file", 205 | ) 206 | 207 | def test_doc_format(self): 208 | files = make_file(filename="word-doc.doc") 209 | data = {"ocr_available": False} 210 | response = requests.post( 211 | "http://doctor:5050/extract/doc/text/", files=files, data=data 212 | ) 213 | self.assertTrue(response.ok, msg="Content extraction failed") 214 | content = response.json()["content"][:100].replace("\n", "").strip() 215 | self.assertEqual( 216 | content, 217 | "Attorneys for Appellant Attorneys for AppelleeSteve Carter", 218 | msg="Failed to extract content from .doc file", 219 | ) 220 | self.assertEqual( 221 | response.json()["page_count"], 222 | None, 223 | msg="Failed to extract by OCR", 224 | ) 225 | 226 | def test_wpd_format(self): 227 | files = make_file(filename="word-perfect.wpd") 228 | data = {"ocr_available": False} 229 | response = requests.post( 230 | "http://doctor:5050/extract/doc/text/", files=files, data=data 231 | ) 232 | self.assertTrue(response.ok, msg="Content extraction failed") 233 | self.assertIn( 234 | "ATTORNEY FOR APPELLANT", 235 | response.json()["content"], 236 | msg="Failed to extract content from WPD file", 237 | ) 238 | self.assertEqual( 239 | 14259, 240 | len(response.json()["content"]), 241 | msg="Failed to extract content from WPD file", 242 | ) 243 | 244 | 245 | class ThumbnailTests(unittest.TestCase): 246 | """Can we generate thumbnail images from PDF files""" 247 | 248 | def test_convert_pdf_to_thumbnail_png(self): 249 | """Can we generate four 
thumbanils a pdf?""" 250 | files = make_file(filename="image-pdf.pdf") 251 | data = {"max_dimension": 350} 252 | response = requests.post( 253 | "http://doctor:5050/convert/pdf/thumbnail/", 254 | files=files, 255 | data=data, 256 | ) 257 | with open("doctor/test_assets/image-pdf-thumbnail.png", "rb") as f: 258 | answer = f.read() 259 | self.assertEqual(answer, response.content) 260 | 261 | files = make_file(filename="image-pdf-2.pdf") 262 | response = requests.post( 263 | "http://doctor:5050/convert/pdf/thumbnail/", files=files 264 | ) 265 | with open("doctor/test_assets/image-pdf-2-thumbnail.png", "rb") as f: 266 | second_answer = f.read() 267 | self.assertEqual(second_answer, response.content) 268 | 269 | files = make_file(filename="empty.pdf") 270 | response = requests.post( 271 | "http://doctor:5050/convert/pdf/thumbnail/", files=files 272 | ) 273 | self.assertEqual(response.status_code, 400, msg="Wrong status code") 274 | 275 | def test_thumbnail_range(self): 276 | """Can we generate a thumbnail for a range of pages?""" 277 | files = make_file(filename="vector-pdf.pdf") 278 | pages = [1, 2, 3, 4] 279 | data = { 280 | "max_dimension": 350, 281 | "pages": json.dumps(pages), 282 | } 283 | 284 | response = requests.post( 285 | "http://doctor:5050/convert/pdf/thumbnails/", 286 | files=files, 287 | data=data, 288 | ) 289 | with NamedTemporaryFile(suffix=".zip") as tmp: 290 | with open(tmp.name, "wb") as f: 291 | f.write(response.content) 292 | with ZipFile(tmp.name, "r") as zipObj: 293 | listOfiles = sorted(zipObj.namelist()) 294 | self.assertEqual(len(listOfiles), 4) 295 | self.assertEqual( 296 | ["thumb-1.png", "thumb-2.png", "thumb-3.png", "thumb-4.png"], 297 | listOfiles, 298 | ) 299 | 300 | 301 | class MetadataTests(unittest.TestCase): 302 | """Can we count page numbers in PDF files""" 303 | 304 | def test_page_count_pdf(self): 305 | """""" 306 | files = make_file(filename="image-pdf.pdf") 307 | page_count = requests.post( 308 | 
"http://doctor:5050/utils/page-count/pdf/", files=files 309 | ).text 310 | self.assertEqual(int(page_count), 2, "Failed to get page count") 311 | 312 | def test_mime_type(self): 313 | """""" 314 | files = make_file(filename="image-pdf.pdf") 315 | params = {"mime": True} 316 | response = requests.post( 317 | "http://doctor:5050/utils/mime-type/", 318 | files=files, 319 | params=params, 320 | ).json() 321 | self.assertEqual( 322 | response["mimetype"], 323 | "application/pdf", 324 | msg="Failed to get mime type", 325 | ) 326 | 327 | def test_broken_mime_type(self): 328 | """""" 329 | files = make_buffer(filename="broken-mime.pdf") 330 | params = {"mime": True} 331 | response = requests.post( 332 | "http://doctor:5050/utils/file/extension/", 333 | files=files, 334 | params=params, 335 | ) 336 | self.assertEqual(response.text, ".pdf", msg="Failed to get mime type") 337 | 338 | files = make_buffer(filename="missouri.pdf") 339 | params = {"mime": True} 340 | response = requests.post( 341 | "http://doctor:5050/utils/file/extension/", 342 | files=files, 343 | params=params, 344 | ) 345 | self.assertEqual(response.text, ".pdf", msg="Failed to get mime type") 346 | 347 | def test_mime_type_unknown_name(self): 348 | """""" 349 | files = make_buffer(filename="image-pdf.pdf") 350 | response = requests.post( 351 | "http://doctor:5050/utils/mime-type/", 352 | files=files, 353 | params={"mime": True}, 354 | ).json() 355 | self.assertEqual( 356 | response["mimetype"], 357 | "application/pdf", 358 | msg="Failed to get mime type", 359 | ) 360 | 361 | def test_get_extension(self): 362 | """""" 363 | files = make_buffer(filename="image-pdf.pdf") 364 | response = requests.post( 365 | "http://doctor:5050/utils/file/extension/", files=files 366 | ) 367 | self.assertEqual(response.text, ".pdf", msg="Failed to get mime type") 368 | 369 | files = make_buffer(filename="word-docx.docx") 370 | response = requests.post( 371 | "http://doctor:5050/utils/file/extension/", files=files 372 | ) 373 | 
self.assertEqual(response.text, ".docx", msg="Failed to get mime type") 374 | files = make_buffer(filename="word-doc.doc") 375 | response = requests.post( 376 | "http://doctor:5050/utils/file/extension/", files=files 377 | ) 378 | self.assertEqual(response.text, ".doc", msg="Failed to get mime type") 379 | 380 | def test_embedding_text_to_image_pdf(self): 381 | """Can we embed text into an image PDF?""" 382 | data = {"ocr_available": False} 383 | 384 | files = make_file(filename="image-pdf.pdf") 385 | image_response = requests.post( 386 | "http://doctor:5050/extract/doc/text/", files=files, data=data 387 | ) 388 | self.assertEqual( 389 | "", 390 | image_response.json()["content"].strip("\x0c"), 391 | msg="PDF should have no text", 392 | ) 393 | 394 | # Embed text into the image pdf and check that we get some text 395 | new_pdf = requests.post( 396 | "http://doctor:5050/utils/add/text/pdf/", files=files 397 | ) 398 | with NamedTemporaryFile(suffix=".pdf") as tmp: 399 | with open(tmp.name, "wb") as f: 400 | f.write(new_pdf.content) 401 | with open(tmp.name, "rb") as f: 402 | files = {"file": (tmp.name, f.read())} 403 | 404 | # Confirm that text is now embedded in the PDF 405 | response = requests.post( 406 | "http://doctor:5050/extract/doc/text/", 407 | files=files, 408 | data=data, 409 | ) 410 | self.assertIn( 411 | "(SlipOpinion) OCTOBER TERM, 2012", 412 | response.json()["content"], 413 | msg=f"Got {response.json()}", 414 | ) 415 | 416 | def test_get_document_number(self): 417 | """Check if the PACER document number is correctly extracted from 418 | documents from multiple jurisdictions. 
419 | """ 420 | 421 | filepath = f"{Path.cwd()}/doctor/test_assets/recap_documents/" 422 | for file in glob.glob(os.path.join(filepath, "*.pdf")): 423 | filename = os.path.relpath(file, filepath) 424 | filename_sans_ext = filename.split(".")[0] 425 | doc_num = filename_sans_ext.split("_")[1] 426 | 427 | with open(file, "rb") as f: 428 | files = {"file": (filename, f.read())} 429 | 430 | document_number = requests.post( 431 | "http://doctor:5050/utils/document-number/pdf/", 432 | files=files, 433 | ).text 434 | 435 | self.assertEqual(doc_num, document_number) 436 | 437 | 438 | class RedactionTest(unittest.TestCase): 439 | def test_xray_no_pdf(self): 440 | """Are we able to discover bad redacts?""" 441 | filepath = f"{Path.cwd()}/doctor/test_assets/x-ray/" 442 | test_files = ( 443 | "*yes*.pdf", 444 | "*no*.pdf", 445 | ) 446 | for pattern in test_files: 447 | direction = re.search("yes", pattern) 448 | for file in glob.glob(os.path.join(filepath, pattern)): 449 | filename = os.path.relpath(file, filepath) 450 | 451 | with open(file, "rb") as f: 452 | files = {"file": (filename, f.read())} 453 | response = requests.post( 454 | "http://doctor:5050/utils/check-redactions/pdf/", 455 | files=files, 456 | ) 457 | # Break up the assertion so that testers can see which 458 | # part is actually failing 459 | self.assertTrue(response.ok) 460 | bb = response.json() 461 | self.assertFalse(bb["error"]) 462 | if not direction: 463 | self.assertTrue(len(bb["results"]) == 0) 464 | else: 465 | self.assertFalse(len(bb["results"]) == 0) 466 | 467 | 468 | class ImageDisclosuresTest(unittest.TestCase): 469 | def test_images_to_pdf(self): 470 | """Do we create a PDF from several tiffs successfully?""" 471 | base = "https://com-courtlistener-storage.s3-us-west-2.amazonaws.com/financial-disclosures/2011/A-E/Armstrong-SB%20J3.%2009.%20CAN_R_11/Armstrong-SB%20J3.%2009.%20CAN_R_11_Page" 472 | sorted_urls = [ 473 | f"{base}_1.tiff", 474 | f"{base}_2.tiff", 475 | ] 476 | params = {"sorted_urls": 
json.dumps(sorted_urls)} 477 | response = requests.post( 478 | "http://doctor:5050/convert/images/pdf/", 479 | params=params, 480 | ) 481 | self.assertEqual(response.status_code, 200, msg="Failed status code.") 482 | self.assertEqual( 483 | b"%PDF-1.3\n", 484 | response.content[:9], 485 | msg="PDF generation failed", 486 | ) 487 | 488 | 489 | class AudioConversionTests(unittest.TestCase): 490 | """Test Audio Conversion""" 491 | 492 | def test_wma_to_mp3(self): 493 | """Can we convert to mp3 with metadata""" 494 | 495 | audio_details = { 496 | "court_full_name": "Testing Supreme Court", 497 | "court_short_name": "Testing Supreme Court", 498 | "court_pk": "mad", 499 | "court_url": "http://www.example.com/", 500 | "docket_number": "docket number 1 005", 501 | "date_argued": "2020-01-01", 502 | "date_argued_year": "2020", 503 | "case_name": "SEC v. Frank J. Custable, Jr.", 504 | "case_name_full": "case name full", 505 | "case_name_short": "short", 506 | "download_url": "http://media.ca7.uscourts.gov/sound/external/gw.15-1442.15-1442_07_08_2015.mp3", 507 | } 508 | 509 | files = make_file(filename="1.wma") 510 | response = requests.post( 511 | "http://doctor:5050/convert/audio/mp3/", 512 | files=files, 513 | params=audio_details, 514 | ) 515 | self.assertEqual(response.status_code, 200, msg="Bad status code") 516 | 517 | # Validate some metadata in the MP3. 518 | with NamedTemporaryFile(suffix=".mp3") as tmp: 519 | with open(tmp.name, "wb") as mp3_data: 520 | mp3_data.write(response.content) 521 | mp3_file = eyed3.load(tmp.name) 522 | 523 | self.assertEqual( 524 | mp3_file.tag.publisher, 525 | "Free Law Project", 526 | msg="Publisher metadata failed.", 527 | ) 528 | self.assertEqual( 529 | mp3_file.tag.title, 530 | "SEC v. Frank J. 
Custable, Jr.", 531 | msg="Title metadata failed.", 532 | ) 533 | self.assertEqual( 534 | mp3_file.type, 535 | eyed3.core.AUDIO_MP3, 536 | msg="Audio conversion to mp3 failed.", 537 | ) 538 | 539 | def test_audio_duration(self): 540 | files = make_file(filename="1.mp3") 541 | response = requests.post( 542 | "http://doctor:5050/utils/audio/duration/", 543 | files=files, 544 | ) 545 | self.assertEqual(51.64, float(response.text), msg="Bad duration") 546 | 547 | 548 | class TestFailedValidations(unittest.TestCase): 549 | def test_for_400s(self): 550 | """Test validation for missing audio file""" 551 | response = requests.post( 552 | "http://doctor:5050/utils/audio/duration/", 553 | ) 554 | self.assertEqual(response.status_code, 400, msg="Wrong validation") 555 | 556 | def test_pdf_400s(self): 557 | """Test validation for missing PDF file""" 558 | response = requests.post( 559 | "http://doctor:5050/extract/doc/text/", 560 | ) 561 | self.assertEqual( 562 | "Failed validation", 563 | response.text, 564 | msg="Wrong validation error", 565 | ) 566 | self.assertEqual(response.status_code, 400, msg="Wrong validation") 567 | 568 | def test_pdf_400_mime(self): 569 | """Test return 400 on missing file for mime extraction""" 570 | response = requests.post( 571 | "http://doctor:5050/utils/mime-type/", 572 | params={"mime": True}, 573 | ) 574 | self.assertEqual(response.status_code, 400, msg="Wrong validation") 575 | 576 | 577 | class TestRecapWhitespaceInsertions(unittest.TestCase): 578 | """Test our whitespace insertion code""" 579 | 580 | def test_insert_whitespace_new_line(self): 581 | content = "foo" 582 | word = { 583 | "line_num": 2, 584 | "par_num": 1, 585 | "left": 50, 586 | "top": 200, 587 | "width": 10, 588 | "height": 20, 589 | } 590 | prev = { 591 | "line_num": 1, 592 | "par_num": 1, 593 | "left": 10, 594 | "top": 100, 595 | "width": 30, 596 | "height": 20, 597 | } 598 | result = insert_whitespace(content, word, prev) 599 | self.assertEqual(result, "foo\n ") 600 | 601 
    def test_insert_whitespace_new_paragraph(self):
        # A new paragraph number should also force a newline.
        content = "foo"
        word = {
            "line_num": 1,
            "par_num": 2,
            "left": 50,
            "top": 200,
            "width": 10,
            "height": 20,
        }
        prev = {
            "line_num": 2,
            "par_num": 1,
            "left": 10,
            "top": 100,
            "width": 30,
            "height": 20,
        }
        result = insert_whitespace(content, word, prev)
        self.assertEqual(result, "foo\n ")

    def test_insert_whitespace_vertical_gap(self):
        # A large vertical jump (top 100 -> 300) should produce a blank line.
        content = "foo"
        word = {
            "line_num": 2,
            "par_num": 1,
            "left": 50,
            "top": 300,
            "width": 10,
            "height": 20,
        }
        prev = {
            "line_num": 1,
            "par_num": 1,
            "left": 10,
            "top": 100,
            "width": 30,
            "height": 20,
        }
        result = insert_whitespace(content, word, prev)
        self.assertEqual(result, "foo\n\n ")

    def test_insert_whitespace_horizontal_gap(self):
        # Same line, but a wide horizontal gap between words.
        content = "foo"
        word = {
            "line_num": 1,
            "par_num": 1,
            "left": 200,
            "top": 100,
            "width": 10,
            "height": 20,
        }
        prev = {
            "line_num": 1,
            "par_num": 1,
            "left": 10,
            "top": 100,
            "width": 30,
            "height": 20,
        }
        result = insert_whitespace(content, word, prev)
        self.assertEqual(result, "foo ")

    def test_insert_whitespace_no_gap(self):
        # Adjacent words on the same line: nothing should be inserted.
        content = "foo"
        word = {
            "line_num": 1,
            "par_num": 1,
            "left": 50,
            "top": 100,
            "width": 10,
            "height": 20,
        }
        prev = {
            "line_num": 1,
            "par_num": 1,
            "left": 40,
            "top": 100,
            "width": 10,
            "height": 20,
        }
        result = insert_whitespace(content, word, prev)
        self.assertEqual(result, "foo")


class TestOCRConfidenceTests(unittest.TestCase):
    """Test our OCR confidence checking functions."""
    # NOTE(review): 612 is presumably the page width in PDF points
    # (8.5in x 72) — confirm against get_word's signature.

    def test_confidence_zero(self):
        # Zero confidence words are dropped and replaced with whitespace.
        word_dict = {"text": "foo", "conf": 0, "left": 10, "width": 30}
        result = get_word(word_dict, 612, True)
        self.assertEqual(result, " ")

    def test_confidence_low_and_in_margin(self):
        # Low confidence inside the left margin: dropped.
        word_dict = {"text": "foo", "conf": 30, "left": 5, "width": 20}
        result = get_word(word_dict, 612, True)
        self.assertEqual(result, " ")

    def test_confidence_below_threshold_short_word(self):
        # Low-confidence words in the body are masked with box glyphs.
        word_dict = {"text": "foo", "conf": 3, "left": 200, "width": 20}
        result = get_word(word_dict, 612, True)
        self.assertEqual(result, "□□□ ")

    def test_confidence_below_threshold_long_word(self):
        word_dict = {
            "text": "foobarbazfoobarbazfoobar",
            "conf": 3,
            "left": 200,
            "width": 200,
        }
        result = get_word(word_dict, 612, True)
        self.assertEqual(result, "□□□□□□□□□□□□□□□□□□□□□□□□ ")

    def test_confidence_below_threshold_in_right_margin(self):
        word_dict = {"text": "foo", "conf": 30, "left": 580, "width": 10}
        result = get_word(word_dict, 612, True)
        self.assertEqual(result, "□□□ ")

    def test_valid_word_high_confidence(self):
        # High-confidence words pass through unchanged.
        word_dict = {"text": "foo", "conf": 90, "left": 50, "width": 20}
        result = get_word(word_dict, 612, True)
        self.assertEqual(result, "foo ")

    def test_word_on_left_edge(self):
        word_dict = {"text": "foo", "conf": 50, "left": 0, "width": 20}
        result = get_word(word_dict, 612, True)
        self.assertEqual(result, " ")


class TestWhiteSpaceRemoval(unittest.TestCase):
    # NOTE(review): the leading whitespace inside these literals is the
    # behavior under test — verify it matches the repository copy exactly.
    def test_left_shift(self):
        """Can we properly shift our text left?"""
        document = """
foo
bar
foo
bar"""
        expected_result = """ foo
bar
foo
bar"""
        result = remove_excess_whitespace(document)
        self.assertEqual(result, expected_result)

    def test_left_shift_when_artifact_exists(self):
        """Shift left once"""
        document = """
foo
bar
| foo
bar"""
        expected_result = """ foo
bar
| foo
bar"""
        result = remove_excess_whitespace(document)
        self.assertEqual(result, expected_result)


class TestCleanupContent(unittest.TestCase):
    def setUp(self):
        # Patch the functions before each test method so cleanup_content is
        # tested in isolation; the patched helpers pass text through.
        patcher1 = patch(
            "doctor.lib.text_extraction.adjust_caption_lines",
            side_effect=lambda x: x,
        )
        patcher2 = patch(
            "doctor.lib.text_extraction.remove_excess_whitespace",
            side_effect=lambda x: x,
        )
        self.mock_adjust = patcher1.start()
        self.mock_remove_whitespace = patcher2.start()
        self.addCleanup(patcher1.stop)
        self.addCleanup(patcher2.stop)

    def test_remove_floating_pipes(self):
        """Can we remove a pipe"""
        content = "This is a test line | \nAnother line"
        expected_result = "This is a test line\nAnother line\n"
        result = cleanup_content(content, 2)
        self.assertEqual(result, expected_result)

    def test_remove_floating_artifacts_right_side(self):
        """Can we remove an artifact on the far right"""
        content = "This is a test line e \nAnother line"
        expected_result = "This is a test line\nAnother line\n"
        result = cleanup_content(content, 2)
        self.assertEqual(result, expected_result)

    def test_remove_floating_pipes_and_artifacts(self):
        """Test to remove just the period"""
        content = "This is a test line | and the content continues\nThis is another test line e \nFinal line"
        expected_result = "This is a test line | and the content continues\nThis is another test line\nFinal line\n"
        result = cleanup_content(content, 2)
        self.assertEqual(result, expected_result)

    def test_no_floating_pipes_or_artifacts(self):
        """Test that no floating pipes are an issue"""
        content = (
            "This is a test line JW-6\nAnother line\n"
        )
        expected_result = (
            "This is a test line JW-6\nAnother line\n\n"
        )
        result = cleanup_content(content, 2)
        self.assertEqual(result, expected_result)

    def test_adjust_caption(self):
        """Test if we can align the caption correctly"""
        # NOTE(review): the column alignment inside these literals is what
        # adjust_caption_lines manipulates — confirm spacing against the
        # repository copy before relying on this transcription.
        content = """ 10
LESLIE MASSEY, ) Case No.: 2:16-cv-05001 GJS
)
oe ) PROPOSED} ORDER AWARDING
12 Plaintiff, ) EQUAL ACCESS TO JUSTICE ACT
) ATTORNEY FEES AND EXPENSES
13 VS. ) PURSUANT TO 28 U.S.C. § 2412(d)
NANCY A. BERRYHILL, Acting ) AND COSTS PURSUANT TO 28
14 || Commissioner of Social Security, ) U.S.C. § 1920
15 Defendant )
16 ) """

        expected_result = """ 10
LESLIE MASSEY, ) Case No.: 2:16-cv-05001 GJS
)
oe ) PROPOSED} ORDER AWARDING
12 Plaintiff, ) EQUAL ACCESS TO JUSTICE ACT
) ATTORNEY FEES AND EXPENSES
13 VS. ) PURSUANT TO 28 U.S.C. § 2412(d)
NANCY A. BERRYHILL, Acting ) AND COSTS PURSUANT TO 28
14 || Commissioner of Social Security, ) U.S.C. § 1920
15 Defendant )
16 ) """
        content = adjust_caption_lines(content)
        self.assertEqual(expected_result, content)


if __name__ == "__main__":
    unittest.main()
from django.urls import path, re_path

from . import views

urlpatterns = [
    # Server
    path("", views.heartbeat, name="heartbeat"),
    # Text extraction
    path(
        "extract/doc/text/",
        views.extract_doc_content,
        name="convert-doc-to-text",
    ),
    path(
        "extract/recap/text/",
        views.extract_recap_document,
        name="extract-recap-document",
    ),
    # Conversion
    path("convert/image/pdf/", views.image_to_pdf, name="image-to-pdf"),
    path("convert/images/pdf/", views.images_to_pdf, name="images-to-pdf"),
    path("convert/pdf/thumbnail/", views.make_png_thumbnail, name="thumbnail"),
    path(
        "convert/pdf/thumbnails/",
        views.make_png_thumbnails_from_range,
        name="thumbnails",
    ),
    # NOTE(review): this pattern is unanchored (no ^...$), and Django's
    # RegexPattern matches with re.search — confirm mid-path matches are
    # intended before anchoring it.
    re_path(
        "convert/audio/(mp3|ogg)/", views.convert_audio, name="convert-audio"
    ),
    # Utilities
    path("utils/page-count/pdf/", views.page_count, name="page_count"),
    path("utils/mime-type/", views.extract_mime_type, name="mime_type"),
    path(
        "utils/file/extension/", views.extract_extension, name="file-extension"
    ),
    path(
        "utils/audio/duration/",
        views.fetch_audio_duration,
        name="audio-duration",
    ),
    path("utils/add/text/pdf/", views.embed_text, name="add-text-to-pdf"),
    path(
        "utils/document-number/pdf/",
        views.get_document_number,
        name="document-number-pdf",
    ),
    path("utils/check-redactions/pdf/", views.xray, name="xray-pdf"),
]
import logging
import mimetypes
import re
import shutil
from http.client import BAD_REQUEST
from tempfile import NamedTemporaryFile, TemporaryDirectory

import eyed3
import img2pdf
import magic
import pytesseract
import requests
from django.core.exceptions import BadRequest
from django.http import FileResponse, HttpResponse, JsonResponse
from lxml.etree import ParserError, XMLSyntaxError
from PIL import Image
from PyPDF2 import PdfReader, PdfWriter
from pytesseract import Output

from doctor.forms import (
    AudioForm,
    BaseFileForm,
    DocumentForm,
    ImagePdfForm,
    MimeForm,
    ThumbnailForm,
)
from doctor.lib.utils import (
    cleanup_form,
    log_sentry_event,
    make_page_with_text,
    make_png_thumbnail_for_instance,
    make_png_thumbnails,
    strip_metadata_from_path,
)
from doctor.tasks import (
    convert_tiff_to_pdf_bytes,
    convert_to_mp3,
    convert_to_ogg,
    download_images,
    extract_from_doc,
    extract_from_docx,
    extract_from_html,
    extract_from_pdf,
    extract_from_txt,
    extract_from_wpd,
    extract_recap_pdf,
    get_document_number_from_pdf,
    get_page_count,
    get_xray,
    make_pdftotext_process,
    rasterize_pdf,
    set_mp3_meta_data,
    strip_metadata_from_bytes,
)

logger = logging.getLogger(__name__)


def heartbeat(request) -> HttpResponse:
    """Heartbeat endpoint

    :param request: The request object
    :return: Heartbeat
    """
    return HttpResponse("Heartbeat detected.")


def image_to_pdf(request) -> HttpResponse:
    """Convert an uploaded TIFF image into a metadata-stripped PDF.

    :param request: The request object carrying the uploaded image file
    :return: The PDF bytes, or a 400 response on failed validation
    """
    form = DocumentForm(request.POST, request.FILES)
    if not form.is_valid():
        return HttpResponse("Failed validation", status=BAD_REQUEST)
    image = Image.open(form.cleaned_data["fp"])
    pdf_bytes = convert_tiff_to_pdf_bytes(image)
    cleaned_pdf_bytes = strip_metadata_from_bytes(pdf_bytes)
    # NOTE(review): the temp file below is written but never read or
    # returned — the response is built from cleaned_pdf_bytes directly.
    # Looks like dead code; confirm before removing.
    with NamedTemporaryFile(suffix=".pdf") as output:
        with open(output.name, "wb") as f:
            f.write(cleaned_pdf_bytes)
        cleanup_form(form)
    return HttpResponse(cleaned_pdf_bytes)


def extract_recap_document(request) -> JsonResponse:
    """Extract Recap Documents

    :param request: The request object
    :return: JsonResponse
""" 91 | form = DocumentForm(request.GET, request.FILES) 92 | if not form.is_valid(): 93 | return JsonResponse( 94 | { 95 | "err": "Failed validation", 96 | }, 97 | status=BAD_REQUEST, 98 | ) 99 | filepath = form.cleaned_data["fp"] 100 | strip_margin = form.cleaned_data["strip_margin"] 101 | content, extracted_by_ocr = extract_recap_pdf( 102 | filepath=filepath, 103 | strip_margin=strip_margin, 104 | ) 105 | cleanup_form(form) 106 | return JsonResponse( 107 | { 108 | "content": content, 109 | "extracted_by_ocr": extracted_by_ocr, 110 | } 111 | ) 112 | 113 | 114 | def extract_doc_content(request) -> JsonResponse | HttpResponse: 115 | """Extract txt from different document types. 116 | 117 | :return: The content of a document/error message. 118 | :type: json object 119 | """ 120 | form = DocumentForm(request.GET, request.FILES) 121 | if not form.is_valid(): 122 | return HttpResponse("Failed validation", status=BAD_REQUEST) 123 | ocr_available = form.cleaned_data["ocr_available"] 124 | extension = form.cleaned_data["extension"] 125 | fp = form.cleaned_data["fp"] 126 | extracted_by_ocr = False 127 | err = "" 128 | # We keep the original file name to use it for debugging purposes, you can find it in local_path (Opinion) field 129 | # or filepath_local (AbstractPDF). 
130 | original_filename = form.cleaned_data["original_filename"] 131 | try: 132 | if extension == "pdf": 133 | content, err, returncode, extracted_by_ocr = extract_from_pdf( 134 | fp, original_filename, ocr_available 135 | ) 136 | elif extension == "doc": 137 | content, err, returncode = extract_from_doc(fp) 138 | elif extension == "docx": 139 | content, err, returncode = extract_from_docx(fp) 140 | elif extension == "html": 141 | content, err, returncode = extract_from_html(fp) 142 | elif extension == "txt": 143 | content, err, returncode = extract_from_txt(fp) 144 | elif extension == "wpd": 145 | content, err, returncode = extract_from_wpd(fp) 146 | else: 147 | returncode = 1 148 | err = "Unable to extract content due to unknown extension" 149 | content = "" 150 | 151 | if returncode != 0: 152 | log_sentry_event( 153 | logger=logger, 154 | level=logging.ERROR, 155 | message="Unable to extract document content", 156 | extra={ 157 | "file_name": original_filename, 158 | "err": err, 159 | }, 160 | exc_info=True, 161 | ) 162 | pass 163 | 164 | except (XMLSyntaxError, ParserError) as e: 165 | error_message = "HTML cleaning failed due to ParserError." 166 | if isinstance(e, XMLSyntaxError): 167 | error_message = "HTML cleaning failed due to XMLSyntaxError." 168 | 169 | log_sentry_event( 170 | logger=logger, 171 | level=logging.ERROR, 172 | message=error_message, 173 | extra={ 174 | "file_name": original_filename, 175 | "exception_type": type(e).__name__, 176 | "exception_message": str(e), 177 | }, 178 | exc_info=True, 179 | ) 180 | content = "Unable to extract the content from this file. Please try reading the original." 
181 | 182 | # Get page count if you can 183 | page_count = get_page_count(fp, extension) 184 | cleanup_form(form) 185 | return JsonResponse( 186 | { 187 | "content": content, 188 | "err": err, 189 | "extension": extension, 190 | "extracted_by_ocr": extracted_by_ocr, 191 | "page_count": page_count, 192 | } 193 | ) 194 | 195 | 196 | def make_png_thumbnail(request) -> HttpResponse: 197 | """Make a thumbnail of the first page of a PDF and return it. 198 | 199 | :return: A response containing our file and any errors 200 | :type: HTTPS response 201 | """ 202 | form = ThumbnailForm(request.POST, request.FILES) 203 | if not form.is_valid(): 204 | return HttpResponse("Failed validation", status=BAD_REQUEST) 205 | document = form.cleaned_data["file"] 206 | with NamedTemporaryFile(suffix=".pdf") as tmp: 207 | with open(tmp.name, "wb") as f: 208 | f.write(document.read()) 209 | thumbnail, _, _ = make_png_thumbnail_for_instance( 210 | tmp.name, form.cleaned_data["max_dimension"] 211 | ) 212 | return HttpResponse(thumbnail) 213 | 214 | 215 | def make_png_thumbnails_from_range(request) -> HttpResponse: 216 | """Make a zip file that contains a thumbnail for each page requested. 
217 | 218 | :return: A response containing our zip and any errors 219 | :type: HTTPS response 220 | """ 221 | form = ThumbnailForm(request.POST, request.FILES) 222 | if not form.is_valid(): 223 | return HttpResponse("Failed validation", status=BAD_REQUEST) 224 | 225 | directory = TemporaryDirectory() 226 | with NamedTemporaryFile(suffix=".pdf", mode="r+b") as temp_pdf: 227 | temp_pdf.write(form.cleaned_data["file"].read()) 228 | 229 | make_png_thumbnails( 230 | temp_pdf.name, 231 | form.cleaned_data["max_dimension"], 232 | form.cleaned_data["pages"], 233 | directory, 234 | ) 235 | 236 | with NamedTemporaryFile(suffix=".zip") as tmp_zip: 237 | filename = shutil.make_archive( 238 | f"{tmp_zip.name[:-4]}", "zip", directory.name 239 | ) 240 | return FileResponse(open(filename, "rb")) 241 | 242 | 243 | def xray(request) -> JsonResponse: 244 | """Check PDF for bad redactions 245 | 246 | :return: json with bounding boxes and text 247 | """ 248 | try: 249 | form = DocumentForm(request.POST, request.FILES) 250 | if not form.is_valid(): 251 | return JsonResponse( 252 | {"error": True, "msg": "Failed validation"}, status=BAD_REQUEST 253 | ) 254 | extension = form.cleaned_data["extension"] 255 | if extension.casefold() != "pdf": 256 | return JsonResponse( 257 | {"error": True, "msg": "Failed file type"}, status=BAD_REQUEST 258 | ) 259 | results = get_xray(form.cleaned_data["fp"]) 260 | if results.get("error", False): 261 | return JsonResponse(results, status=BAD_REQUEST) 262 | except Exception: 263 | pass 264 | finally: 265 | cleanup_form(form) 266 | return JsonResponse({"error": False, "results": results}) 267 | 268 | 269 | def page_count(request) -> HttpResponse: 270 | """Get page count from PDF 271 | 272 | :return: Page count 273 | """ 274 | form = DocumentForm(request.POST, request.FILES) 275 | if not form.is_valid(): 276 | return HttpResponse("Failed validation", status=BAD_REQUEST) 277 | extension = form.cleaned_data["extension"] 278 | pg_count = 
get_page_count(form.cleaned_data["fp"], extension) 279 | cleanup_form(form) 280 | return HttpResponse(pg_count) 281 | 282 | 283 | def extract_mime_type(request) -> JsonResponse | HttpResponse: 284 | """Identify the mime type of a document 285 | 286 | :return: Mime type 287 | """ 288 | form = DocumentForm(request.GET, request.FILES) 289 | if not form.is_valid(): 290 | return HttpResponse("Failed validation", status=BAD_REQUEST) 291 | mime = form.cleaned_data["mime"] 292 | mimetype = magic.from_file(form.cleaned_data["fp"], mime=mime) 293 | cleanup_form(form) 294 | return JsonResponse({"mimetype": mimetype}) 295 | 296 | 297 | def extract_extension(request) -> HttpResponse: 298 | """A handful of workarounds for getting extensions we can trust.""" 299 | form = MimeForm(request.GET, request.FILES) 300 | if not form.is_valid(): 301 | return HttpResponse("Failed validation", status=BAD_REQUEST) 302 | content = form.cleaned_data["file"].read() 303 | 304 | file_str = magic.from_buffer(content) 305 | if file_str.startswith("Composite Document File V2 Document"): 306 | # Workaround for issue with libmagic1==5.09-2 in Ubuntu 12.04. Fixed 307 | # in libmagic 5.11-2. 308 | mime = "application/msword" 309 | elif file_str == "(Corel/WP)": 310 | mime = "application/vnd.wordperfect" 311 | elif file_str == "C source, ASCII text": 312 | mime = "text/plain" 313 | elif file_str.startswith("WordPerfect document"): 314 | mime = "application/vnd.wordperfect" 315 | elif re.findall( 316 | r"(Audio file with ID3.*MPEG.*layer III)|(.*Audio Media.*)", file_str 317 | ): 318 | mime = "audio/mpeg" 319 | else: 320 | # No workaround necessary 321 | mime = magic.from_buffer(content, mime=True) 322 | extension = mimetypes.guess_extension(mime) 323 | if extension == ".obj": 324 | # It could be a wpd, if it's not a PDF 325 | if "PDF" in content[0:40]: 326 | # Does 'PDF' appear in the beginning of the content? 
327 | extension = ".pdf" 328 | else: 329 | extension = ".wpd" 330 | 331 | # The extension is .bin, look in the content if we can infer the 332 | # content type as pdf. See: https://bugs.astron.com/view.php?id=446 333 | if extension == ".bin": 334 | # Check if %PDF-X.X is in the first 1024 bytes of content 335 | pattern = rb"%PDF-[0-9]+(\.[0-9]+)?" 336 | matches = re.search(pattern, content[:1024]) 337 | if matches: 338 | # Document contains a pdf version, so the file must be a pdf 339 | extension = ".pdf" 340 | 341 | fixes = { 342 | ".htm": ".html", 343 | ".xml": ".html", 344 | ".wsdl": ".html", 345 | ".ksh": ".txt", 346 | ".asf": ".wma", 347 | ".dot": ".doc", 348 | } 349 | return HttpResponse(fixes.get(extension, extension).lower()) 350 | 351 | 352 | def pdf_to_text(request) -> JsonResponse | HttpResponse: 353 | """Extract text from text based PDFs immediately. 354 | 355 | :return: 356 | """ 357 | form = DocumentForm(request.POST, request.FILES) 358 | if not form.is_valid(): 359 | return HttpResponse("Failed validation", status=BAD_REQUEST) 360 | content, err, _ = make_pdftotext_process(form.cleaned_data["fp"]) 361 | cleanup_form(form) 362 | return JsonResponse( 363 | "content", 364 | content, 365 | "err", 366 | err, 367 | ) 368 | 369 | 370 | def images_to_pdf(request) -> HttpResponse: 371 | """ 372 | 373 | :param request: 374 | :return: 375 | """ 376 | form = ImagePdfForm(request.GET) 377 | if not form.is_valid(): 378 | raise BadRequest("Invalid form") 379 | sorted_urls = form.cleaned_data["sorted_urls"] 380 | 381 | if len(sorted_urls) > 1: 382 | image_list = download_images(sorted_urls) 383 | with NamedTemporaryFile(suffix=".pdf") as tmp: 384 | with open(tmp.name, "wb") as f: 385 | f.write(img2pdf.convert(image_list)) 386 | cleaned_pdf_bytes = strip_metadata_from_path(tmp.name) 387 | else: 388 | tiff_image = Image.open( 389 | requests.get(sorted_urls[0], stream=True, timeout=60 * 5).raw 390 | ) 391 | pdf_bytes = convert_tiff_to_pdf_bytes(tiff_image) 392 | 
cleaned_pdf_bytes = strip_metadata_from_bytes(pdf_bytes) 393 | return HttpResponse(cleaned_pdf_bytes, content_type="application/pdf") 394 | 395 | 396 | def fetch_audio_duration(request) -> HttpResponse: 397 | """Fetch audio duration from file.""" 398 | try: 399 | form = AudioForm(request.GET, request.FILES) 400 | if not form.is_valid(): 401 | return HttpResponse("Failed validation", status=BAD_REQUEST) 402 | with NamedTemporaryFile(suffix=".mp3") as tmp: 403 | with open(tmp.name, "wb") as f: 404 | for chunk in form.cleaned_data["file"].chunks(): 405 | f.write(chunk) 406 | mp3_file = eyed3.load(tmp.name) 407 | return HttpResponse(mp3_file.info.time_secs) 408 | except Exception as e: 409 | return HttpResponse(str(e)) 410 | 411 | 412 | def convert_audio(request, output_format: str) -> FileResponse | HttpResponse: 413 | """Converts an uploaded audio file to the specified output format and 414 | updates its metadata. 415 | 416 | :return: Converted audio 417 | """ 418 | form = AudioForm(request.GET, request.FILES) 419 | if not form.is_valid(): 420 | return HttpResponse("Failed validation", status=BAD_REQUEST) 421 | filepath = form.cleaned_data["fp"] 422 | media_file = form.cleaned_data["file"] 423 | audio_data = {k: v[0] for k, v in dict(request.GET).items()} 424 | match output_format: 425 | case "mp3": 426 | convert_to_mp3(filepath, media_file) 427 | set_mp3_meta_data(audio_data, filepath) 428 | case "ogg": 429 | convert_to_ogg(filepath, media_file) 430 | case _: 431 | raise NotImplementedError 432 | response = FileResponse( 433 | open(filepath, "rb") # noqa: SIM115 FileResponse closes the file 434 | ) 435 | cleanup_form(form) 436 | return response 437 | 438 | 439 | def embed_text(request) -> FileResponse | HttpResponse: 440 | """Embed text onto an image PDF. 
441 | 442 | :return: Embedded PDF 443 | """ 444 | form = DocumentForm(request.GET, request.FILES) 445 | if not form.is_valid(): 446 | return HttpResponse("Failed validation", status=BAD_REQUEST) 447 | fp = form.cleaned_data["fp"] 448 | with NamedTemporaryFile(suffix=".tiff") as destination: 449 | rasterize_pdf(fp, destination.name) 450 | data = pytesseract.image_to_data( 451 | destination.name, output_type=Output.DICT 452 | ) 453 | image = Image.open(destination.name) 454 | w, h = image.width, image.height 455 | output = PdfWriter() 456 | with open(fp, "rb") as f: 457 | existing_pdf = PdfReader(f) 458 | for page in range(0, len(existing_pdf.pages)): 459 | packet = make_page_with_text(page + 1, data, h, w) 460 | new_pdf = PdfReader(packet) 461 | page = existing_pdf.pages[page] 462 | page.merge_page(new_pdf.pages[0]) 463 | output.add_page(page) 464 | 465 | with NamedTemporaryFile(suffix=".pdf") as pdf_destination: 466 | with open(pdf_destination.name, "wb") as outputStream: 467 | output.write(outputStream) 468 | response = FileResponse( 469 | open( # noqa: SIM115 FileResponse closes the file 470 | pdf_destination.name, "rb" 471 | ) 472 | ) 473 | cleanup_form(form) 474 | return response 475 | 476 | 477 | def get_document_number(request) -> HttpResponse: 478 | """Get PACER document number from PDF 479 | 480 | :param request: The request object 481 | :return: PACER document number 482 | """ 483 | 484 | form = BaseFileForm(request.GET, request.FILES) 485 | if not form.is_valid(): 486 | validation_message = form.errors.get_json_data()["__all__"][0][ 487 | "message" 488 | ] 489 | return HttpResponse(validation_message, status=BAD_REQUEST) 490 | fp = form.cleaned_data["fp"] 491 | document_number = get_document_number_from_pdf(fp) 492 | cleanup_form(form) 493 | return HttpResponse(document_number) 494 | -------------------------------------------------------------------------------- /doctor/wsgi.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | WSGI config for doctor project. 3 | 4 | It exposes the WSGI callable as a module-level variable named ``application``. 5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/4.0/howto/deployment/wsgi/ 8 | """ 9 | 10 | import os 11 | 12 | from django.core.wsgi import get_wsgi_application 13 | 14 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "doctor.settings") 15 | 16 | application = get_wsgi_application() 17 | -------------------------------------------------------------------------------- /manage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Django's command-line utility for administrative tasks.""" 3 | 4 | import os 5 | import sys 6 | 7 | 8 | def main(): 9 | """Run administrative tasks.""" 10 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "doctor.settings") 11 | try: 12 | from django.core.management import execute_from_command_line 13 | except ImportError as exc: 14 | raise ImportError( 15 | "Couldn't import Django. Are you sure it's installed and " 16 | "available on your PYTHONPATH environment variable? Did you " 17 | "forget to activate a virtual environment?" 
18 | ) from exc 19 | execute_from_command_line(sys.argv) 20 | 21 | 22 | if __name__ == "__main__": 23 | main() 24 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "doctor" 3 | version = "1" 4 | description = "CourtListener doctor" 5 | requires-python = ">=3.10" 6 | dependencies = [ 7 | "certifi", 8 | "chardet>=3.0.4", 9 | "django>=3.2,<4", 10 | "django-environ>=0.8.1", 11 | "eyed3", 12 | "gunicorn==20.1", 13 | "idna==2.10", 14 | "img2pdf", 15 | "lxml>=4.5.2", 16 | "lxml-html-clean", 17 | "numpy>=1.19.1", 18 | "opencv-python>=4.2.0.32", 19 | "pandas>=1.1.1", 20 | "pdf2image>=1.7.1", 21 | "pdfplumber", 22 | "pillow>=8.0.1", 23 | "pkginfo==1.5.0.1", 24 | "pypdf2[crypto]", 25 | "pytesseract>=0.3.5", 26 | "python-magic", 27 | "reportlab", 28 | "requests>=2.25", 29 | "seal-rookery>=2.2.1", 30 | "sentry-sdk", 31 | "six>=1.15", 32 | "urllib3>=1.25.10", 33 | "x-ray==0.3.3", 34 | ] 35 | 36 | [dependency-groups] 37 | dev = [ 38 | "ipython", 39 | ] 40 | 41 | [tool.ruff] 42 | line-length = 79 43 | lint.select = [ 44 | # flake8-bugbear 45 | "B", 46 | # flake8-comprehensions 47 | "C4", 48 | # pycodestyle 49 | "E", 50 | # Pyflakes errors 51 | "F", 52 | # isort 53 | "I", 54 | # flake8-simplify 55 | "SIM", 56 | # flake8-tidy-imports 57 | "TID", 58 | # pyupgrade 59 | "UP", 60 | # Pyflakes warnings 61 | "W", 62 | ] 63 | lint.ignore = [ 64 | # flake8-bugbear opinionated rules 65 | "B9", 66 | # line-too-long 67 | "E501", 68 | # suppressible-exception 69 | "SIM105", 70 | # if-else-block-instead-of-if-exp 71 | "SIM108", 72 | ] 73 | --------------------------------------------------------------------------------