├── .dockerignore ├── .editorconfig ├── .env.example ├── .gitattributes ├── .github ├── FUNDING.yml └── workflows │ ├── deploy.yml │ ├── lint.yml │ └── tests.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CHANGELOG.md ├── DEVELOPING.md ├── LICENSE ├── README.md ├── SECURITY.md ├── compose.yaml ├── docker ├── Dockerfile └── Makefile ├── doctor ├── __init__.py ├── assets │ └── producer-300x300.png ├── forms.py ├── lib │ ├── __init__.py │ ├── mojibake.py │ ├── text_extraction.py │ └── utils.py ├── settings.py ├── tasks.py ├── test_assets │ ├── 1.mp3 │ ├── 1.wma │ ├── 1_with_metadata.mp3 │ ├── ander_v._leo.mp3 │ ├── broken-mime.pdf │ ├── empty.pdf │ ├── image-pdf-2-thumbnail.png │ ├── image-pdf-2.pdf │ ├── image-pdf-thumbnail.png │ ├── image-pdf.pdf │ ├── long-image.tiff │ ├── missouri.pdf │ ├── ocr_pdf_variation.pdf │ ├── recap_documents │ │ ├── ca10_010110462922.pdf │ │ ├── ca1_00117684624.pdf │ │ ├── ca2_1-1.pdf │ │ ├── ca3_003112692106.pdf │ │ ├── ca4_17.pdf │ │ ├── ca5_00516242060.pdf │ │ ├── ca6_1-3.pdf │ │ ├── ca7_3.pdf │ │ ├── ca8_.pdf │ │ ├── ca9_19.pdf │ │ └── cafc_3.pdf │ ├── recap_extract │ │ ├── gov.uscourts.azd.1085839.3.0.pdf │ │ ├── gov.uscourts.cacd.652774.40.0.pdf │ │ └── gov.uscourts.cand.203070.27.0.pdf │ ├── vector-pdf.pdf │ ├── word-doc.doc │ ├── word-docx.docx │ ├── word-perfect.wpd │ └── x-ray │ │ ├── rectangles_no.pdf │ │ ├── rectangles_yes.pdf │ │ └── rectangles_yes_2.pdf ├── tests.py ├── urls.py ├── views.py └── wsgi.py ├── manage.py ├── pyproject.toml └── uv.lock /.dockerignore: -------------------------------------------------------------------------------- 1 | .git 2 | .venv 3 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | # http://editorconfig.org 2 | 3 | root = true 4 | 5 | [*] 6 | charset = utf-8 7 | indent_style = space 8 | indent_size = 4 9 | end_of_line = lf 10 | insert_final_newline = true 11 | 
trim_trailing_whitespace = true 12 | 13 | [*.py] 14 | max_line_length = 79 15 | 16 | [*.{js,html,json,css,yml,yaml}] 17 | indent_size = 2 18 | 19 | [*.md] 20 | trim_trailing_whitespace = false 21 | 22 | # The JSON files contain newlines inconsistently 23 | [*.json] 24 | insert_final_newline = ignore 25 | 26 | # Minified JavaScript files shouldn't be changed 27 | [**.min.js] 28 | indent_style = ignore 29 | insert_final_newline = ignore 30 | 31 | # Makefiles always use tabs for indentation 32 | [Makefile] 33 | indent_style = tab 34 | -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | DEBUG=on 2 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Prevent Windows git clients from goofing up files that must run on Linux 2 | * text eol=lf 3 | 4 | # Image files 5 | *.png binary 6 | *.jpg binary 7 | *.jpeg binary 8 | *.gif binary 9 | *.psd binary 10 | 11 | # Audio files 12 | *.wma binary 13 | *.mp3 binary 14 | 15 | # Compressed files 16 | *.jar binary 17 | *.exe binary 18 | *.bz2 binary 19 | *.gz binary 20 | *.zip binary 21 | 22 | # Fonts 23 | *.eot binary 24 | *.otf binary 25 | *.ttf binary 26 | *.woff binary 27 | *.woff2 binary 28 | 29 | # File formats 30 | *.ods binary 31 | *.pdf binary 32 | *.xls binary 33 | *.wpd binary 34 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: freelawproject 4 | custom: https://www.courtlistener.com/donate/?referrer=github-courtlistener 5 | -------------------------------------------------------------------------------- /.github/workflows/deploy.yml:
-------------------------------------------------------------------------------- 1 | name: Automate build and deploy 2 | on: 3 | pull_request: 4 | branches: [ "main" ] 5 | types: 6 | - closed 7 | 8 | env: 9 | AWS_REGION: us-west-2 10 | EKS_CLUSTER_NAME: courtlistener 11 | EKS_NAMESPACE: court-listener 12 | 13 | jobs: 14 | build: 15 | # Build only merged PRs 16 | if: (github.event_name == 'pull_request' && github.event.pull_request.merged == true) 17 | runs-on: ubuntu-latest 18 | steps: 19 | - uses: actions/checkout@v4 20 | - name: Login to Docker Hub 21 | uses: docker/login-action@v3 22 | with: 23 | username: ${{ secrets.DOCKERHUB_USERNAME }} 24 | password: ${{ secrets.DOCKERHUB_TOKEN }} 25 | - name: Build and Push 26 | run: | 27 | make push --file docker/Makefile -e VERSION=$(git rev-parse --short HEAD) 28 | 29 | deploy: 30 | needs: build 31 | runs-on: ubuntu-latest 32 | steps: 33 | - uses: actions/checkout@v4 34 | - name: Set shortcode 35 | id: vars 36 | run: echo "sha_short=$(git rev-parse --short HEAD)" >> "$GITHUB_OUTPUT" 37 | - name: Configure AWS credentials 38 | uses: aws-actions/configure-aws-credentials@v4 39 | with: 40 | aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} 41 | aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 42 | aws-region: ${{ env.AWS_REGION }} 43 | - name: Create Kubeconfig with AWS CLI 44 | run: aws eks update-kubeconfig --region ${{ env.AWS_REGION }} --name ${{ env.EKS_CLUSTER_NAME }} 45 | - name: Rollout cl-doctor 46 | run: kubectl set image -n ${{ env.EKS_NAMESPACE }} deployment/cl-doctor doctor=freelawproject/doctor:${{ steps.vars.outputs.sha_short }} 47 | - name: Watch cl-doctor rollout status 48 | run: kubectl rollout status -n ${{ env.EKS_NAMESPACE }} deployment/cl-doctor 49 | -------------------------------------------------------------------------------- /.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | name: Lint 2 | 3 | on: 4 | pull_request: 5 |
push: 6 | branches: 7 | - main 8 | 9 | jobs: 10 | pre-commit: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v4 14 | - uses: actions/setup-python@v5 15 | with: 16 | python-version: "3.10" 17 | - uses: pre-commit/action@v3.0.1 18 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | pull_request: 5 | push: 6 | branches: 7 | - main 8 | 9 | jobs: 10 | build: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v4 14 | 15 | - name: Create the .env settings file 16 | run: cp .env.example .env.dev 17 | 18 | - name: Update .env.dev file 19 | run: | 20 | echo 'DEBUG=on' >> .env.dev 21 | 22 | - name: Build Image 23 | run: docker compose up --build -d 24 | 25 | - name: Run tests 26 | run: docker compose exec doctor python -m unittest 27 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pipenv 85 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 86 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 87 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 88 | # install all needed dependencies. 89 | #Pipfile.lock 90 | 91 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 92 | __pypackages__/ 93 | 94 | # Celery stuff 95 | celerybeat-schedule 96 | celerybeat.pid 97 | 98 | # SageMath parsed files 99 | *.sage.py 100 | 101 | # Environments 102 | .venv 103 | env/ 104 | venv/ 105 | ENV/ 106 | env.bak/ 107 | venv.bak/ 108 | 109 | # Spyder project settings 110 | .spyderproject 111 | .spyproject 112 | 113 | # Rope project settings 114 | .ropeproject 115 | 116 | # mkdocs documentation 117 | /site 118 | 119 | # mypy 120 | .mypy_cache/ 121 | .dmypy.json 122 | dmypy.json 123 | 124 | # Pyre type checker 125 | .pyre/ 126 | 127 | # Ignore JetBrains files 128 | .idea 129 | 130 | # Env file 131 | .env.dev 132 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # See https://pre-commit.com for more information 2 | # See https://pre-commit.com/hooks.html for more hooks 3 | exclude: migrations 4 | repos: 5 | - repo: https://github.com/pre-commit/pre-commit-hooks 6 | rev: v4.5.0 7 | hooks: 8 | - id: check-added-large-files 9 | - id: check-ast 10 | - id: check-json 11 | - id: check-merge-conflict 12 | - id: check-toml 13 | - id: check-xml 14 | - id: check-yaml 15 | - id: debug-statements 16 | - id: detect-private-key 17 | - id: fix-byte-order-marker 18 | - id: fix-encoding-pragma 19 | args: [--remove] 20 | - id: trailing-whitespace 21 | args: [--markdown-linebreak-ext=md] 22 | 23 | - repo: https://github.com/astral-sh/ruff-pre-commit 24 | rev: v0.11.8 25 | hooks: 26 | - id: ruff 27 | args: [ --fix ] 28 | - id: ruff-format 29 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## Current 2 | 3 | **0.3.1 - 2023-01-17** 4 | 5 | Features: 6 | - Adds /convert/pdf/thumbnails/ service that returns a zip file with thumbnails from a PDF document. 
7 | 8 | ## Previous Versions 9 | 10 | **0.3.0 - 2022-09-30** 11 | 12 | Features: 13 | - Code cleanup and reformatting. 14 | - Documentation enhancements. 15 | - Removal of dead code. 16 | 17 | Changes: 18 | - Removes a number of URLs that were duplicative: 19 | - `/extract/pdf/text/`: Use `/extract/doc/text/` instead. They ran the same code under the covers, but their returns are slightly different. Generally, if you used something like `response.text` before, you should use `response.json()["content"]` now. 20 | - `/utils/file/mime/`: Use `/utils/mime-type/` instead. If you used `response.json()["mime"]` before, use `response.json()["mimetype"]` now. 21 | - These endpoints were never documented: 22 | - `/text/`, `/document/pdf-to-text/`, and `/extract-doc-content/`: Use `/extract/doc/text/` instead. 23 | - `/pg-count/` and `/document/page_count/`: Use `/utils/page-count/pdf/` instead. 24 | - `/mime-type/`: Use `/utils/mime-type/` instead. 25 | - `/image-to-pdf/`: Use `/convert/image/pdf/` instead. 26 | - `/images-to-pdf/`: Use `/convert/images/pdf/` instead. 27 | - `/thumbnail/`: Use `/convert/pdf/thumbnail/` instead. 28 | - `/convert-audio/`: Use `/convert/audio/mp3/` instead. 29 | - `/document/thumbnail/`: Use `/convert/pdf/thumbnail/` instead. 30 | - Tweaks the tests to use new container names that are less likely to conflict with existing containers. 31 | 32 | **0.2.16 - 2022-09-28** 33 | 34 | Features: 35 | - Adds /utils/document-number/pdf/ service that returns the PACER document number from a RECAP PDF document. 36 | 37 | **0.2.15 - 2022-07-27** 38 | 39 | Fixes: 40 | - Adds PyCryptodome in order to handle encrypted PDFs ([144](https://github.com/freelawproject/doctor/issues/144)) 41 | 42 | **0.2.14 - 2022-07-26** 43 | 44 | Features: 45 | - Adds sentry integration 46 | - Adds django-environ to allow environment variables for Django settings 47 | 48 | **0.2.13 - 2022-06-02** 49 | 50 | This release is focused on performance improvements and easier scaling. 
It: 51 | 52 | - Disables multi-threaded tesseract code. This makes it easier to scale doctor in a k8s environment due to at most one CPU being used per conversion. 53 | - Sets the number of gunicorn workers to 1 by default. This makes it so that scaling can be moved to k8s instead of gunicorn. 54 | - Tells tesseract not to look for white text on black backgrounds. This is just a simple performance tweak. 55 | - Upgrades to PyPDF2 version 2.0.0. 56 | 57 | **0.2.12 - 2022-05-19** 58 | 59 | Features: 60 | - Add an even better encoding for extract_from_html 61 | 62 | **0.2.11 - 2022-05-12** 63 | 64 | Features: 65 | - Add even better encoding for extract_from_html 66 | - Add better error message 67 | 68 | **0.2.10 - 2022-05-02** 69 | 70 | Features: 71 | - Adds better encoding for extract_from_html 72 | - Bump seal-rookery to 2.2.1 73 | - Update seal-rookery call 74 | 75 | **0.2.9 - 2022-04-19** 76 | 77 | Features: 78 | - Fix for mime type detection for weird PDF failures 79 | - Test for broken PDFs 80 | 81 | **0.2.8 - 2022-04-14** 82 | 83 | Features: 84 | - Drop m1 specific docker builds. 85 | - Return 406's when validation of forms fails 86 | - Add tests for incomplete post requests to the server. 87 | - Reduce build installs and build install time. 88 | 89 | **0.2.7 - 2022-04-12** 90 | 91 | Features: 92 | - Bump seal-rookery to speed up builds. 93 | - Add m1 build in Makefile. 94 | 95 | **0.2.6 - 2022-04-12** 96 | 97 | Fixes: 98 | - Add additional workers and worker resets to the gunicorn configuration. The 99 | default is now four workers, and additional ones can be created with the 100 | DOCTOR_WORKERS env.
101 | 102 | **0.2.5 - 2022-03-24** 103 | 104 | Features: 105 | - Add two new endpoints 106 | - Extensions from blob 107 | - Mime type from blob 108 | 109 | Changes: 110 | - Drop NGINX 111 | - Combine installation 112 | 113 | 114 | **0.2.4 - 2022-03-23** 115 | 116 | Features: 117 | - Refactor document/extract/ endpoint to return json and drop cookies 118 | 119 | Changes: 120 | - Fix dockerfile update-seals 121 | - Drop cookie support and use JSON responses when necessary 122 | - Update tests 123 | - Update heartbeat to match disclosure endpoint 124 | 125 | **0.2.3 - 2022-03-22** 126 | 127 | Features: 128 | - Update type of response object 129 | - Drop json response success = False if invalid form and just return Bad Request 130 | 131 | Changes: 132 | 133 | 134 | **0.2.2 - 2022-03-21** 135 | 136 | Features: 137 | - Split audio conversion into two steps: first convert to mp3 138 | and a second method to fetch audio duration.. 139 | 140 | Changes: 141 | - Update readme. 142 | - Bump version to 0.2.2 143 | - Update tests for new endpoint. 144 | 145 | 146 | **0.2.1 - 2022-03-18** 147 | 148 | Features: 149 | - Update nginx config for longer timeouts 150 | 151 | Changes: 152 | - Update nginx config for longer timeouts 153 | - Bump python version for linting 154 | - Fix typo in DEVELOPING.md 155 | 156 | **0.2.0 - 2022-03-16** 157 | 158 | Features: 159 | - Greatly improved documentation 160 | - Improved speed 161 | 162 | Changes: 163 | - Overhauled the entire codebase 164 | - Dropped seal-rookery image 165 | - Switched to Django and gunicorn from uWSGI and Flask 166 | - Completed api tests 167 | - Added Makefile for building and pushing 168 | - Updated NGINX config 169 | - Added DEVELOPING.md 170 | - Added composefile for testing with or without docker networking 171 | - Removed financial disclosures (coming soon as a separate project). 172 | - General improvements and cleanup. 173 | - Add support for multiple architectures. 
(linux/amd64,linux/arm64) 174 | - Added changelog 175 | 176 | 177 | **0.1.0 - 2021-11-08** 178 | 179 | 180 | **0.0.36 - 2021-05-11** 181 | 182 | 183 | **0.0.36 - 2021-03-17** 184 | -------------------------------------------------------------------------------- /DEVELOPING.md: -------------------------------------------------------------------------------- 1 | This is a microservice, so tests are designed to be run from a mock web 2 | application that calls to this service. 3 | 4 | ## Quick start 5 | 6 | To build the microservice and start it up, run: 7 | 8 | docker compose up --build -d 9 | 10 | To see logs: 11 | 12 | docker compose logs -f 13 | 14 | If you want to see debug logs, set `DEBUG` to `True` in `settings.py`. 15 | 16 | 17 | ## Testing 18 | 19 | Once the above compose file is running, you can use the `mock_web_app` 20 | container to run the tests against the `doctor` container: 21 | 22 | docker exec -it mock_web_app python3 -m unittest doctor.tests 23 | 24 | 25 | ## Building Images 26 | 27 | Generally, images are automatically built and pushed to the docker repo when 28 | PRs are merged. If it needs to happen manually, try this: 29 | 30 | `make image --file docker/Makefile` 31 | 32 | And pushed with: 33 | 34 | `make push --file docker/Makefile` 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 2-Clause License 2 | 3 | Copyright (c) 2020, Free Law Project 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2.
Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 20 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 22 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 23 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | Doctor 3 | ------------------------------------ 4 | 5 | Welcome to Doctor, Free Law Project's microservice for converting, extracting and modifying documents and audio files. 6 | 7 | At a high level, this service provides you with high-performance HTTP endpoints that can: 8 | 9 | - Extract text from various types of documents 10 | - Convert audio files from one format to another while stripping messy metadata 11 | - Create thumbnails of PDFs 12 | - Provide metadata about PDFs 13 | 14 | Under the hood, Doctor uses gunicorn to connect to a django service. The django service uses 15 | carefully configured implementations of `ffmpeg`, `pdftotext`, `tesseract`, `ghostscript`, and a 16 | number of other converters. 
17 | 18 | 19 | Quick Start 20 | ----------- 21 | 22 | Assuming you have docker installed run: 23 | 24 | docker run -d -p 5050:5050 freelawproject/doctor:latest 25 | 26 | This will expose the endpoints on port 5050 with one gunicorn worker. This is usually ideal because it allows you to horizontally scale Doctor using an orchestration system like Kubernetes. 27 | 28 | If you are not using a system that supports horizontal scaling, you may wish to have more gunicorn workers so that Doctor can handle more simultaneous tasks. To set that up, simply set the DOCTOR_WORKERS environment variable: 29 | 30 | docker run -d -p 5050:5050 -e DOCTOR_WORKERS=16 freelawproject/doctor:latest 31 | 32 | If you are doing OCR or audio conversion, scaling through a system like Kubernetes or by giving Doctor many workers becomes particularly important. If it does not have a worker available, your call to Doctor will probably time out. 33 | 34 | After the image is running, you should be able to test that you have a working environment by running 35 | 36 | curl http://localhost:5050 37 | 38 | which should return a text response: 39 | 40 | Heartbeat detected. 41 | 42 | 43 | ENDPOINTS 44 | ------------- 45 | 46 | ## Overview 47 | 48 | The service currently supports the following tools: 49 | 50 | 1. Extract text from PDF, RTF, DOC, DOCX, or WPD, HTML, TXT files. 51 | 1. OCR text from a scanned PDF. 52 | 1. Get page count for a PDF document. 53 | 1. Check for bad redactions in a PDF document. 54 | 1. Convert audio files from wma, ogg, wav to MP3. 55 | 1. Create a thumbnail of the first page of a PDF (for use in Open Graph tags) 56 | 1. Convert an image or images to a PDF. 57 | 1. Identify the mime type of a file. 58 | 59 | 60 | A brief description and curl command for each endpoint is provided below. 61 | 62 | ## Extractors 63 | 64 | ### Endpoint: /extract/doc/text/ 65 | 66 | Given a document, extract out the text and assorted metadata.
Supports the following document types: 67 | 68 | - `pdf` - Adobe portable document format files, via `pdftotext`. 69 | - `doc` - Word document files, via `antiword`. 70 | - `docx` - Open Office XML files, via `docx2txt`. 71 | - `html` - HTML files, via `lxml.html.clean.Cleaner`. Strips out dangerous tags and hoists their contents to their parent. Hoisted tags include: `a`, `body`, `font`, `noscript`, and `img`. 72 | - `txt` - Text files. This attempts to normalize all encoding questions to utf-8. First, we try cp1251, then utf-8, ignoring errors. 73 | - `wpd` - Word Perfect files, via `wpd2html` followed by cleaning the HTML as above. 74 | 75 | ```bash 76 | curl 'http://localhost:5050/extract/doc/text/' \ 77 | -X 'POST' \ 78 | -F "file=@doctor/test_assets/vector-pdf.pdf" 79 | ``` 80 | 81 | Parameters: 82 | 83 | - `ocr_available`: Whether doctor should use tesseract to provide OCR services for the document. OCR is always possible in doctor, but sometimes you won't want to use it, since it can be slow. If you want it disabled for this request, omit this optional parameter. To enable it, set ocr_available to `True`: 84 | 85 | ```bash 86 | curl 'http://localhost:5050/extract/doc/text/?ocr_available=True' \ 87 | -X 'POST' \ 88 | -F "file=@doctor/test_assets/image-pdf.pdf" 89 | ``` 90 | 91 | Magic: 92 | 93 | - The mimetype of the file will be determined by the name of the file you pass in. For example, if you pass in medical_assessment.pdf, the `pdf` extractor will be used. 94 | 95 | Valid requests will receive a JSON response with the following keys: 96 | 97 | - `content`: The utf-8 encoded text of the file 98 | - `err`: An error message, if one should occur. 99 | - `extension`: The sniffed extension of the file. 100 | - `extracted_by_ocr`: Whether OCR was needed and used during processing. 101 | - `page_count`: The number of pages, if it applies. 
102 | 103 | ### Endpoint: /extract/recap/text/ 104 | 105 | Given a RECAP pdf, extract out the text using PDF Plumber, OCR or a combination of the two. 106 | 107 | Parameters: 108 | 109 | - `strip_margin`: Whether doctor should crop the edges of the recap document during processing. With PDF plumber it will ignore traditional 1 inch margin. With an OCR it lowers the threshold for hiding OCR gibberish. To enable it, set strip_margin to `True`: 110 | 111 | ```bash 112 | curl 'http://localhost:5050/extract/recap/text/?strip_margin=True' \ 113 | -X 'POST' \ 114 | -F "file=@doctor/test_assets/recap_extract/gov.uscourts.cacd.652774.40.0.pdf" 115 | ``` 116 | 117 | Valid requests will receive a JSON response with the following keys: 118 | 119 | - `content`: The utf-8 encoded text of the file 120 | - `extracted_by_ocr`: Whether OCR was needed and used during processing. 121 | 122 | 123 | ## Utilities 124 | 125 | ### Endpoint: /utils/page-count/pdf/ 126 | 127 | This method takes a document and returns the page count. 128 | 129 | curl 'http://localhost:5050/utils/page-count/pdf/' \ 130 | -X 'POST' \ 131 | -F "file=@doctor/test_assets/image-pdf.pdf" 132 | 133 | This will return an HTTP response with page count. In the above example it would return __2__. 134 | 135 | ### Endpoint: /utils/check-redactions/pdf/ 136 | 137 | This method takes a document and returns the bounding boxes of bad 138 | redactions as well as any discovered text. 139 | 140 | curl 'http://localhost:5050/utils/check-redactions/pdf/' \ 141 | -X 'POST' \ 142 | -F "file=@doctor/test_assets/x-ray/rectangles_yes.pdf" 143 | 144 | returns a JSON response with bounding box(es) and text recovered.
145 | ``` 146 | { 147 | "error": false, 148 | "results": { 149 | "1": [ 150 | { 151 | "bbox": [ 152 | 412.54998779296875, 153 | 480.6099853515625, 154 | 437.8699951171875, 155 | 494.39996337890625 156 | ], 157 | "text": "“No”" 158 | }, 159 | { 160 | "bbox": [ 161 | 273.3500061035156, 162 | 315, 163 | 536.8599853515625, 164 | 328.79998779296875 165 | ], 166 | "text": "“Yes”, but did not disclose all relevant medical history" 167 | }, 168 | { 169 | "bbox": [ 170 | 141.22999572753906, 171 | 232.20001220703125, 172 | 166.54998779296875, 173 | 246 174 | ], 175 | "text": "“No”" 176 | } 177 | ] 178 | } 179 | } 180 | ``` 181 | 182 | The "error" field is set if there was an issue processing the PDF. 183 | 184 | If "results" is empty there were no bad redactions found otherwise it 185 | is a list of bounding box along with the text recovered. 186 | 187 | See: https://github.com/freelawproject/x-ray/#readme 188 | 189 | ### Endpoint: /utils/mime-type/ 190 | 191 | This method takes a document and returns the mime type. 192 | 193 | curl 'http://localhost:5050/utils/mime-type/?mime=False' \ 194 | -X 'POST' \ 195 | -F "file=@doctor/test_assets/image-pdf.pdf" 196 | 197 | returns as JSON response identifying the document type 198 | 199 | {"mimetype": "PDF document, version 1.3"} 200 | 201 | and 202 | 203 | curl 'http://localhost:5050/utils/mime-type/?mime=True' \ 204 | -X 'POST' \ 205 | -F "file=@doctor/test_assets/image-pdf.pdf" 206 | 207 | returns as JSON response identifying the document type 208 | 209 | {"mimetype": "application/pdf"} 210 | 211 | Another example 212 | 213 | curl 'http://localhost:5050/utils/mime-type/?mime=True' \ 214 | -X 'POST' \ 215 | -F "file=@doctor/test_assets/word-doc.doc" 216 | 217 | returns 218 | 219 | {"mimetype": "application/msword"} 220 | 221 | This method is useful for identifying the type of document, incorrect documents and weird documents. 
222 | 223 | ### Endpoint: /utils/add/text/pdf/ 224 | 225 | This method will take an image PDF and return the PDF with transparent text overlayed on the document. 226 | This allows users to copy and paste (more or less) from our OCRd text. 227 | 228 | curl 'http://localhost:5050/utils/add/text/pdf/' \ 229 | -X 'POST' \ 230 | -F "file=@doctor/test_assets/image-pdf.pdf" \ 231 | -o image-pdf-with-embedded-text.pdf 232 | 233 | ### Endpoint: /utils/audio/duration/ 234 | 235 | This endpoint returns the duration of an MP3 file. 236 | 237 | curl 'http://localhost:5050/utils/audio/duration/' \ 238 | -X 'POST' \ 239 | -F "file=@doctor/test_assets/1.mp3" 240 | 241 | ### Endpoint: /utils/document-number/pdf/ 242 | 243 | This method takes a document from the federal filing system and returns its document entry number. 244 | 245 | curl 'http://localhost:5050/utils/document-number/pdf/' \ 246 | -X 'POST' \ 247 | -F "file=@doctor/test_assets/recap_documents/ca2_1-1.pdf" 248 | 249 | This will return an HTTP response with the document number. In the above example it would return __1-1__. 250 | 251 | 252 | ## Converters 253 | 254 | ### Endpoint: /convert/image/pdf/ 255 | 256 | Given an image of indeterminate length, this endpoint will convert it to a pdf with reasonable page breaks. This is meant for extremely long images that represent multi-page documents, but can be used to convert a smaller image to a one-page PDF. 257 | 258 | curl 'http://localhost:5050/convert/image/pdf/' \ 259 | -X 'POST' \ 260 | -F "file=@doctor/test_assets/long-image.tiff" \ 261 | --output test-image-to-pdf.pdf 262 | 263 | Keep in mind that this curl will write the file to the current directory. 264 | 265 | ### Endpoint: /convert/images/pdf/ 266 | 267 | Given a list of urls for images, this endpoint will convert them to a pdf. This can be used to convert multiple images to a multi-page PDF. We use this to convert financial disclosure images to simple PDFs. 
268 | 269 | curl 'http://localhost:5050/convert/images/pdf/?sorted_urls=%5B%22https%3A%2F%2Fcom-courtlistener-storage.s3-us-west-2.amazonaws.com%2Ffinancial-disclosures%2F2011%2FA-E%2FArmstrong-SB%2520J3.%252009.%2520CAN_R_11%2FArmstrong-SB%2520J3.%252009.%2520CAN_R_11_Page_1.tiff%22%2C+%22https%3A%2F%2Fcom-courtlistener-storage.s3-us-west-2.amazonaws.com%2Ffinancial-disclosures%2F2011%2FA-E%2FArmstrong-SB%2520J3.%252009.%2520CAN_R_11%2FArmstrong-SB%2520J3.%252009.%2520CAN_R_11_Page_2.tiff%22%5D' \ 270 | -X POST \ 271 | -o image.pdf 272 | 273 | This returns the binary data of the pdf. 274 | 275 | 276 | ### Endpoint: /convert/pdf/thumbnail/ 277 | 278 | Thumbnail takes a pdf and returns a png thumbnail of the first page. 279 | 280 | curl 'http://localhost:5050/convert/pdf/thumbnail/' \ 281 | -X 'POST' \ 282 | -F "file=@doctor/test_assets/image-pdf.pdf" \ 283 | -o test-thumbnail.png 284 | 285 | This returns the binary data of the thumbnail. 286 | 287 | Keep in mind that this curl will also write the file to the current directory. 288 | 289 | ### Endpoint: /convert/pdf/thumbnails/ 290 | 291 | Given a PDF and a range of pages, this endpoint will return a zip file containing thumbnails 292 | for each page requested. This endpoint also takes an optional parameter called max_dimension, 293 | this property scales the long side of each thumbnail (width for landscape pages, height for 294 | portrait pages) to fit in the specified number of pixels. 295 | 296 | For example if you want thumbnails for the first four pages: 297 | 298 | curl 'http://localhost:5050/convert/pdf/thumbnails/' \ 299 | -X 'POST' \ 300 | -F "file=@doctor/test_assets/vector-pdf.pdf" \ 301 | -F 'pages="[1,2,3,4]"' \ 302 | -F 'max_dimension=350' \ 303 | -o thumbnails.zip 304 | 305 | This will return four thumbnails in a zip file. 306 | 307 | ### Endpoint: /convert/audio/mp3/ 308 | 309 | This endpoint takes an audio file and converts it to an MP3 file.
This is used to convert different audio formats 310 | from courts across the country and standardizes the format for our end users. 311 | 312 | This endpoint also adds the SEAL of the court to the MP3 file and updates the metadata to reflect our updates. 313 | 314 | curl 'http://localhost:5050/convert/audio/mp3/?audio_data=%7B%22court_full_name%22%3A+%22Testing+Supreme+Court%22%2C+%22court_short_name%22%3A+%22Testing+Supreme+Court%22%2C+%22court_pk%22%3A+%22test%22%2C+%22court_url%22%3A+%22http%3A%2F%2Fwww.example.com%2F%22%2C+%22docket_number%22%3A+%22docket+number+1+005%22%2C+%22date_argued%22%3A+%222020-01-01%22%2C+%22date_argued_year%22%3A+%222020%22%2C+%22case_name%22%3A+%22SEC+v.+Frank+J.+Custable%2C+Jr.%22%2C+%22case_name_full%22%3A+%22case+name+full%22%2C+%22case_name_short%22%3A+%22short%22%2C+%22download_url%22%3A+%22http%3A%2F%2Fmedia.ca7.uscourts.gov%2Fsound%2Fexternal%2Fgw.15-1442.15-1442_07_08_2015.mp3%22%7D' \ 315 | -X 'POST' \ 316 | -F "file=@doctor/test_assets/1.wma" 317 | 318 | This returns the audio file as a file response. 319 | 320 | ### Endpoint: /convert/audio/ogg/ 321 | 322 | This endpoint takes an audio file and converts it to an OGG file. The conversion process downsizes files by using 323 | a single audio channel and fixing the sampling rate to 8 kHz. 324 | 325 | This endpoint also optimizes the output for voice over IP applications. 326 | 327 | curl 'http://localhost:5050/convert/audio/ogg/' \ 328 | -X 'POST' \ 329 | -F "file=@doctor/test_assets/1.wma" 330 | 331 | This returns the audio file as a file response. 332 | 333 | 334 | ## Testing 335 | 336 | Testing is designed to be run with the `compose.yaml` file. To see more about testing 337 | checkout the DEVELOPING.md file. 338 | 339 | ## Sentry Logging 340 | 341 | For debugging purposes, it's possible to set your Sentry DSN to send events to Sentry. 342 | By default, no SENTRY_DSN is set and no events will be sent to Sentry. 
343 | To use Sentry set the SENTRY_DSN environment variable to your DSN. Using Docker you can set it with: 344 | 345 | docker run -d -p 5050:5050 -e SENTRY_DSN= freelawproject/doctor:latest 346 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | You can find our VDP here: https://free.law/vulnerability-disclosure-policy/ 2 | -------------------------------------------------------------------------------- /compose.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | doctor: 3 | container_name: doctor 4 | build: 5 | dockerfile: docker/Dockerfile 6 | context: . 7 | args: 8 | options: --reload 9 | image: freelawproject/doctor:latest 10 | ports: 11 | - 5050:5050 12 | volumes: 13 | - .:/opt/app 14 | env_file: 15 | - .env.dev 16 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | # Note: Force M1 to emulate amd64 2 | FROM --platform=linux/amd64 python:3.10 3 | 4 | # Install uv 5 | # https://docs.astral.sh/uv/guides/integration/docker/#installing-uv 6 | COPY --from=ghcr.io/astral-sh/uv:0.7 /uv /uvx /bin/ 7 | 8 | # Install apt dependencies 9 | # caching: https://docs.docker.com/build/cache/optimize/#use-cache-mounts 10 | RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ 11 | --mount=type=cache,target=/var/lib/apt,sharing=locked \ 12 | apt-get update --quiet=2 && \ 13 | apt-get install -y --no-install-recommends apt-utils && \ 14 | apt-get install -y \ 15 | build-essential \ 16 | curl \ 17 | libjpeg-dev \ 18 | libleptonica-dev \ 19 | libtesseract-dev \ 20 | libz-dev \ 21 | poppler-utils \ 22 | qpdf \ 23 | tesseract-ocr \ 24 | && \ 25 | apt-get install \ 26 | --no-install-recommends \ 27 | --assume-yes \ 28 | --quiet=2 \ 29 | `# Document extraction and 
OCR tools` \ 30 | antiword \ 31 | docx2txt \ 32 | ghostscript \ 33 | libwpd-tools \ 34 | `# Audio extraction/manipulation tools` \ 35 | ffmpeg \ 36 | libmagic1 \ 37 | `# Image & OCR tools` \ 38 | imagemagick \ 39 | `# Other dependencies` \ 40 | libffi-dev \ 41 | libxml2-dev \ 42 | libxslt-dev 43 | 44 | # set environment variables 45 | ENV PYTHONDONTWRITEBYTECODE=1 \ 46 | PYTHONUNBUFFERED=1 \ 47 | # Disable tesseract multithreading for more scalable performance and 48 | # faster overall performance 49 | OMP_THREAD_LIMIT=1 50 | 51 | WORKDIR /code 52 | 53 | # Install Python dependencies 54 | COPY pyproject.toml uv.lock . 55 | # https://docs.astral.sh/uv/guides/integration/docker/#caching 56 | ENV UV_COMPILE_BYTECODE=1 \ 57 | UV_LINK_MODE=copy \ 58 | UV_PROJECT_ENVIRONMENT=/venv \ 59 | PATH="/venv/bin:$PATH" 60 | RUN --mount=type=cache,target=/root/.cache/uv \ 61 | uv sync 62 | 63 | COPY . . 64 | 65 | EXPOSE 5050 66 | 67 | ARG options 68 | ENV OPTIONS $options 69 | 70 | CMD gunicorn $OPTIONS doctor.wsgi:application \ 71 | --workers ${DOCTOR_WORKERS:-1} \ 72 | --max-requests 1000 \ 73 | --max-requests-jitter 100 \ 74 | --timeout 5400 \ 75 | --bind 0.0.0.0:5050 76 | -------------------------------------------------------------------------------- /docker/Makefile: -------------------------------------------------------------------------------- 1 | # Run with make push --file docker/Makefile -e VERSION=$(git rev-parse --short HEAD) 2 | # Note that makefiles differentiate between tabs and spaces in a weird way! 3 | 4 | # Ensure VERSION is set. 5 | ifndef VERSION 6 | $(error VERSION variable is not set. Use -e VERSION=XYZ to proceed.) 
7 | endif 8 | 9 | DOCKER_REPOSITORY ?= freelawproject/doctor 10 | 11 | DOCKER ?= docker 12 | export DOCKER 13 | 14 | .PHONY: all image push multiarch_push multiarch_image 15 | 16 | UNAME := $(shell uname -m) 17 | 18 | all: image 19 | 20 | image: 21 | $(DOCKER) build -t $(DOCKER_REPOSITORY):$(VERSION) -t $(DOCKER_REPOSITORY):latest --file docker/Dockerfile . 22 | 23 | push: image 24 | $(info Checking if valid architecture) 25 | @if [ $(UNAME) = "x86_64" ]; then \ 26 | echo "Architecture is OK. Pushing.";\ 27 | $(DOCKER) push $(DOCKER_REPOSITORY):$(VERSION);\ 28 | $(DOCKER) push $(DOCKER_REPOSITORY):latest;\ 29 | else \ 30 | echo "Only arm64 machines can push single-architecture builds. If you want to \ 31 | push a build, try 'make multiarch_push', which builds for both arm64 and amd64. This \ 32 | protects against arm64 builds being accidentally deployed to the server (which uses arm64).";\ 33 | fi 34 | 35 | multiarch_image: 36 | export DOCKER_CLI_EXPERIMENTAL=enabled 37 | $(DOCKER) buildx rm 38 | $(DOCKER) buildx create --use --name flp-builder 39 | $(DOCKER) buildx build --platform linux/amd64,linux/arm64 -t $(DOCKER_REPOSITORY):latest -t $(DOCKER_REPOSITORY):$(VERSION) --file docker/Dockerfile . 40 | 41 | multiarch_push: multiarch_image 42 | $(DOCKER) buildx build --push --platform linux/amd64,linux/arm64 -t $(DOCKER_REPOSITORY):latest -t $(DOCKER_REPOSITORY):$(VERSION) --file docker/Dockerfile . 43 | 44 | x86_push: 45 | export DOCKER_CLI_EXPERIMENTAL=enabled 46 | $(DOCKER) buildx rm 47 | $(DOCKER) buildx create --use --name flp-builder 48 | $(DOCKER) buildx build --push --platform linux/amd64 -t $(DOCKER_REPOSITORY):latest -t $(DOCKER_REPOSITORY):$(VERSION) --file docker/Dockerfile . 
49 | -------------------------------------------------------------------------------- /doctor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freelawproject/doctor/94c2ec669d033b0f54b6477b0e07882189e6e474/doctor/__init__.py -------------------------------------------------------------------------------- /doctor/assets/producer-300x300.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freelawproject/doctor/94c2ec669d033b0f54b6477b0e07882189e6e474/doctor/assets/producer-300x300.png -------------------------------------------------------------------------------- /doctor/forms.py: -------------------------------------------------------------------------------- 1 | import json 2 | import tempfile 3 | import uuid 4 | 5 | from django import forms 6 | from django.core.exceptions import ValidationError 7 | from django.core.validators import FileExtensionValidator 8 | 9 | 10 | class BaseAudioFile(forms.Form): 11 | file = forms.FileField(label="document", required=True) 12 | 13 | 14 | class BaseFileForm(forms.Form): 15 | """""" 16 | 17 | file = forms.FileField(label="document", required=True) 18 | 19 | def temp_save_file(self, fp): 20 | with open(fp, "wb") as f: 21 | for chunk in self.cleaned_data["file"].chunks(): 22 | f.write(chunk) 23 | 24 | def clean_file(self): 25 | file = self.cleaned_data.get("file", False) 26 | if not file: 27 | raise ValidationError("File is missing.") 28 | self.cleaned_data["extension"] = file.name.split(".")[-1] 29 | self.cleaned_data["original_filename"] = file.name 30 | self.prep_file() 31 | return file 32 | 33 | def prep_file(self): 34 | with tempfile.NamedTemporaryFile( 35 | delete=False, suffix=f".{self.cleaned_data['extension']}" 36 | ) as fp: 37 | self.cleaned_data["tmp_dir"] = tempfile.TemporaryDirectory() 38 | self.cleaned_data["fp"] = fp.name 39 | self.temp_save_file(fp.name) 40 | 41 | 42 | 
class AudioForm(BaseAudioFile): 43 | """""" 44 | 45 | audio_data = forms.JSONField(label="audio-data", required=False) 46 | 47 | def clean(self): 48 | self.cleaned_data["fp"] = f"/tmp/audio_{uuid.uuid4().hex}" 49 | if self.cleaned_data.get("file", None): 50 | filename = self.cleaned_data["file"].name 51 | self.cleaned_data["extension"] = filename.split(".")[-1] 52 | return self.cleaned_data 53 | 54 | 55 | class ImagePdfForm(forms.Form): 56 | sorted_urls = forms.CharField(required=True, label="sorted-urls") 57 | 58 | def clean(self): 59 | self.cleaned_data["sorted_urls"] = json.loads( 60 | self.cleaned_data["sorted_urls"] 61 | ) 62 | return self.cleaned_data 63 | 64 | 65 | class MimeForm(forms.Form): 66 | file = forms.FileField(label="document", required=False) 67 | mime = forms.BooleanField(label="mime", required=False) 68 | 69 | def clean(self): 70 | file = self.cleaned_data.get("file", False) 71 | if not file: 72 | raise ValidationError("File is missing.") 73 | 74 | self.cleaned_data["filename"] = "unknown" 75 | 76 | 77 | class ThumbnailForm(forms.Form): 78 | file = forms.FileField( 79 | label="document", 80 | required=True, 81 | validators=[FileExtensionValidator(["pdf"])], 82 | ) 83 | max_dimension = forms.IntegerField(label="max-dimension", required=False) 84 | pages = forms.Field(label="pages", required=False) 85 | 86 | def clean(self): 87 | """""" 88 | if self.cleaned_data.get("pages"): 89 | self.cleaned_data["pages"] = json.loads(self.cleaned_data["pages"]) 90 | 91 | if not self.cleaned_data["max_dimension"]: 92 | self.cleaned_data["max_dimension"] = 350 93 | return self.cleaned_data 94 | 95 | 96 | class DocumentForm(BaseFileForm): 97 | ocr_available = forms.BooleanField(label="ocr-available", required=False) 98 | mime = forms.BooleanField(label="mime", required=False) 99 | strip_margin = forms.BooleanField(label="strip-margin", required=False) 100 | 101 | def clean(self): 102 | self.clean_file() 103 | return self.cleaned_data 104 | 
-------------------------------------------------------------------------------- /doctor/lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freelawproject/doctor/94c2ec669d033b0f54b6477b0e07882189e6e474/doctor/lib/__init__.py -------------------------------------------------------------------------------- /doctor/lib/mojibake.py: -------------------------------------------------------------------------------- 1 | from django.utils.encoding import smart_str 2 | 3 | 4 | def fix_mojibake(text): 5 | """Given corrupt text from pdffactory, converts it to sane text.""" 6 | 7 | letter_map = { 8 | "¿": "a", 9 | "¾": "b", 10 | "½": "c", 11 | "¼": "d", 12 | "»": "e", 13 | "º": "f", 14 | "¹": "g", 15 | "¸": "h", 16 | "·": "i", 17 | "¶": "j", 18 | "μ": "k", 19 | "´": "l", 20 | "³": "m", 21 | "²": "n", 22 | "±": "o", 23 | "°": "p", 24 | "¯": "q", 25 | "®": "r", 26 | "-": "s", 27 | "¬": "t", 28 | "«": "u", 29 | "ª": "v", 30 | "©": "w", 31 | "¨": "x", 32 | "§": "y", 33 | "¦": "z", 34 | "ß": "A", 35 | "Þ": "B", 36 | "Ý": "C", 37 | "Ü": "D", 38 | "Û": "E", 39 | "Ú": "F", 40 | "Ù": "G", 41 | "Ø": "H", 42 | "×": "I", 43 | "Ö": "J", 44 | "Õ": "K", 45 | "Ô": "L", 46 | "Ó": "M", 47 | "Ò": "N", 48 | "Ñ": "O", 49 | "Ð": "P", 50 | "Î": "R", 51 | "Í": "S", 52 | "Ì": "T", 53 | "Ë": "U", 54 | "Ê": "V", 55 | "É": "W", 56 | "": "X", # Missing 57 | "Ç": "Y", 58 | "Æ": "Z", 59 | "ð": "0", 60 | "ï": "1", 61 | "î": "2", 62 | "í": "3", 63 | "ì": "4", 64 | "ë": "5", 65 | "ê": "6", 66 | "é": "7", 67 | "è": "8", 68 | "ç": "9", 69 | "ò": ".", 70 | "ô": ",", 71 | "æ": ":", 72 | "å": ";", 73 | "Ž": "'", 74 | "•": "'", # s/b double quote, but identical to single. 75 | "Œ": "'", # s/b double quote, but identical to single. 
76 | "ó": "-", # dash 77 | "Š": "-", # n-dash 78 | "‰": "--", # em-dash 79 | "ú": "&", 80 | "ö": "*", 81 | "ñ": "/", 82 | "÷": ")", 83 | "ø": "(", 84 | "Å": "[", 85 | "Ã": "]", 86 | "‹": "•", 87 | } 88 | 89 | plaintext = "" 90 | for letter in text: 91 | try: 92 | plaintext += letter_map[letter] 93 | except KeyError: 94 | try: 95 | plaintext += smart_str(letter) 96 | except UnicodeEncodeError: 97 | continue 98 | 99 | return plaintext 100 | -------------------------------------------------------------------------------- /doctor/lib/text_extraction.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import pandas as pd 4 | import pdfplumber 5 | import pytesseract 6 | from pdfplumber.ctm import CTM 7 | from PIL import Image 8 | from pytesseract import Output 9 | 10 | 11 | def is_skewed(obj: dict) -> bool: 12 | """Check if a PDF plumber dict is skewed 13 | 14 | CTM stands for current transformation matrix. 15 | Pdf plumber has a method to calculate the angle of text which we use here 16 | 17 | Traditionally this is only seen in circular stamps which confuses the 18 | content, or in perpendicular text of the ninth circuit courts which also 19 | confuses the text. 
20 | 21 | :param obj: dictionary from pdfplumber for each word 22 | :return: if the text should be returned 23 | """ 24 | if (matrix := obj.get("matrix")) is None: 25 | return True 26 | 27 | # Remove Skew 28 | my_char_ctm = CTM(*matrix) 29 | return my_char_ctm.skew_x == 0 30 | 31 | 32 | def get_page_text(page: pdfplumber.PDF.pages, strip_margin: bool) -> str: 33 | """Extract page text 34 | 35 | Using pdf plumber extract out the text of the document that is not 36 | skewed (ie a stamp of approval) and extract out text removing blue text 37 | 38 | Strip margin refers only to top and bottom margin here 39 | 40 | :param page: PdfPlumber page 41 | :param strip_margin: a flag to crop out the margin of a document and skewed content 42 | :return: Text from the pdf plumber page 43 | """ 44 | _, _, width, height = page.bbox 45 | if strip_margin and (height > width): 46 | # Crop margins and remove skewed text 47 | pixels_per_inch = width / 8.5 48 | bbox = ( 49 | 0, 50 | pixels_per_inch * 1, # 1 inch down from top 51 | width, # 52 | pixels_per_inch * 10, # 10 inches from top (1 inch from bottom) 53 | ) 54 | page_text = ( 55 | page.crop(bbox) 56 | .filter(is_skewed) 57 | .extract_text( 58 | layout=True, 59 | keep_blank_chars=True, 60 | y_tolerance=5, 61 | y_density=25, 62 | ) 63 | ) 64 | else: 65 | page_text = page.extract_text( 66 | layout=True, keep_blank_chars=True, y_tolerance=5, y_density=25 67 | ) 68 | page_text = remove_excess_whitespace(page_text) 69 | return page_text 70 | 71 | 72 | def has_images(page: pdfplumber.pdf.Page) -> bool: 73 | """Does the page have images that are large enough to contain text 74 | 75 | :param page: pdf plumber page 76 | :return: True if page contains images of a certain size 77 | """ 78 | return any( 79 | image 80 | for image in page.images 81 | if image["width"] > 10 and image["height"] > 10 82 | ) 83 | 84 | 85 | def has_text_annotations(page: pdfplumber.pdf.Page) -> bool: 86 | """Does the page have annotations which could contain text 87 | 
88 | :param page: pdf plumber 89 | :return: if page has annotations 90 | """ 91 | if page.annots: 92 | anno_types = [ 93 | str(annot.get("data").get("Subtype")) for annot in page.annots 94 | ] 95 | if "/'FreeText'" in anno_types or "/'Widget'" in anno_types: 96 | return True 97 | return False 98 | 99 | 100 | def adjust_caption_lines(page_text: str) -> str: 101 | """Adjust the alignment of ) or : or § used to align content 102 | 103 | § is used in texas courts 104 | : is used in NY courts 105 | ) is used in many courts 106 | 107 | :param page_text: The text of the first page 108 | :return: The page text 109 | """ 110 | for separator in [r")", "§", ":"]: 111 | pattern = rf"(.* +{re.escape(separator)} .*\n)" 112 | matches = list(re.finditer(pattern, page_text)) 113 | central_matches = [ 114 | match.group().rindex(separator) 115 | for match in matches 116 | if 30 <= match.group().rindex(separator) <= 70 117 | ] 118 | if len(central_matches) < 3: 119 | continue # Skip this separator if less than 3 matches found 120 | # Determine the longest position of the separator 121 | longest = max(central_matches) 122 | page = [] 123 | for row in page_text.splitlines(): 124 | index = row.find(f" {separator}") 125 | addition = (longest - index) * " " 126 | row = row.replace(f" {separator}", f"{addition}{separator}") 127 | page.append(row) 128 | return "\n".join(page) 129 | return page_text 130 | 131 | 132 | def page_needs_ocr(page: pdfplumber.pdf.Page, page_text: str) -> bool: 133 | """Does the page need OCR 134 | 135 | :param page:Pdf Plumber Page 136 | :param page_text: context extracted from page 137 | :return: does page need OCR 138 | """ 139 | return ( 140 | page_text.strip() == "" 141 | or "(cid:" in page_text 142 | or has_text_annotations(page) 143 | or has_images(page) 144 | or len(page.curves) > 10 145 | ) 146 | 147 | 148 | def convert_pdf_page_to_image( 149 | page: pdfplumber.pdf.Page, strip_margin: bool 150 | ) -> Image: 151 | """Convert page to image and crop margin if 
applicable 152 | 153 | :param page: the pdf page 154 | :param strip_margin: whether to crop the margin 155 | :return: The cropped page image 156 | """ 157 | img = page.to_image(resolution=300) 158 | _, _, w, h = page.bbox 159 | width = w * img.scale 160 | 161 | if strip_margin: 162 | pixels_per_inch = width / 8.5 163 | bbox = ( 164 | pixels_per_inch * 0.5, # .5" from left edge 165 | pixels_per_inch * 0.5, # .5" down from top 166 | pixels_per_inch * 8, # 8" from left edge (.5" from right) 167 | pixels_per_inch * 10.5, # 10.5" from top (.5" from bottom) 168 | ) 169 | image = img.original.crop(bbox) 170 | else: 171 | image = img.original 172 | return image 173 | 174 | 175 | def ocr_image_to_data(image: Image) -> list[pd.DataFrame]: 176 | """Perform OCR on an image to extract data 177 | 178 | Convert the image of the pdf page to OCR data 179 | :param image: Pil Image 180 | :return: A list of DataFrames, each containing OCR data for a block of text 181 | """ 182 | 183 | # Detailed Parameters for `pytesseract.image_to_data`: 184 | # - config: str 185 | # Additional Tesseract configuration options. 186 | # - `-c preserve_interword_spaces=1`: Preserve spaces between words as they appear in the image. 187 | # - `-c tessedit_do_invert=0`: Do not invert the image colors. 188 | # - `--psm 6`: Page segmentation mode 6, which assumes a single uniform block of text. 189 | # - `-l eng`: Use the English language for OCR. 190 | # - output_type: pytesseract.Output.DICT 191 | # Specifies that the output should be a dictionary of OCR data. 
192 | # 193 | # Reference: 194 | # Tesseract OCR documentation: https://github.com/tesseract-ocr/tesseract/blob/master/doc/tesseract.1.asc 195 | 196 | data_dict = pytesseract.image_to_data( 197 | image, 198 | config="-c preserve_interword_spaces=1x1 -c tessedit_do_invert=0 --psm 6 -l eng", 199 | output_type=Output.DICT, 200 | ) 201 | df = pd.DataFrame(data_dict) 202 | filtered_data = df[(df.conf != -1)] 203 | block_ids = ( 204 | filtered_data.groupby("block_num") 205 | .first() 206 | .sort_values("top") 207 | .index.tolist() 208 | ) 209 | blocks = [ 210 | filtered_data[filtered_data["block_num"] == block] 211 | for block in block_ids 212 | ] 213 | return blocks 214 | 215 | 216 | def extract_with_ocr(page: pdfplumber.pdf.Page, strip_margin: bool) -> str: 217 | """Extract the page using OCR 218 | 219 | :param page:Pdf Plumber Page 220 | :param strip_margin: If we should trim the margins 221 | :return: The extracted content for the page 222 | """ 223 | 224 | image = convert_pdf_page_to_image(page, strip_margin) 225 | data = ocr_image_to_data(image) 226 | content = "" 227 | prev = {} 228 | for words in data: 229 | for _, word in words.iterrows(): 230 | content = insert_whitespace(content, word, prev) 231 | content += get_word(word, image.size[0], strip_margin) 232 | prev = word 233 | content = cleanup_content(content, page.page_number) 234 | return content 235 | 236 | 237 | def insert_whitespace(content: str, word: dict, prev: dict) -> str: 238 | """Insert whitespace after or before word 239 | 240 | :param content: The text extracted so far 241 | :param word: The OCR extraction object 242 | :param prev: The previous word object extracted 243 | :return: The content with the whitespace appended 244 | """ 245 | is_new_line = prev.get("line_num", 0) != word["line_num"] 246 | is_new_par = prev.get("par_num", 0) != word["par_num"] 247 | prev_end = prev.get("left", 1) + prev.get("width", 1) 248 | 249 | # Add vertical whitespace 250 | if is_new_line or is_new_par: 251 | 
vertical_gap = word["top"] - ( 252 | prev.get("top", 0) + prev.get("height", 0) 253 | ) 254 | content += "\n\n" if vertical_gap > 100 else "\n" 255 | prev_end = 0 256 | 257 | # add horizontal whitespace 258 | content += " " * int((word["left"] - prev_end) / 25) 259 | return content 260 | 261 | 262 | def get_word(word_dict: dict, width: float, strip_margin: bool) -> str: 263 | """Append word to content 264 | 265 | This function determines if a word should be added to the page content 266 | and adds the word. 267 | 268 | :param word_dict: the word object from tesseract 269 | :param width: The width of the document 270 | :param strip_margin: should we strip the margin 271 | :return: The text with space 272 | """ 273 | pixels_per_inch = width / 8.5 274 | if strip_margin: 275 | left_margin = 1 * pixels_per_inch # 276 | right_margin = 7.5 * pixels_per_inch 277 | else: 278 | left_margin = 0.5 * pixels_per_inch 279 | right_margin = 8.0 * pixels_per_inch 280 | 281 | # tesseract provides confidence values for its OCR outputs. We use those 282 | # confidence values to determine if something is a good OCR output, a 283 | # likely artifact and should be excluded or is bad ocr but not an artifact. 284 | 285 | word = word_dict["text"] 286 | conf = word_dict["conf"] 287 | 288 | no_confidence = 0 289 | very_low_confidence = 5 290 | low_confidence = 40 291 | short_word_len = 3 292 | long_word_len = 20 293 | if ( 294 | word_dict["left"] + word_dict["width"] < left_margin 295 | and conf < low_confidence 296 | ): 297 | # If a word has confidence below 40, a number that usually equates to 3 to 5 298 | # standard deviations from confidences found in other words is entirely in the 299 | # margin of the page - its likely an artifact as well. 
300 | word = " " * len(word) 301 | elif (conf == no_confidence and len(word) <= short_word_len) or word_dict[ 302 | "left" 303 | ] == 0: 304 | # If a word has a zero confidence or starts on the left most edge of the paper 305 | # we return it as an empty string. It is likely an artifact. 306 | word = " " * len(word) 307 | elif conf < very_low_confidence and ( 308 | len(word) <= short_word_len or len(word) > long_word_len 309 | ): 310 | # If a confidence is below 5 - for a very short word - or for a very long word 311 | # its likely part of the document but we have no idea so we return a square 312 | # box to indicate that. This is often caused by stamps or lines in case captions 313 | word = "□" * len(word) 314 | elif conf < low_confidence and word_dict["left"] > right_margin: 315 | # Finally if a low confidence word starts in the right margin - its likely a 316 | # bad OCR that is multiple standard deviations away so we return the word as 317 | # empty squares. 318 | word = "□" * len(word) 319 | 320 | return f"{word} " 321 | 322 | 323 | def cleanup_content(content: str, page_number: int) -> str: 324 | """Reduce legal document line clutter 325 | 326 | This function performs several operations to clean up the text extracted from legal documents: 327 | 328 | 1. On the first page, it smooths out vertical lines if they are detected. 329 | 2. It removes pipes ('|') that might start a line repeatedly. 330 | 3. It removes artifacts that appear at the end of a line of text, specifically single characters 331 | following at least 10 whitespace characters, reducing right margin edge artifacts. 332 | 4. It removes excess left margin whitespace to improve readability and formatting. 
333 | 334 | Example: 335 | If the pipes below represent the page edge (not characters): 336 | | we can remove the 337 | | the left whitespace 338 | | and shift this entire 339 | | page over four characters 340 | | which keeps formatting and 341 | | makes the text easier to 342 | | read and process with the API. 343 | 344 | :param content: the page content extracted 345 | :param page_number: the page number 346 | :return: the cleaned up text 347 | """ 348 | # remove floating pipes 349 | pattern = r"\s{4,}\| $" 350 | # Substitute the matched pipe with an empty string 351 | content = re.sub(pattern, "", content, flags=re.MULTILINE) 352 | 353 | # remove floating artifacts from the right side 354 | pattern = r"\s{10,}[a-zA-Z0-9|] $" 355 | content = re.sub(pattern, "", content, flags=re.MULTILINE) 356 | 357 | # shift text left if possible and remove excess start and end whitespace 358 | content = remove_excess_whitespace(content) 359 | if page_number == 1: 360 | content = adjust_caption_lines(content) 361 | 362 | return f"{content}\n" 363 | 364 | 365 | def remove_excess_whitespace(document: str) -> str: 366 | """Remove excess whitespace from OCR 367 | 368 | This function removes empty lines of text at the start and end of a document 369 | and shifts the page left if possible 370 | 371 | :param document: text of the document 372 | :return: Document with excess whitespace removed 373 | """ 374 | m = re.findall(r"(^ +)", document, re.MULTILINE) 375 | if m: 376 | shift_left = len(min(m)) 377 | pattern = f"(^ {{{shift_left}}})" 378 | document = re.sub(pattern, "", document, flags=re.MULTILINE) 379 | document = re.sub(r"^ +$", "", document, flags=re.MULTILINE) 380 | return document.strip("\n") 381 | -------------------------------------------------------------------------------- /doctor/lib/utils.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import io 3 | import logging 4 | import os 5 | import re 6 | import 
subprocess 7 | import warnings 8 | from collections import namedtuple 9 | from decimal import Decimal 10 | from pathlib import Path 11 | from typing import Any 12 | 13 | import six 14 | from PyPDF2 import PdfMerger 15 | from reportlab.pdfgen import canvas 16 | 17 | 18 | class DoctorUnicodeDecodeError(UnicodeDecodeError): 19 | def __init__(self, obj, *args): 20 | self.obj = obj 21 | UnicodeDecodeError.__init__(self, *args) 22 | 23 | def __str__(self): 24 | original = UnicodeDecodeError.__str__(self) 25 | return f"{original}. You passed in {self.obj!r} ({type(self.obj)})" 26 | 27 | 28 | def force_bytes(s, encoding="utf-8", strings_only=False, errors="strict"): 29 | """ 30 | Similar to smart_bytes, except that lazy instances are resolved to 31 | strings, rather than kept as lazy objects. 32 | 33 | If strings_only is True, don't convert (some) non-string-like objects. 34 | """ 35 | # Handle the common case first for performance reasons. 36 | if isinstance(s, bytes): 37 | if encoding == "utf-8": 38 | return s 39 | else: 40 | return s.decode("utf-8", errors).encode(encoding, errors) 41 | if strings_only and is_protected_type(s): 42 | return s 43 | if isinstance(s, six.memoryview): 44 | return bytes(s) 45 | if isinstance(s, Promise): 46 | return six.text_type(s).encode(encoding, errors) 47 | if not isinstance(s, six.string_types): 48 | try: 49 | if six.PY3: 50 | return six.text_type(s).encode(encoding) 51 | else: 52 | return bytes(s) 53 | except UnicodeEncodeError: 54 | if isinstance(s, Exception): 55 | # An Exception subclass containing non-ASCII data that doesn't 56 | # know how to print itself properly. We shouldn't raise a 57 | # further exception. 
58 | return b" ".join( 59 | force_bytes(arg, encoding, strings_only, errors) 60 | for arg in s 61 | ) 62 | return six.text_type(s).encode(encoding, errors) 63 | else: 64 | return s.encode(encoding, errors) 65 | 66 | 67 | def force_text(s, encoding="utf-8", strings_only=False, errors="strict"): 68 | """ 69 | Similar to smart_text, except that lazy instances are resolved to 70 | strings, rather than kept as lazy objects. 71 | 72 | If strings_only is True, don't convert (some) non-string-like objects. 73 | """ 74 | # Handle the common case first for performance reasons. 75 | if issubclass(type(s), six.text_type): 76 | return s 77 | if strings_only and is_protected_type(s): 78 | return s 79 | try: 80 | if not issubclass(type(s), six.string_types): 81 | if six.PY3: 82 | if isinstance(s, bytes): 83 | s = six.text_type(s, encoding, errors) 84 | else: 85 | s = six.text_type(s) 86 | elif hasattr(s, "__unicode__"): 87 | s = six.text_type(s) 88 | else: 89 | s = six.text_type(bytes(s), encoding, errors) 90 | else: 91 | # Note: We use .decode() here, instead of six.text_type(s, encoding, 92 | # errors), so that if s is a SafeBytes, it ends up being a 93 | # SafeText at the end. 94 | s = s.decode(encoding, errors) 95 | except UnicodeDecodeError as e: 96 | if not isinstance(s, Exception): 97 | raise DoctorUnicodeDecodeError(s, *e.args) 98 | else: 99 | # If we get to here, the caller has passed in an Exception 100 | # subclass populated with non-ASCII bytestring data without a 101 | # working unicode method. Try to handle this without raising a 102 | # further exception by individually forcing the exception args 103 | # to unicode. 104 | s = " ".join( 105 | force_text(arg, encoding, strings_only, errors) for arg in s 106 | ) 107 | return s 108 | 109 | 110 | def smart_text(s, encoding="utf-8", strings_only=False, errors="strict"): 111 | """ 112 | Returns a text object representing 's' -- unicode on Python 2 and str on 113 | Python 3. Treats bytestrings using the 'encoding' codec. 
114 | 115 | If strings_only is True, don't convert (some) non-string-like objects. 116 | """ 117 | if isinstance(s, Promise): 118 | # The input is the result of a gettext_lazy() call. 119 | return s 120 | return force_text(s, encoding, strings_only, errors) 121 | 122 | 123 | class Promise: 124 | """ 125 | This is just a base class for the proxy class created in 126 | the closure of the lazy function. It can be used to recognize 127 | promises in code. 128 | """ 129 | 130 | pass 131 | 132 | 133 | _PROTECTED_TYPES = six.integer_types + ( 134 | type(None), 135 | float, 136 | Decimal, 137 | datetime.datetime, 138 | datetime.date, 139 | datetime.time, 140 | ) 141 | 142 | 143 | def is_protected_type(obj): 144 | """Determine if the object instance is of a protected type. 145 | 146 | Objects of protected types are preserved as-is when passed to 147 | force_text(strings_only=True). 148 | """ 149 | return isinstance(obj, _PROTECTED_TYPES) 150 | 151 | 152 | def audio_encoder(data): 153 | return namedtuple("AudioFile", data.keys())(*data.values()) 154 | 155 | 156 | def ignore_warnings(test_func): 157 | def do_test(self, *args, **kwargs): 158 | with warnings.catch_warnings(): 159 | warnings.simplefilter("ignore", ResourceWarning) 160 | warnings.simplefilter("ignore", DeprecationWarning) 161 | test_func(self, *args, **kwargs) 162 | 163 | return do_test 164 | 165 | 166 | def make_png_thumbnail_for_instance(filepath, max_dimension): 167 | """Abstract function for making a thumbnail for a PDF 168 | 169 | See helper functions below for how to use this in a simple way. 
170 | 171 | :param filepath: The attr where the PDF is located on the item 172 | :param max_dimension: The longest you want any edge to be 173 | :param response: Flask response object 174 | """ 175 | command = [ 176 | "pdftoppm", 177 | "-singlefile", 178 | "-f", 179 | "1", 180 | "-scale-to", 181 | str(max_dimension), 182 | filepath, 183 | "-png", 184 | ] 185 | p = subprocess.Popen( 186 | command, close_fds=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE 187 | ) 188 | stdout, stderr = p.communicate() 189 | return stdout, stderr.decode("utf-8"), str(p.returncode) 190 | 191 | 192 | def make_png_thumbnails(filepath, max_dimension, pages, directory): 193 | """Abstract function for making a thumbnail for a PDF 194 | 195 | See helper functions below for how to use this in a simple way. 196 | 197 | :param filepath: The attr where the PDF is located on the item 198 | :param max_dimension: The longest you want any edge to be 199 | :param response: Flask response object 200 | """ 201 | for page in pages: 202 | command = [ 203 | "pdftoppm", 204 | "-singlefile", 205 | "-f", 206 | str(page), 207 | "-scale-to", 208 | str(max_dimension), 209 | filepath, 210 | "-png", 211 | f"{directory.name}/thumb-{page}", 212 | ] 213 | p = subprocess.Popen( 214 | command, 215 | close_fds=True, 216 | stdout=subprocess.PIPE, 217 | stderr=subprocess.PIPE, 218 | ) 219 | p.communicate() 220 | 221 | 222 | def pdf_bytes_from_image_array(image_list, output_path) -> None: 223 | """Make a pdf given an array of Image files 224 | 225 | :param image_list: List of images 226 | :type image_list: list 227 | :return: pdf_data 228 | :type pdf_data: PDF as bytes 229 | """ 230 | image_list[0].save( 231 | output_path, 232 | "PDF", 233 | resolution=100.0, 234 | save_all=True, 235 | append_images=image_list[1:], 236 | ) 237 | del image_list 238 | 239 | 240 | def strip_metadata_from_path(file_path): 241 | """Convert PDF file into PDF and remove metadata from it 242 | 243 | Stripping the metadata allows us to hash 
def strip_metadata_from_bytes(pdf_bytes):
    """Convert PDF bytes into PDF and remove metadata from it.

    Stripping the metadata allows us to hash otherwise-identical PDFs.

    :param pdf_bytes: PDF as binary content
    :return: PDF bytes with /CreationDate and /ModDate blanked.
    """
    pdf_merger = PdfMerger()
    pdf_merger.append(io.BytesIO(pdf_bytes))
    pdf_merger.add_metadata({"/CreationDate": "", "/ModDate": ""})
    byte_writer = io.BytesIO()
    pdf_merger.write(byte_writer)
    return force_bytes(byte_writer.getvalue())


def cleanup_form(form):
    """Remove the uploaded temp file recorded in a form's cleaned data.

    :param form: A validated form whose cleaned_data["fp"] is a file path.
    :return: None
    """
    os.remove(form.cleaned_data["fp"])


def make_file(filename, dir=None):
    """Load a test asset and return it as an upload-style mapping.

    Bug fix: the path previously ignored ``filename`` entirely, so every
    call read the same hard-coded file; the name is now interpolated.

    :param filename: Name of the file inside doctor/test_assets.
    :param dir: Unused; kept for backward compatibility.
    :return: Dict of {"file": (original filename, file bytes)}.
    """
    filepath = f"{Path.cwd()}/doctor/test_assets/{filename}"
    with open(filepath, "rb") as f:
        return {"file": (filename, f.read())}


def make_buffer(filename, dir=None):
    """Load a test asset with a generic "filename" upload name.

    Bug fix: the path previously ignored ``filename``; it is now
    interpolated. The literal "filename" string in the returned tuple is
    intentional — it simulates an upload with no meaningful name.

    :param filename: Name of the file inside doctor/test_assets.
    :param dir: Unused; kept for backward compatibility.
    :return: Dict of {"file": ("filename", file bytes)}.
    """
    filepath = f"{Path.cwd()}/doctor/test_assets/{filename}"
    with open(filepath, "rb") as f:
        return {"file": ("filename", f.read())}


def pdf_has_images(path: str) -> bool:
    """Check raw PDF for embedded images.

    We need to check if a PDF contains any images. If a PDF contains
    images it likely has content that needs to be scanned.

    :param path: Location of PDF to process.
    :return: Does the PDF contain images?
    """
    with open(path, "rb") as pdf_file:
        pdf_bytes = pdf_file.read()
    # A raw byte scan for the /Image marker; cheaper than parsing the PDF.
    return bool(re.search(rb"/Image ?", pdf_bytes))
298 | :type: bool 299 | """ 300 | with open(path, "rb") as pdf_file: 301 | pdf_bytes = pdf_file.read() 302 | return bool(re.search(rb"/Image ?", pdf_bytes)) 303 | 304 | 305 | def ocr_needed(path: str, content: str) -> bool: 306 | """Check if OCR is needed on a PDF 307 | 308 | Check if images are in PDF or content is empty. 309 | 310 | :param path: The path to the PDF 311 | :param content: The content extracted from the PDF. 312 | :return: Whether OCR should be run on the document. 313 | """ 314 | return content.strip() == "" or pdf_has_images(path) 315 | 316 | 317 | def make_page_with_text(page, data, h, w): 318 | """Make a page with text 319 | 320 | :param page: 321 | :param data: 322 | :param h: 323 | :param w: 324 | :return: 325 | """ 326 | packet = io.BytesIO() 327 | can = canvas.Canvas(packet, pagesize=(w, h)) 328 | # Set to a standard size and font for now. 329 | can.setFont("Helvetica", 9) 330 | # Make the text transparent 331 | can.setFillAlpha(0) 332 | for i in range(len(data["level"])): 333 | try: 334 | letter, (x, y, _, hh), pg = ( 335 | data["text"][i], 336 | ( 337 | data["left"][i], 338 | data["top"][i], 339 | data["width"][i], 340 | data["height"][i], 341 | ), 342 | data["page_num"][i], 343 | ) 344 | except Exception: 345 | continue 346 | # Adjust the text to an 8.5 by 11 inch page 347 | sub = ((11 * 72) / h) * int(hh) 348 | x = ((8.5 * 72) / w) * int(x) 349 | y = ((11 * 72) / h) * int(y) 350 | yy = (11 * 72) - y 351 | if int(page) == int(pg): 352 | can.drawString(x, yy - sub, letter) 353 | can.showPage() 354 | can.save() 355 | packet.seek(0) 356 | return packet 357 | 358 | 359 | def log_sentry_event( 360 | logger: logging.Logger, 361 | level: int, 362 | message: str, 363 | extra: dict[str, Any] | None = None, 364 | **kwargs: Any, 365 | ) -> None: 366 | """ 367 | Logs a message using a specified logger, level, message, and optional extra data. 368 | 369 | :param logger: The logger instance to use (e.g., logging.getLogger(__name__)). 
"""
Django settings for doctor project.

Generated by 'django-admin startproject' using Django 4.0.3.

For more information on this file, see
https://docs.djangoproject.com/en/4.0/topics/settings/

For the full list of settings and their values, see
https://docs.djangoproject.com/en/4.0/ref/settings/
"""

from pathlib import Path

import environ
import sentry_sdk
from sentry_sdk.integrations.django import DjangoIntegration

# FileAwareEnv also resolves FOO_FILE-style variables to file contents
# (useful for Docker secrets).
env = environ.FileAwareEnv()

BASE_DIR = Path(__file__).resolve().parent.parent
DEBUG = env.bool("DEBUG", default=False)
# NOTE(review): hard-coded key — presumably acceptable because this service
# defines no INSTALLED_APPS/sessions/auth, but confirm nothing signed with
# it is trusted elsewhere.
SECRET_KEY = "this-is-a-not-so-secret-key"
ALLOWED_HOSTS = ["doctor", "0.0.0.0", "localhost"]
INSTALLED_APPS = []
ROOT_URLCONF = "doctor.urls"
WSGI_APPLICATION = "doctor.wsgi.application"


# Sentry error reporting is enabled only when a DSN is provided via env.
SENTRY_DSN = env("SENTRY_DSN", default="")
if SENTRY_DSN:
    sentry_sdk.init(
        dsn=SENTRY_DSN,
        integrations=[
            DjangoIntegration(),
        ],
        # Ctrl-C is operator action, not an error worth reporting.
        ignore_errors=[KeyboardInterrupt],
    )
def strip_metadata_from_bytes(pdf_bytes):
    """Convert PDF bytes into PDF and remove metadata from it.

    Stripping the metadata allows us to hash otherwise-identical PDFs.

    NOTE(review): this duplicates doctor.lib.utils.strip_metadata_from_bytes;
    consider importing from there instead of maintaining both copies.

    :param pdf_bytes: PDF as binary content
    :return: PDF bytes with /CreationDate and /ModDate blanked.
    """
    pdf_merger = PdfMerger()
    pdf_merger.append(io.BytesIO(pdf_bytes))
    pdf_merger.add_metadata({"/CreationDate": "", "/ModDate": ""})
    byte_writer = io.BytesIO()
    pdf_merger.write(byte_writer)
    return force_bytes(byte_writer.getvalue())


def pdf_bytes_from_images(image_list: "list[Image]"):
    """Make a PDF from a list of PIL images.

    The annotation is quoted so it is not eagerly evaluated at import time
    (it references PIL's Image type).

    :param image_list: List of PIL images; the first anchors the save and
        the rest are appended as extra pages.
    :return: The generated PDF as bytes.
    """
    with io.BytesIO() as output:
        image_list[0].save(
            output,
            "PDF",
            resolution=100.0,
            save_all=True,
            append_images=image_list[1:],
        )
        pdf_data = output.getvalue()

    return pdf_data


def make_pdftotext_process(path):
    """Extract text from a PDF via the ``pdftotext`` CLI.

    :param path: File location
    :return: Tuple of (decoded UTF-8 text, stderr — always None since
        stderr is sent to DEVNULL, return code).
    """
    process = subprocess.Popen(
        ["pdftotext", "-layout", "-enc", "UTF-8", path, "-"],
        shell=False,
        stdout=subprocess.PIPE,
        stderr=subprocess.DEVNULL,
    )
    content, err = process.communicate()
    return content.decode(), err, process.returncode
def rasterize_pdf(path, destination):
    """Convert the PDF into a multipage Tiff file.

    Uses ghostscript for processing; borrows heavily from OCRmyPDF's
    ghostscript wrapper:
    https://github.com/jbarlow83/OCRmyPDF/blob/636d1903b35fed6b07a01af53769fea81f388b82/ocrmypdf/ghostscript.py#L11

    :param path: Location of the source PDF.
    :param destination: Where the TIFF output should be written.
    :return: Tuple of (stdout, stderr, return code) from ghostscript.
    """
    # gs docs, see: http://ghostscript.com/doc/7.07/Use.htm
    # gs devices, see: http://ghostscript.com/doc/current/Devices.htm
    #
    # LZW compression is a trade off: conversion takes twice as long, but
    # output is about 1-2% of the uncompressed size and Tesseract uses
    # about 30% of the RAM when processing it. See:
    # https://github.com/tesseract-ocr/tesseract/issues/431#issuecomment-250549208
    gs_command = [
        "gs",
        "-dQUIET",  # Suppress printing routine info
        "-dSAFER",  # Lock down the filesystem to only files on command line
        "-dBATCH",  # Exit after finishing file; don't wait for more commands
        "-dNOPAUSE",  # Don't pause after each page
        "-sDEVICE=tiffgray",
        "-sCompression=lzw",
        "-r300x300",  # 300 DPI
        "-o",
        destination,
        path,
    ]

    proc = subprocess.Popen(
        gs_command,
        close_fds=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True,
    )
    out, err = proc.communicate()
    return out, err, proc.returncode
def get_xray(path):
    """Get bad redactions from a PDF.

    Dead-code fix: the original listed specific exception types and then
    repeated the identical handler for ``Exception``, making the narrow
    clause (and the trailing "not reached" comment) redundant; both are
    collapsed into one handler with the same observable behavior.

    :param path: A path to the file
    :return: Dictionary of bounding boxes, or {"error": True, ...} when
        inspection fails for any reason.
    """
    try:
        return xray.inspect(path)
    except Exception:
        return {"error": True, "msg": "Exception"}


def get_page_count(path, extension):
    """Get the number of pages, if appropriate mimetype.

    :param path: A path to a binary (pdf, wpd, doc, txt, html, etc.)
    :param extension: The extension of the binary.
    :return: The number of pages if possible; 0 for unreadable PDFs; else
        None.
    """
    if extension == "pdf":
        try:
            reader = PdfReader(path)
            return len(reader.pages)
        except (
            OSError,
            ValueError,
            TypeError,
            KeyError,
            AssertionError,
            PdfReadError,
        ):
            # IOError: File doesn't exist. My bad.
            # ValueError: Didn't get an int for the page count. Their bad.
            # TypeError: NumberObject has no attribute '__getitem__'. Ugh.
            # KeyError, AssertionError: assert xrefstream["/Type"] == "/XRef". WTF?
            # PdfReadError: Something else. I have no words.
            return 0

    elif extension == "wpd":
        # Best solution appears to be to dig into the binary format
        pass
    elif extension == "doc":
        # Best solution appears to be to dig into the XML of the file
        # itself: http://stackoverflow.com/a/12972502/64911
        pass
    return None
def extract_from_pdf(
    path: str,
    original_filename: str,
    ocr_available: bool = False,
) -> Any:
    """Extract text from PDFs.

    Start with pdftotext. If OCR is enabled and the content is empty or
    the PDF contains images, use tesseract. This pattern occurs because
    PDFs can be image-based, text-based, or a mix of the two; checking
    for images makes sure we OCR mixed-type PDFs.

    For text-based PDFs we fix corrupt (mojibake) extractions from ca9.

    :param path: The path to the PDF
    :param original_filename: The original file name of the PDF file
        (unused in the body; kept for interface compatibility).
    :param ocr_available: Whether we should do OCR stuff
    :return: Tuple of (content, error or None, return code, whether OCR
        produced the content).
    """
    content, err, returncode = make_pdftotext_process(path)
    extracted_by_ocr = False
    if err is not None:
        err = err.decode()

    if not ocr_available:
        if "e" not in content:
            # No "e" anywhere in the text is the signal used here for
            # ca9's corrupt encoding. Fix it.
            content = fix_mojibake(content)
    else:
        if ocr_needed(path, content):
            success, ocr_content = extract_by_ocr(path)
            if success:
                # Check content length and take the longer of the two.
                if len(ocr_content) > len(content):
                    content = ocr_content
                extracted_by_ocr = True
            elif content == "" or not success:
                content = "Unable to extract document content."

    return content, err, returncode, extracted_by_ocr
def extract_by_ocr(path: str) -> tuple[bool, str]:
    """Extract the contents of a PDF using OCR.

    Annotation fix: ``-> (bool, str)`` was not a valid type expression
    (just a tuple literal); it is now ``tuple[bool, str]``, matching the
    syntax already used elsewhere in this file.

    :param path: Location of the PDF to OCR.
    :return: Tuple of (success flag, extracted text or failure message).
    """
    fail_msg = (
        "Unable to extract the content from this file. Please try "
        "reading the original."
    )
    with NamedTemporaryFile(prefix="ocr_", suffix=".tiff", buffering=0) as tmp:
        out, err, returncode = rasterize_pdf(path, tmp.name)
        if returncode != 0:
            return False, fail_msg

        txt = convert_file_to_txt(tmp.name)
        txt = cleanup_ocr_text(txt)

    return True, txt


def cleanup_ocr_text(txt: str) -> str:
    """Do some basic cleanup to make OCR text better.

    Err on the side of safety. Don't make fixes that could cause other issues.

    :param txt: The txt output from the OCR engine.
    :return: Txt output, cleaned up.
    """
    simple_replacements = (
        ("Fi|ed", "Filed"),
        (" Il ", " II "),
    )
    for bad, good in simple_replacements:
        txt = txt.replace(bad, good)
    return txt


def convert_file_to_txt(path: str) -> str:
    """Run tesseract over an image file and return the recognized text.

    :param path: Location of the image (e.g. a rasterized TIFF).
    :return: Decoded text from tesseract's stdout.
    """
    tesseract_command = [
        "tesseract",
        path,
        "stdout",
        "-l",
        "eng",
        "-c",
        "tessedit_do_invert=0",  # Assume a white background for speed
    ]
    p = subprocess.Popen(
        tesseract_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
    )
    return p.communicate()[0].decode()


def convert_tiff_to_pdf_bytes(single_tiff_image: "Image") -> "ByteString":
    """Split a long TIFF into page-sized images and bundle them as a PDF.

    Annotations are quoted so PIL's Image type is not evaluated eagerly.

    :param single_tiff_image: One long tiff file
    :return: PDF Bytes
    """
    width, height = single_tiff_image.size
    image_list = []
    # 1046/792 ≈ the page aspect ratio used here — presumably US-letter
    # plus padding (11/8.5 would be 1056/792); confirm before changing.
    i, page_width, page_height = 0, width, (1046 * (float(width) / 792))
    while i < (height / page_height):
        single_page = single_tiff_image.crop(
            (0, (i * page_height), page_width, (i + 1) * page_height)
        )
        image_list.append(single_page)
        i += 1

    return pdf_bytes_from_images(image_list)
def extract_from_doc(path):
    """Extract text from MS Doc files.

    Uses the antiword CLI to pull the text out.

    :param path: Location of the .doc file.
    :return: Tuple of (decoded text, stderr, return code).
    """
    proc = subprocess.Popen(
        ["antiword", path, "-i", "1"],
        shell=False,
        stdout=subprocess.PIPE,
        stderr=subprocess.DEVNULL,
    )
    out, err = proc.communicate()
    return out.decode("utf-8"), err, proc.returncode


def extract_from_docx(path):
    """Extract text from docx files.

    Uses the docx2txt CLI to pull out the text. Pretty simple.

    :param path: Location of the .docx file.
    :return: Tuple of (decoded text, stderr, return code).
    """
    proc = subprocess.Popen(
        ["docx2txt", path, "-"],
        shell=False,
        stdout=subprocess.PIPE,
        stderr=subprocess.DEVNULL,
    )
    out, err = proc.communicate()
    return out.decode("utf-8"), err, proc.returncode


def extract_from_html(path: str) -> tuple[str, str, int]:
    """Extract from an HTML file by attempting various encodings.

    A simple wrapper to go get content, and send it along.

    :param path: The file path to the HTML file.
    :return: A tuple containing:
        - The extracted and cleaned text content (str), or "" on failure.
        - An error message (str), or "" on success.
        - A return code (int): 0 on success, 1 on failure.
    """
    for encoding in ("utf-8", "ISO8859", "cp1252", "latin-1"):
        try:
            with open(path, encoding=encoding) as f:
                content = f.read()
            content = get_clean_body_content(content)
            content = force_text(content, encoding=encoding)
        except (UnicodeDecodeError, DoctorUnicodeDecodeError):
            continue
        return content, "", 0
    # Every encoding failed; report the failure.
    return "", "Could not encode content properly", 1
def get_clean_body_content(content: str) -> str:
    """Parse out the body from an html string, clean it up, and send it along.

    :param content: The HTML content as a string
    :return: The cleaned HTML body content as a string
    """
    return Cleaner(
        style=True, remove_tags=["a", "body", "font", "noscript", "img"]
    ).clean_html(content)


def extract_from_txt(filepath):
    """Extract text from plain text files: a fool's errand.

    Plain text files lack encoding information, so we have to guess. Most
    txt files we encounter were produced by converting wpd or doc files
    to txt on a Microsoft box, so cp1252 is the first guess; utf-8
    (ignoring errors) is the fallback, and failing both we let libmagic
    sniff the encoding from the raw bytes.

    May we hope for a better world.

    :param filepath: Location of the text file.
    :return: Tuple of (content, error message or None, 0 or 1).
    """
    err, error_code = None, 0
    try:
        with open(filepath) as f:
            raw = f.read()
        try:
            # Alas, cp1252 is probably still more popular than utf-8.
            content = smart_text(raw, encoding="cp1252")
        except DoctorUnicodeDecodeError:
            content = smart_text(raw, encoding="utf-8", errors="ignore")
    except Exception:
        try:
            with open(filepath, "rb") as f:
                blob = f.read()
            sniffed = magic.Magic(mime_encoding=True).from_buffer(blob)
            with open(filepath, encoding=sniffed) as f:
                raw = f.read()
            content = smart_text(raw, encoding=sniffed, errors="ignore")
        except Exception:
            err = "An error occurred extracting txt file."
            content = ""
            error_code = 1
    return content, err, error_code
def extract_from_wpd(path: str) -> tuple[str, bytes, int]:
    """Extract text from a Word Perfect file.

    Yes, courts still use these, so we extract their text using wpd2html.
    Once that's done, we pull out the body of the HTML, and do some minor
    cleanup on it.

    :param path: The file path to the Word Perfect (.wpd) file.
    :return: A tuple containing:
        - The extracted and cleaned text content (str)
        - The standard error output from the wpd2html subprocess (bytes)
        - The return code of the wpd2html subprocess (int)
    """
    process = subprocess.Popen(
        ["wpd2html", path],
        shell=False,
        stdout=subprocess.PIPE,
        stderr=subprocess.DEVNULL,
    )
    content_bytes, err = process.communicate()
    content_str = content_bytes.decode("utf-8")
    content = get_clean_body_content(content_str)

    return content, err, process.returncode


def download_images(sorted_urls) -> list:
    """Download images concurrently and return their raw bytes.

    Fixes: the original used the deprecated get_event_loop()/
    run_until_complete pattern (and fetched the loop twice); asyncio.run
    creates and tears down the loop correctly. Its docstring also claimed
    a list of PIL images is returned — it is actually the raw response
    bodies (bytes); callers convert as needed.

    :param sorted_urls: List of sorted URLs for split financial disclosure
    :return: List of downloaded bytes, in input order.
    """

    async def fetch_all(urls):
        loop = asyncio.get_running_loop()
        # requests is blocking, so each GET runs in the default executor.
        futures = [
            loop.run_in_executor(None, requests.get, url) for url in urls
        ]
        responses = await asyncio.gather(*futures)
        return [response.content for response in responses]

    return asyncio.run(fetch_all(sorted_urls))
def convert_to_mp3(output_path: AnyStr, media: Any) -> AnyStr:
    """Convert audio piped via stdin to mp3 at ``output_path`` with ffmpeg.

    Fixes: the return annotation said None although the path is returned,
    and the two parameter descriptions were swapped.

    :param output_path: Temporary filepath for ffmpeg's mp3 output.
    :param media: File-like object whose bytes are fed to ffmpeg's stdin.
    :return: ``output_path``, once the conversion has finished.
    """
    av_command = [
        "ffmpeg",
        "-i",
        "/dev/stdin",
        "-ar",
        "22050",
        "-ab",
        "48k",
        "-f",
        "mp3",
        output_path,
    ]

    ffmpeg_cmd = subprocess.Popen(
        av_command, stdin=subprocess.PIPE, stdout=subprocess.PIPE, shell=False
    )
    ffmpeg_cmd.communicate(media.read())
    return output_path


def convert_to_ogg(output_path: AnyStr, media: Any) -> AnyStr:
    """Convert audio data to the ogg format (.ogg).

    Uses ffmpeg to convert the audio data provided in ``media`` to ogg
    with the following specifications:

    * Single audio channel (``-ac 1``)
    * 8 kbps audio bitrate (``-b:a 8k``)
    * Optimized for voice over IP applications (``-application voip``)

    (Fixes: ``-b:a`` is a bitrate, not a "sampling rate" as previously
    documented; return annotation corrected as in convert_to_mp3.)

    :param output_path: Temporary filepath for ffmpeg's ogg output.
    :param media: File-like object whose bytes are fed to ffmpeg's stdin.
    :return: ``output_path``, once the conversion has finished.
    """
    av_command = [
        "ffmpeg",
        "-i",
        "/dev/stdin",
        "-vn",
        "-map_metadata",
        "-1",
        "-ac",
        "1",
        "-c:a",
        "libopus",
        "-b:a",
        "8k",
        "-application",
        "voip",
        "-f",
        "ogg",
        output_path,
    ]

    ffmpeg_cmd = subprocess.Popen(
        av_command, stdin=subprocess.PIPE, stdout=subprocess.PIPE, shell=False
    )
    ffmpeg_cmd.communicate(media.read())
    return output_path
def set_mp3_meta_data(
    audio_data: dict, mp3_path: AnyStr
) -> eyed3.core.AudioFile:
    """Set the metadata in audio_data to an mp3 at path.

    Reads court/case fields from ``audio_data`` (title, album, artist,
    URLs, date argued, docket number) and embeds them, plus cover images,
    into the mp3's ID3 tag.

    :param audio_data: The new metadata to embed in the mp3.
    :param mp3_path: The path to the mp3 to be converted.
    :return: Eyed3 audio file object
    """

    # Load the file, delete the old tags and create a new one.
    audio_file = eyed3.load(mp3_path)
    # Undocumented API from eyed3.plugins.classic.ClassicPlugin#handleRemoves
    id3.Tag.remove(
        audio_file.tag.file_info.name,
        id3.ID3_ANY_VERSION,
        preserve_file_time=False,
    )
    audio_file.initTag()
    audio_file.tag.title = best_case_name(audio_data)
    date_argued = audio_data["date_argued"]
    docket_number = audio_data["docket_number"]
    audio_file.tag.album = (
        f"{audio_data['court_full_name']}, {audio_data['date_argued_year']}"
    )
    audio_file.tag.artist = audio_data["court_full_name"]
    audio_file.tag.artist_url = audio_data["court_url"]
    audio_file.tag.audio_source_url = audio_data["download_url"]

    audio_file.tag.comments.set(
        f"Argued: {date_argued}. Docket number: {docket_number}"
    )
    audio_file.tag.genre = "Speech"
    audio_file.tag.publisher = "Free Law Project"
    audio_file.tag.publisher_url = "https://free.law"
    audio_file.tag.recording_date = date_argued

    # Add images to the mp3. If it has a seal, use that for the Front Cover
    # and use the FLP logo for the Publisher Logo. If it lacks a seal, use the
    # Publisher logo for both the front cover and the Publisher logo.
    url = seal(court=audio_data["court_pk"], size=ImageSizes.MEDIUM)

    flp_image_frames = [
        3,  # "Front Cover". Complete list at eyed3/id3/frames.py
        14,  # "Publisher logo".
    ]

    if url:
        # A seal exists: fetch it and attach it as the Front Cover (3)...
        seal_content = requests.get(url, timeout=30).content
        audio_file.tag.images.set(
            3,
            seal_content,
            "image/png",
            f"Seal for {audio_data['court_short_name']}",
        )
        # ...then drop 3 so the loop below only fills the remaining frame.
        flp_image_frames.remove(3)

    for frame in flp_image_frames:
        cover_art_fp = os.path.join(assets_dir, "producer-300x300.png")
        with open(cover_art_fp, "rb") as cover_art:
            audio_file.tag.images.set(
                frame,
                cover_art.read(),
                "image/png",
                "Created for the public domain by Free Law Project",
            )

    audio_file.tag.save()
    return audio_file
def convert_to_base64(tmp_path: AnyStr) -> AnyStr:
    """Read a file and return its contents base64-encoded as a string.

    This allows us to safely return the file in json to CL.

    :param tmp_path: Path of the file to encode.
    :return: File contents encoded in base64, as a string.
    """
    with open(tmp_path, "rb") as fp:
        encoded = base64.b64encode(fp.read())
    return encoded.decode()


def best_case_name(audio_dict: dict) -> AnyStr:
    """Take an object and return the highest quality case name possible.

    In general, this means returning the fields in an order like:

    - case_name
    - case_name_full
    - case_name_short

    :param audio_dict: Mapping that may carry any of the three name keys.
    :return: The first truthy name in preference order, else "".
    """
    for key in ("case_name", "case_name_full"):
        if audio_dict.get(key):
            return audio_dict[key]
    return audio_dict.get("case_name_short", "")
def get_header_stamp(obj: dict) -> bool:
    """pdfplumber filter to extract the PDF header stamp.

    (Docstring typos fixed: "juridictions", "True if the found it".)

    :param obj: The page object to evaluate.
    :return: True if the object belongs to the header stamp, otherwise False.
    """
    # This option works for most jurisdictions except for ca5.
    if "LiberationSans" in obj.get("fontname", ""):
        return True
    # Exception for ca5: keep anything near the top of the page.
    return obj["y0"] > 750


def clean_document_number(document_number: str) -> str:
    """Remove '#' plus leading and trailing whitespace from a document number.

    :param document_number: The document number to clean
    :return: The cleaned document number.
    """
    return document_number.strip().replace("#", "")


def get_document_number_from_pdf(path: str) -> str:
    """Get PACER document number from PDF.

    :param path: The path to the PDF
    :return: The PACER document number, or "" when no match is found.
    """
    with pdfplumber.open(path) as f:
        header_stamp = f.pages[0].filter(get_header_stamp).extract_text()

    # Regex alternatives to extract the document number from the stamp.
    regex = r"Document:(.[0-9.\-.\#]+)|Document(.[0-9.\-.\#]+)|Doc:(.[0-9.\-.\#]+)|DktEntry:(.[0-9.\-.\#]+)"
    document_number_matches = re.findall(regex, header_stamp)

    # If no matches, return an empty string.
    if not document_number_matches:
        return ""
    # findall yields one tuple per match with "" for the alternatives
    # that did not fire; keep the populated group.
    document_number = [dn for dn in document_number_matches[0] if dn]
    return clean_document_number(document_number[0])
def extract_recap_pdf(
    filepath: str,
    strip_margin: bool = False,
) -> tuple[str, bool]:
    """Extract text from a RECAP PDF.

    Pages that already carry a text layer are read directly; pages that
    need OCR are run through the OCR helper instead.

    :param filepath: The path to the PDF
    :param strip_margin: Whether to remove 1 inch margin from text extraction
    :return: A tuple containing the text and a boolean indicating ocr usage
    """
    pieces = []
    used_ocr = False
    with pdfplumber.open(filepath) as pdf:
        for page in pdf.pages:
            text = get_page_text(page, strip_margin=strip_margin)
            if page_needs_ocr(page, text):
                used_ocr = True
                text = extract_with_ocr(page, strip_margin=strip_margin)
            pieces.append(f"\n{text}")
    content = remove_excess_whitespace("".join(pieces))
    return content, used_ocr
-------------------------------------------------------------------------------- /doctor/test_assets/ander_v._leo.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freelawproject/doctor/94c2ec669d033b0f54b6477b0e07882189e6e474/doctor/test_assets/ander_v._leo.mp3 -------------------------------------------------------------------------------- /doctor/test_assets/broken-mime.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freelawproject/doctor/94c2ec669d033b0f54b6477b0e07882189e6e474/doctor/test_assets/broken-mime.pdf -------------------------------------------------------------------------------- /doctor/test_assets/empty.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freelawproject/doctor/94c2ec669d033b0f54b6477b0e07882189e6e474/doctor/test_assets/empty.pdf -------------------------------------------------------------------------------- /doctor/test_assets/image-pdf-2-thumbnail.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freelawproject/doctor/94c2ec669d033b0f54b6477b0e07882189e6e474/doctor/test_assets/image-pdf-2-thumbnail.png -------------------------------------------------------------------------------- /doctor/test_assets/image-pdf-2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freelawproject/doctor/94c2ec669d033b0f54b6477b0e07882189e6e474/doctor/test_assets/image-pdf-2.pdf -------------------------------------------------------------------------------- /doctor/test_assets/image-pdf-thumbnail.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/freelawproject/doctor/94c2ec669d033b0f54b6477b0e07882189e6e474/doctor/test_assets/image-pdf-thumbnail.png -------------------------------------------------------------------------------- /doctor/test_assets/image-pdf.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freelawproject/doctor/94c2ec669d033b0f54b6477b0e07882189e6e474/doctor/test_assets/image-pdf.pdf -------------------------------------------------------------------------------- /doctor/test_assets/long-image.tiff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freelawproject/doctor/94c2ec669d033b0f54b6477b0e07882189e6e474/doctor/test_assets/long-image.tiff -------------------------------------------------------------------------------- /doctor/test_assets/missouri.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freelawproject/doctor/94c2ec669d033b0f54b6477b0e07882189e6e474/doctor/test_assets/missouri.pdf -------------------------------------------------------------------------------- /doctor/test_assets/ocr_pdf_variation.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freelawproject/doctor/94c2ec669d033b0f54b6477b0e07882189e6e474/doctor/test_assets/ocr_pdf_variation.pdf -------------------------------------------------------------------------------- /doctor/test_assets/recap_documents/ca10_010110462922.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freelawproject/doctor/94c2ec669d033b0f54b6477b0e07882189e6e474/doctor/test_assets/recap_documents/ca10_010110462922.pdf -------------------------------------------------------------------------------- /doctor/test_assets/recap_documents/ca1_00117684624.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/freelawproject/doctor/94c2ec669d033b0f54b6477b0e07882189e6e474/doctor/test_assets/recap_documents/ca1_00117684624.pdf -------------------------------------------------------------------------------- /doctor/test_assets/recap_documents/ca2_1-1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freelawproject/doctor/94c2ec669d033b0f54b6477b0e07882189e6e474/doctor/test_assets/recap_documents/ca2_1-1.pdf -------------------------------------------------------------------------------- /doctor/test_assets/recap_documents/ca3_003112692106.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freelawproject/doctor/94c2ec669d033b0f54b6477b0e07882189e6e474/doctor/test_assets/recap_documents/ca3_003112692106.pdf -------------------------------------------------------------------------------- /doctor/test_assets/recap_documents/ca4_17.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freelawproject/doctor/94c2ec669d033b0f54b6477b0e07882189e6e474/doctor/test_assets/recap_documents/ca4_17.pdf -------------------------------------------------------------------------------- /doctor/test_assets/recap_documents/ca5_00516242060.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freelawproject/doctor/94c2ec669d033b0f54b6477b0e07882189e6e474/doctor/test_assets/recap_documents/ca5_00516242060.pdf -------------------------------------------------------------------------------- /doctor/test_assets/recap_documents/ca6_1-3.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/freelawproject/doctor/94c2ec669d033b0f54b6477b0e07882189e6e474/doctor/test_assets/recap_documents/ca6_1-3.pdf -------------------------------------------------------------------------------- /doctor/test_assets/recap_documents/ca7_3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freelawproject/doctor/94c2ec669d033b0f54b6477b0e07882189e6e474/doctor/test_assets/recap_documents/ca7_3.pdf -------------------------------------------------------------------------------- /doctor/test_assets/recap_documents/ca8_.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freelawproject/doctor/94c2ec669d033b0f54b6477b0e07882189e6e474/doctor/test_assets/recap_documents/ca8_.pdf -------------------------------------------------------------------------------- /doctor/test_assets/recap_documents/ca9_19.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freelawproject/doctor/94c2ec669d033b0f54b6477b0e07882189e6e474/doctor/test_assets/recap_documents/ca9_19.pdf -------------------------------------------------------------------------------- /doctor/test_assets/recap_documents/cafc_3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freelawproject/doctor/94c2ec669d033b0f54b6477b0e07882189e6e474/doctor/test_assets/recap_documents/cafc_3.pdf -------------------------------------------------------------------------------- /doctor/test_assets/recap_extract/gov.uscourts.azd.1085839.3.0.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freelawproject/doctor/94c2ec669d033b0f54b6477b0e07882189e6e474/doctor/test_assets/recap_extract/gov.uscourts.azd.1085839.3.0.pdf 
-------------------------------------------------------------------------------- /doctor/test_assets/recap_extract/gov.uscourts.cacd.652774.40.0.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freelawproject/doctor/94c2ec669d033b0f54b6477b0e07882189e6e474/doctor/test_assets/recap_extract/gov.uscourts.cacd.652774.40.0.pdf -------------------------------------------------------------------------------- /doctor/test_assets/recap_extract/gov.uscourts.cand.203070.27.0.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freelawproject/doctor/94c2ec669d033b0f54b6477b0e07882189e6e474/doctor/test_assets/recap_extract/gov.uscourts.cand.203070.27.0.pdf -------------------------------------------------------------------------------- /doctor/test_assets/vector-pdf.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freelawproject/doctor/94c2ec669d033b0f54b6477b0e07882189e6e474/doctor/test_assets/vector-pdf.pdf -------------------------------------------------------------------------------- /doctor/test_assets/word-doc.doc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freelawproject/doctor/94c2ec669d033b0f54b6477b0e07882189e6e474/doctor/test_assets/word-doc.doc -------------------------------------------------------------------------------- /doctor/test_assets/word-docx.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freelawproject/doctor/94c2ec669d033b0f54b6477b0e07882189e6e474/doctor/test_assets/word-docx.docx -------------------------------------------------------------------------------- /doctor/test_assets/word-perfect.wpd: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/freelawproject/doctor/94c2ec669d033b0f54b6477b0e07882189e6e474/doctor/test_assets/word-perfect.wpd -------------------------------------------------------------------------------- /doctor/test_assets/x-ray/rectangles_no.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freelawproject/doctor/94c2ec669d033b0f54b6477b0e07882189e6e474/doctor/test_assets/x-ray/rectangles_no.pdf -------------------------------------------------------------------------------- /doctor/test_assets/x-ray/rectangles_yes.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freelawproject/doctor/94c2ec669d033b0f54b6477b0e07882189e6e474/doctor/test_assets/x-ray/rectangles_yes.pdf -------------------------------------------------------------------------------- /doctor/test_assets/x-ray/rectangles_yes_2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freelawproject/doctor/94c2ec669d033b0f54b6477b0e07882189e6e474/doctor/test_assets/x-ray/rectangles_yes_2.pdf -------------------------------------------------------------------------------- /doctor/tests.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import json 3 | import os 4 | import re 5 | import unittest 6 | from pathlib import Path 7 | from tempfile import NamedTemporaryFile 8 | from unittest.mock import patch 9 | from zipfile import ZipFile 10 | 11 | import eyed3 12 | import requests 13 | 14 | from doctor.lib.text_extraction import ( 15 | adjust_caption_lines, 16 | cleanup_content, 17 | get_word, 18 | insert_whitespace, 19 | remove_excess_whitespace, 20 | ) 21 | from doctor.lib.utils import make_buffer, make_file 22 | 23 | asset_path = f"{Path.cwd()}/doctor/test_assets" 24 | 25 | 26 | class HeartbeatTests(unittest.TestCase): 27 | def 
test_heartbeat(self): 28 | """Can we curl the heartbeat endpoint?""" 29 | response = requests.get("http://doctor:5050/") 30 | self.assertEqual( 31 | response.text, "Heartbeat detected.", msg="Heartbeat failed" 32 | ) 33 | 34 | 35 | class RECAPExtractionTests(unittest.TestCase): 36 | def test_recap_extraction(self): 37 | """Can we extract from the new recap text endpoint""" 38 | files = make_file( 39 | filename="recap_extract/gov.uscourts.cand.203070.27.0.pdf" 40 | ) 41 | params = {"strip_margin": False} 42 | response = requests.post( 43 | "http://doctor:5050/extract/recap/text/", 44 | files=files, 45 | params=params, 46 | ) 47 | first_line = response.json()["content"].splitlines()[0].strip() 48 | self.assertEqual(200, response.status_code, msg="Wrong status code") 49 | self.assertTrue( 50 | response.json()["extracted_by_ocr"], msg="Not extracted correctly" 51 | ) 52 | self.assertEqual( 53 | "aséakOS- 08-0220 A25BA BAD Gooonene 2627 Filed! OL/2B/DE0IP ageahefi2of 2", 54 | first_line, 55 | msg="Wrong Text", 56 | ) 57 | 58 | def test_recap_extraction_with_strip_margin(self): 59 | """Can we extract from the new recap text endpoint with strip margin?""" 60 | files = make_file( 61 | filename="recap_extract/gov.uscourts.cand.203070.27.0.pdf" 62 | ) 63 | params = {"strip_margin": True} 64 | response = requests.post( 65 | "http://doctor:5050/extract/recap/text/", 66 | files=files, 67 | params=params, 68 | ) 69 | first_line = response.json()["content"].splitlines()[0].strip() 70 | self.assertEqual(200, response.status_code, msg="Wrong status code") 71 | self.assertEqual( 72 | "1 || DONALD W. CARLSON [Bar No. 
79258]", 73 | first_line, 74 | msg="Wrong Text", 75 | ) 76 | 77 | def test_recap_strip_marign_with_multiple_shaped_pdfs(self): 78 | """Can we extract atypical shape pdf with strip margin?""" 79 | 80 | files = make_file( 81 | filename="recap_extract/gov.uscourts.azd.1085839.3.0.pdf" 82 | ) 83 | params = {"strip_margin": True} 84 | response = requests.post( 85 | "http://doctor:5050/extract/recap/text/", 86 | files=files, 87 | params=params, 88 | ) 89 | first_line = response.json()["content"].splitlines()[0].strip() 90 | self.assertEqual(200, response.status_code, msg="Wrong status code") 91 | self.assertEqual( 92 | "1 WO", 93 | first_line, 94 | msg="Wrong Text", 95 | ) 96 | 97 | def test_strip_margin_without_ocr(self): 98 | """Can we extract from the new recap text endpoint with strip margin?""" 99 | files = make_file( 100 | filename="recap_extract/gov.uscourts.cacd.652774.40.0.pdf" 101 | ) 102 | params = {"strip_margin": True} 103 | response = requests.post( 104 | "http://doctor:5050/extract/recap/text/", 105 | files=files, 106 | params=params, 107 | ) 108 | first_line = response.json()["content"].splitlines()[0].strip() 109 | self.assertEqual(200, response.status_code, msg="Wrong status code") 110 | self.assertEqual("1", first_line, msg="Wrong Text") 111 | 112 | 113 | class ExtractionTests(unittest.TestCase): 114 | def test_pdf_to_text(self): 115 | """""" 116 | files = make_file(filename="vector-pdf.pdf") 117 | data = {"ocr_available": True} 118 | response = requests.post( 119 | "http://doctor:5050/extract/doc/text/", files=files, data=data 120 | ) 121 | text = response.json()["content"][:100].replace("\n", "").strip() 122 | self.assertEqual(200, response.status_code, msg="Wrong status code") 123 | self.assertEqual( 124 | text, 125 | "(Slip Opinion) OCTOBER TERM, 2012 1", 126 | msg=text, 127 | ) 128 | 129 | def test_content_extraction(self): 130 | """""" 131 | files = make_file(filename="vector-pdf.pdf") 132 | data = {"ocr_available": False} 133 | response = 
requests.post( 134 | "http://doctor:5050/extract/doc/text/", files=files, data=data 135 | ) 136 | self.assertTrue(response.ok, msg="Content extraction failed") 137 | self.assertEqual( 138 | response.json()["content"][:100].replace("\n", "").strip(), 139 | "(Slip Opinion) OCTOBER TERM, 2012 1", 140 | msg="Failed to extract content from .pdf file", 141 | ) 142 | self.assertFalse( 143 | response.json()["extracted_by_ocr"], 144 | msg="Failed to extract by OCR", 145 | ) 146 | self.assertEqual( 147 | response.json()["page_count"], 148 | 30, 149 | msg="Failed to extract by OCR", 150 | ) 151 | 152 | def test_pdf_ocr_extraction(self): 153 | files = make_file(filename="image-pdf.pdf") 154 | params = {"ocr_available": True} 155 | response = requests.post( 156 | "http://doctor:5050/extract/doc/text/", 157 | files=files, 158 | params=params, 159 | ) 160 | self.assertTrue(response.ok, msg="Content extraction failed") 161 | content = response.json()["content"][:100].replace("\n", "").strip() 162 | self.assertEqual( 163 | content, 164 | "(Slip Opinion) OCTOBER TERM, 2012 1SyllabusNOTE: Where it is feasible, a syllabus (headnote) wil", 165 | msg="Failed to extract content from image .pdf file", 166 | ) 167 | self.assertTrue( 168 | response.json()["extracted_by_ocr"], 169 | msg="Failed to extract by OCR", 170 | ) 171 | 172 | def test_pdf_v2_ocr_extraction(self): 173 | files = make_file(filename="ocr_pdf_variation.pdf") 174 | params = {"ocr_available": True} 175 | response = requests.post( 176 | "http://doctor:5050/extract/doc/text/", 177 | files=files, 178 | params=params, 179 | ) 180 | self.assertTrue(response.ok, msg="Content extraction failed") 181 | content = response.json()["content"][:100].replace("\n", "").strip() 182 | self.assertIn( 183 | "UNITED", 184 | content, 185 | msg="Failed to extract content from ocr_pdf_variation .pdf file", 186 | ) 187 | self.assertTrue( 188 | response.json()["extracted_by_ocr"], 189 | msg="Failed to extract by OCR", 190 | ) 191 | 192 | def 
test_docx_format(self): 193 | files = make_file(filename="word-docx.docx") 194 | params = {"ocr_available": False} 195 | response = requests.post( 196 | "http://doctor:5050/extract/doc/text/", 197 | files=files, 198 | params=params, 199 | ) 200 | self.assertTrue(response.ok, msg="Content extraction failed") 201 | self.assertEqual( 202 | response.json()["content"][:200].replace("\n", "").strip(), 203 | "ex- Cpl, Current Discharge and Applicant's RequestApplication R", 204 | msg="Failed to extract content from .docx file", 205 | ) 206 | 207 | def test_doc_format(self): 208 | files = make_file(filename="word-doc.doc") 209 | data = {"ocr_available": False} 210 | response = requests.post( 211 | "http://doctor:5050/extract/doc/text/", files=files, data=data 212 | ) 213 | self.assertTrue(response.ok, msg="Content extraction failed") 214 | content = response.json()["content"][:100].replace("\n", "").strip() 215 | self.assertEqual( 216 | content, 217 | "Attorneys for Appellant Attorneys for AppelleeSteve Carter", 218 | msg="Failed to extract content from .doc file", 219 | ) 220 | self.assertEqual( 221 | response.json()["page_count"], 222 | None, 223 | msg="Failed to extract by OCR", 224 | ) 225 | 226 | def test_wpd_format(self): 227 | files = make_file(filename="word-perfect.wpd") 228 | data = {"ocr_available": False} 229 | response = requests.post( 230 | "http://doctor:5050/extract/doc/text/", files=files, data=data 231 | ) 232 | self.assertTrue(response.ok, msg="Content extraction failed") 233 | self.assertIn( 234 | "ATTORNEY FOR APPELLANT", 235 | response.json()["content"], 236 | msg="Failed to extract content from WPD file", 237 | ) 238 | self.assertEqual( 239 | 14259, 240 | len(response.json()["content"]), 241 | msg="Failed to extract content from WPD file", 242 | ) 243 | 244 | 245 | class ThumbnailTests(unittest.TestCase): 246 | """Can we generate thumbnail images from PDF files""" 247 | 248 | def test_convert_pdf_to_thumbnail_png(self): 249 | """Can we generate four 
thumbanils a pdf?""" 250 | files = make_file(filename="image-pdf.pdf") 251 | data = {"max_dimension": 350} 252 | response = requests.post( 253 | "http://doctor:5050/convert/pdf/thumbnail/", 254 | files=files, 255 | data=data, 256 | ) 257 | with open("doctor/test_assets/image-pdf-thumbnail.png", "rb") as f: 258 | answer = f.read() 259 | self.assertEqual(answer, response.content) 260 | 261 | files = make_file(filename="image-pdf-2.pdf") 262 | response = requests.post( 263 | "http://doctor:5050/convert/pdf/thumbnail/", files=files 264 | ) 265 | with open("doctor/test_assets/image-pdf-2-thumbnail.png", "rb") as f: 266 | second_answer = f.read() 267 | self.assertEqual(second_answer, response.content) 268 | 269 | files = make_file(filename="empty.pdf") 270 | response = requests.post( 271 | "http://doctor:5050/convert/pdf/thumbnail/", files=files 272 | ) 273 | self.assertEqual(response.status_code, 400, msg="Wrong status code") 274 | 275 | def test_thumbnail_range(self): 276 | """Can we generate a thumbnail for a range of pages?""" 277 | files = make_file(filename="vector-pdf.pdf") 278 | pages = [1, 2, 3, 4] 279 | data = { 280 | "max_dimension": 350, 281 | "pages": json.dumps(pages), 282 | } 283 | 284 | response = requests.post( 285 | "http://doctor:5050/convert/pdf/thumbnails/", 286 | files=files, 287 | data=data, 288 | ) 289 | with NamedTemporaryFile(suffix=".zip") as tmp: 290 | with open(tmp.name, "wb") as f: 291 | f.write(response.content) 292 | with ZipFile(tmp.name, "r") as zipObj: 293 | listOfiles = sorted(zipObj.namelist()) 294 | self.assertEqual(len(listOfiles), 4) 295 | self.assertEqual( 296 | ["thumb-1.png", "thumb-2.png", "thumb-3.png", "thumb-4.png"], 297 | listOfiles, 298 | ) 299 | 300 | 301 | class MetadataTests(unittest.TestCase): 302 | """Can we count page numbers in PDF files""" 303 | 304 | def test_page_count_pdf(self): 305 | """""" 306 | files = make_file(filename="image-pdf.pdf") 307 | page_count = requests.post( 308 | 
"http://doctor:5050/utils/page-count/pdf/", files=files 309 | ).text 310 | self.assertEqual(int(page_count), 2, "Failed to get page count") 311 | 312 | def test_mime_type(self): 313 | """""" 314 | files = make_file(filename="image-pdf.pdf") 315 | params = {"mime": True} 316 | response = requests.post( 317 | "http://doctor:5050/utils/mime-type/", 318 | files=files, 319 | params=params, 320 | ).json() 321 | self.assertEqual( 322 | response["mimetype"], 323 | "application/pdf", 324 | msg="Failed to get mime type", 325 | ) 326 | 327 | def test_broken_mime_type(self): 328 | """""" 329 | files = make_buffer(filename="broken-mime.pdf") 330 | params = {"mime": True} 331 | response = requests.post( 332 | "http://doctor:5050/utils/file/extension/", 333 | files=files, 334 | params=params, 335 | ) 336 | self.assertEqual(response.text, ".pdf", msg="Failed to get mime type") 337 | 338 | files = make_buffer(filename="missouri.pdf") 339 | params = {"mime": True} 340 | response = requests.post( 341 | "http://doctor:5050/utils/file/extension/", 342 | files=files, 343 | params=params, 344 | ) 345 | self.assertEqual(response.text, ".pdf", msg="Failed to get mime type") 346 | 347 | def test_mime_type_unknown_name(self): 348 | """""" 349 | files = make_buffer(filename="image-pdf.pdf") 350 | response = requests.post( 351 | "http://doctor:5050/utils/mime-type/", 352 | files=files, 353 | params={"mime": True}, 354 | ).json() 355 | self.assertEqual( 356 | response["mimetype"], 357 | "application/pdf", 358 | msg="Failed to get mime type", 359 | ) 360 | 361 | def test_get_extension(self): 362 | """""" 363 | files = make_buffer(filename="image-pdf.pdf") 364 | response = requests.post( 365 | "http://doctor:5050/utils/file/extension/", files=files 366 | ) 367 | self.assertEqual(response.text, ".pdf", msg="Failed to get mime type") 368 | 369 | files = make_buffer(filename="word-docx.docx") 370 | response = requests.post( 371 | "http://doctor:5050/utils/file/extension/", files=files 372 | ) 373 | 
self.assertEqual(response.text, ".docx", msg="Failed to get mime type") 374 | files = make_buffer(filename="word-doc.doc") 375 | response = requests.post( 376 | "http://doctor:5050/utils/file/extension/", files=files 377 | ) 378 | self.assertEqual(response.text, ".doc", msg="Failed to get mime type") 379 | 380 | def test_embedding_text_to_image_pdf(self): 381 | """Can we embed text into an image PDF?""" 382 | data = {"ocr_available": False} 383 | 384 | files = make_file(filename="image-pdf.pdf") 385 | image_response = requests.post( 386 | "http://doctor:5050/extract/doc/text/", files=files, data=data 387 | ) 388 | self.assertEqual( 389 | "", 390 | image_response.json()["content"].strip("\x0c"), 391 | msg="PDF should have no text", 392 | ) 393 | 394 | # Embed text into the image pdf and check that we get some text 395 | new_pdf = requests.post( 396 | "http://doctor:5050/utils/add/text/pdf/", files=files 397 | ) 398 | with NamedTemporaryFile(suffix=".pdf") as tmp: 399 | with open(tmp.name, "wb") as f: 400 | f.write(new_pdf.content) 401 | with open(tmp.name, "rb") as f: 402 | files = {"file": (tmp.name, f.read())} 403 | 404 | # Confirm that text is now embedded in the PDF 405 | response = requests.post( 406 | "http://doctor:5050/extract/doc/text/", 407 | files=files, 408 | data=data, 409 | ) 410 | self.assertIn( 411 | "(SlipOpinion) OCTOBER TERM, 2012", 412 | response.json()["content"], 413 | msg=f"Got {response.json()}", 414 | ) 415 | 416 | def test_get_document_number(self): 417 | """Check if the PACER document number is correctly extracted from 418 | documents from multiple jurisdictions. 
419 | """ 420 | 421 | filepath = f"{Path.cwd()}/doctor/test_assets/recap_documents/" 422 | for file in glob.glob(os.path.join(filepath, "*.pdf")): 423 | filename = os.path.relpath(file, filepath) 424 | filename_sans_ext = filename.split(".")[0] 425 | doc_num = filename_sans_ext.split("_")[1] 426 | 427 | with open(file, "rb") as f: 428 | files = {"file": (filename, f.read())} 429 | 430 | document_number = requests.post( 431 | "http://doctor:5050/utils/document-number/pdf/", 432 | files=files, 433 | ).text 434 | 435 | self.assertEqual(doc_num, document_number) 436 | 437 | 438 | class RedactionTest(unittest.TestCase): 439 | def test_xray_no_pdf(self): 440 | """Are we able to discover bad redacts?""" 441 | filepath = f"{Path.cwd()}/doctor/test_assets/x-ray/" 442 | test_files = ( 443 | "*yes*.pdf", 444 | "*no*.pdf", 445 | ) 446 | for pattern in test_files: 447 | direction = re.search("yes", pattern) 448 | for file in glob.glob(os.path.join(filepath, pattern)): 449 | filename = os.path.relpath(file, filepath) 450 | 451 | with open(file, "rb") as f: 452 | files = {"file": (filename, f.read())} 453 | response = requests.post( 454 | "http://doctor:5050/utils/check-redactions/pdf/", 455 | files=files, 456 | ) 457 | # Break up the assertion so that testers can see which 458 | # part is actually failing 459 | self.assertTrue(response.ok) 460 | bb = response.json() 461 | self.assertFalse(bb["error"]) 462 | if not direction: 463 | self.assertTrue(len(bb["results"]) == 0) 464 | else: 465 | self.assertFalse(len(bb["results"]) == 0) 466 | 467 | 468 | class ImageDisclosuresTest(unittest.TestCase): 469 | def test_images_to_pdf(self): 470 | """Do we create a PDF from several tiffs successfully?""" 471 | base = "https://com-courtlistener-storage.s3-us-west-2.amazonaws.com/financial-disclosures/2011/A-E/Armstrong-SB%20J3.%2009.%20CAN_R_11/Armstrong-SB%20J3.%2009.%20CAN_R_11_Page" 472 | sorted_urls = [ 473 | f"{base}_1.tiff", 474 | f"{base}_2.tiff", 475 | ] 476 | params = {"sorted_urls": 
json.dumps(sorted_urls)} 477 | response = requests.post( 478 | "http://doctor:5050/convert/images/pdf/", 479 | params=params, 480 | ) 481 | self.assertEqual(response.status_code, 200, msg="Failed status code.") 482 | self.assertEqual( 483 | b"%PDF-1.3\n", 484 | response.content[:9], 485 | msg="PDF generation failed", 486 | ) 487 | 488 | 489 | class AudioConversionTests(unittest.TestCase): 490 | """Test Audio Conversion""" 491 | 492 | def test_wma_to_mp3(self): 493 | """Can we convert to mp3 with metadata""" 494 | 495 | audio_details = { 496 | "court_full_name": "Testing Supreme Court", 497 | "court_short_name": "Testing Supreme Court", 498 | "court_pk": "mad", 499 | "court_url": "http://www.example.com/", 500 | "docket_number": "docket number 1 005", 501 | "date_argued": "2020-01-01", 502 | "date_argued_year": "2020", 503 | "case_name": "SEC v. Frank J. Custable, Jr.", 504 | "case_name_full": "case name full", 505 | "case_name_short": "short", 506 | "download_url": "http://media.ca7.uscourts.gov/sound/external/gw.15-1442.15-1442_07_08_2015.mp3", 507 | } 508 | 509 | files = make_file(filename="1.wma") 510 | response = requests.post( 511 | "http://doctor:5050/convert/audio/mp3/", 512 | files=files, 513 | params=audio_details, 514 | ) 515 | self.assertEqual(response.status_code, 200, msg="Bad status code") 516 | 517 | # Validate some metadata in the MP3. 518 | with NamedTemporaryFile(suffix=".mp3") as tmp: 519 | with open(tmp.name, "wb") as mp3_data: 520 | mp3_data.write(response.content) 521 | mp3_file = eyed3.load(tmp.name) 522 | 523 | self.assertEqual( 524 | mp3_file.tag.publisher, 525 | "Free Law Project", 526 | msg="Publisher metadata failed.", 527 | ) 528 | self.assertEqual( 529 | mp3_file.tag.title, 530 | "SEC v. Frank J. 
Custable, Jr.", 531 | msg="Title metadata failed.", 532 | ) 533 | self.assertEqual( 534 | mp3_file.type, 535 | eyed3.core.AUDIO_MP3, 536 | msg="Audio conversion to mp3 failed.", 537 | ) 538 | 539 | def test_audio_duration(self): 540 | files = make_file(filename="1.mp3") 541 | response = requests.post( 542 | "http://doctor:5050/utils/audio/duration/", 543 | files=files, 544 | ) 545 | self.assertEqual(51.64, float(response.text), msg="Bad duration") 546 | 547 | 548 | class TestFailedValidations(unittest.TestCase): 549 | def test_for_400s(self): 550 | """Test validation for missing audio file""" 551 | response = requests.post( 552 | "http://doctor:5050/utils/audio/duration/", 553 | ) 554 | self.assertEqual(response.status_code, 400, msg="Wrong validation") 555 | 556 | def test_pdf_400s(self): 557 | """Test validation for missing PDF file""" 558 | response = requests.post( 559 | "http://doctor:5050/extract/doc/text/", 560 | ) 561 | self.assertEqual( 562 | "Failed validation", 563 | response.text, 564 | msg="Wrong validation error", 565 | ) 566 | self.assertEqual(response.status_code, 400, msg="Wrong validation") 567 | 568 | def test_pdf_400_mime(self): 569 | """Test return 400 on missing file for mime extraction""" 570 | response = requests.post( 571 | "http://doctor:5050/utils/mime-type/", 572 | params={"mime": True}, 573 | ) 574 | self.assertEqual(response.status_code, 400, msg="Wrong validation") 575 | 576 | 577 | class TestRecapWhitespaceInsertions(unittest.TestCase): 578 | """Test our whitespace insertion code""" 579 | 580 | def test_insert_whitespace_new_line(self): 581 | content = "foo" 582 | word = { 583 | "line_num": 2, 584 | "par_num": 1, 585 | "left": 50, 586 | "top": 200, 587 | "width": 10, 588 | "height": 20, 589 | } 590 | prev = { 591 | "line_num": 1, 592 | "par_num": 1, 593 | "left": 10, 594 | "top": 100, 595 | "width": 30, 596 | "height": 20, 597 | } 598 | result = insert_whitespace(content, word, prev) 599 | self.assertEqual(result, "foo\n ") 600 | 601 
    def test_insert_whitespace_new_paragraph(self):
        # A new paragraph number should also force a newline.
        content = "foo"
        word = {
            "line_num": 1,
            "par_num": 2,
            "left": 50,
            "top": 200,
            "width": 10,
            "height": 20,
        }
        prev = {
            "line_num": 2,
            "par_num": 1,
            "left": 10,
            "top": 100,
            "width": 30,
            "height": 20,
        }
        result = insert_whitespace(content, word, prev)
        self.assertEqual(result, "foo\n ")

    def test_insert_whitespace_vertical_gap(self):
        # A large vertical jump (top 100 -> 300) should produce a blank line.
        content = "foo"
        word = {
            "line_num": 2,
            "par_num": 1,
            "left": 50,
            "top": 300,
            "width": 10,
            "height": 20,
        }
        prev = {
            "line_num": 1,
            "par_num": 1,
            "left": 10,
            "top": 100,
            "width": 30,
            "height": 20,
        }
        result = insert_whitespace(content, word, prev)
        self.assertEqual(result, "foo\n\n ")

    def test_insert_whitespace_horizontal_gap(self):
        # Same line, but a wide horizontal gap between words.
        content = "foo"
        word = {
            "line_num": 1,
            "par_num": 1,
            "left": 200,
            "top": 100,
            "width": 10,
            "height": 20,
        }
        prev = {
            "line_num": 1,
            "par_num": 1,
            "left": 10,
            "top": 100,
            "width": 30,
            "height": 20,
        }
        result = insert_whitespace(content, word, prev)
        self.assertEqual(result, "foo ")

    def test_insert_whitespace_no_gap(self):
        # Adjacent words on the same line: nothing should be inserted.
        content = "foo"
        word = {
            "line_num": 1,
            "par_num": 1,
            "left": 50,
            "top": 100,
            "width": 10,
            "height": 20,
        }
        prev = {
            "line_num": 1,
            "par_num": 1,
            "left": 40,
            "top": 100,
            "width": 10,
            "height": 20,
        }
        result = insert_whitespace(content, word, prev)
        self.assertEqual(result, "foo")


class TestOCRConfidenceTests(unittest.TestCase):
    """Test our OCR confidence checking functions."""
    # NOTE(review): 612 is presumably the page width in PDF points
    # (8.5in x 72) — confirm against get_word's signature.

    def test_confidence_zero(self):
        # Zero confidence words are dropped and replaced with whitespace.
        word_dict = {"text": "foo", "conf": 0, "left": 10, "width": 30}
        result = get_word(word_dict, 612, True)
        self.assertEqual(result, " ")

    def test_confidence_low_and_in_margin(self):
        # Low confidence inside the left margin: dropped.
        word_dict = {"text": "foo", "conf": 30, "left": 5, "width": 20}
        result = get_word(word_dict, 612, True)
        self.assertEqual(result, " ")

    def test_confidence_below_threshold_short_word(self):
        # Low-confidence words in the body are masked with box glyphs.
        word_dict = {"text": "foo", "conf": 3, "left": 200, "width": 20}
        result = get_word(word_dict, 612, True)
        self.assertEqual(result, "□□□ ")

    def test_confidence_below_threshold_long_word(self):
        word_dict = {
            "text": "foobarbazfoobarbazfoobar",
            "conf": 3,
            "left": 200,
            "width": 200,
        }
        result = get_word(word_dict, 612, True)
        self.assertEqual(result, "□□□□□□□□□□□□□□□□□□□□□□□□ ")

    def test_confidence_below_threshold_in_right_margin(self):
        word_dict = {"text": "foo", "conf": 30, "left": 580, "width": 10}
        result = get_word(word_dict, 612, True)
        self.assertEqual(result, "□□□ ")

    def test_valid_word_high_confidence(self):
        # High-confidence words pass through unchanged.
        word_dict = {"text": "foo", "conf": 90, "left": 50, "width": 20}
        result = get_word(word_dict, 612, True)
        self.assertEqual(result, "foo ")

    def test_word_on_left_edge(self):
        word_dict = {"text": "foo", "conf": 50, "left": 0, "width": 20}
        result = get_word(word_dict, 612, True)
        self.assertEqual(result, " ")


class TestWhiteSpaceRemoval(unittest.TestCase):
    # NOTE(review): the leading whitespace inside these literals is the
    # behavior under test — verify it matches the repository copy exactly.
    def test_left_shift(self):
        """Can we properly shift our text left?"""
        document = """
foo
bar
foo
bar"""
        expected_result = """ foo
bar
foo
bar"""
        result = remove_excess_whitespace(document)
        self.assertEqual(result, expected_result)

    def test_left_shift_when_artifact_exists(self):
        """Shift left once"""
        document = """
foo
bar
| foo
bar"""
        expected_result = """ foo
bar
| foo
bar"""
        result = remove_excess_whitespace(document)
        self.assertEqual(result, expected_result)


class TestCleanupContent(unittest.TestCase):
    def setUp(self):
        # Patch the functions before each test method so cleanup_content is
        # tested in isolation; the patched helpers pass text through.
        patcher1 = patch(
            "doctor.lib.text_extraction.adjust_caption_lines",
            side_effect=lambda x: x,
        )
        patcher2 = patch(
            "doctor.lib.text_extraction.remove_excess_whitespace",
            side_effect=lambda x: x,
        )
        self.mock_adjust = patcher1.start()
        self.mock_remove_whitespace = patcher2.start()
        self.addCleanup(patcher1.stop)
        self.addCleanup(patcher2.stop)

    def test_remove_floating_pipes(self):
        """Can we remove a pipe"""
        content = "This is a test line | \nAnother line"
        expected_result = "This is a test line\nAnother line\n"
        result = cleanup_content(content, 2)
        self.assertEqual(result, expected_result)

    def test_remove_floating_artifacts_right_side(self):
        """Can we remove an artifact on the far right"""
        content = "This is a test line e \nAnother line"
        expected_result = "This is a test line\nAnother line\n"
        result = cleanup_content(content, 2)
        self.assertEqual(result, expected_result)

    def test_remove_floating_pipes_and_artifacts(self):
        """Test to remove just the period"""
        content = "This is a test line | and the content continues\nThis is another test line e \nFinal line"
        expected_result = "This is a test line | and the content continues\nThis is another test line\nFinal line\n"
        result = cleanup_content(content, 2)
        self.assertEqual(result, expected_result)

    def test_no_floating_pipes_or_artifacts(self):
        """Test that no floating pipes are an issue"""
        content = (
            "This is a test line JW-6\nAnother line\n"
        )
        expected_result = (
            "This is a test line JW-6\nAnother line\n\n"
        )
        result = cleanup_content(content, 2)
        self.assertEqual(result, expected_result)

    def test_adjust_caption(self):
        """Test if we can align the caption correctly"""
        # NOTE(review): the column alignment inside these literals is what
        # adjust_caption_lines manipulates — confirm spacing against the
        # repository copy before relying on this transcription.
        content = """ 10
LESLIE MASSEY, ) Case No.: 2:16-cv-05001 GJS
)
oe ) PROPOSED} ORDER AWARDING
12 Plaintiff, ) EQUAL ACCESS TO JUSTICE ACT
) ATTORNEY FEES AND EXPENSES
13 VS. ) PURSUANT TO 28 U.S.C. § 2412(d)
NANCY A. BERRYHILL, Acting ) AND COSTS PURSUANT TO 28
14 || Commissioner of Social Security, ) U.S.C. § 1920
15 Defendant )
16 ) """

        expected_result = """ 10
LESLIE MASSEY, ) Case No.: 2:16-cv-05001 GJS
)
oe ) PROPOSED} ORDER AWARDING
12 Plaintiff, ) EQUAL ACCESS TO JUSTICE ACT
) ATTORNEY FEES AND EXPENSES
13 VS. ) PURSUANT TO 28 U.S.C. § 2412(d)
NANCY A. BERRYHILL, Acting ) AND COSTS PURSUANT TO 28
14 || Commissioner of Social Security, ) U.S.C. § 1920
15 Defendant )
16 ) """
        content = adjust_caption_lines(content)
        self.assertEqual(expected_result, content)


if __name__ == "__main__":
    unittest.main()
from django.urls import path, re_path

from . import views

urlpatterns = [
    # Server
    path("", views.heartbeat, name="heartbeat"),
    # Text extraction
    path(
        "extract/doc/text/",
        views.extract_doc_content,
        name="convert-doc-to-text",
    ),
    path(
        "extract/recap/text/",
        views.extract_recap_document,
        name="extract-recap-document",
    ),
    # Conversion
    path("convert/image/pdf/", views.image_to_pdf, name="image-to-pdf"),
    path("convert/images/pdf/", views.images_to_pdf, name="images-to-pdf"),
    path("convert/pdf/thumbnail/", views.make_png_thumbnail, name="thumbnail"),
    path(
        "convert/pdf/thumbnails/",
        views.make_png_thumbnails_from_range,
        name="thumbnails",
    ),
    # NOTE(review): this pattern is unanchored (no ^...$), and Django's
    # RegexPattern matches with re.search — confirm mid-path matches are
    # intended before anchoring it.
    re_path(
        "convert/audio/(mp3|ogg)/", views.convert_audio, name="convert-audio"
    ),
    # Utilities
    path("utils/page-count/pdf/", views.page_count, name="page_count"),
    path("utils/mime-type/", views.extract_mime_type, name="mime_type"),
    path(
        "utils/file/extension/", views.extract_extension, name="file-extension"
    ),
    path(
        "utils/audio/duration/",
        views.fetch_audio_duration,
        name="audio-duration",
    ),
    path("utils/add/text/pdf/", views.embed_text, name="add-text-to-pdf"),
    path(
        "utils/document-number/pdf/",
        views.get_document_number,
        name="document-number-pdf",
    ),
    path("utils/check-redactions/pdf/", views.xray, name="xray-pdf"),
]
import logging
import mimetypes
import re
import shutil
from http.client import BAD_REQUEST
from tempfile import NamedTemporaryFile, TemporaryDirectory

import eyed3
import img2pdf
import magic
import pytesseract
import requests
from django.core.exceptions import BadRequest
from django.http import FileResponse, HttpResponse, JsonResponse
from lxml.etree import ParserError, XMLSyntaxError
from PIL import Image
from PyPDF2 import PdfReader, PdfWriter
from pytesseract import Output

from doctor.forms import (
    AudioForm,
    BaseFileForm,
    DocumentForm,
    ImagePdfForm,
    MimeForm,
    ThumbnailForm,
)
from doctor.lib.utils import (
    cleanup_form,
    log_sentry_event,
    make_page_with_text,
    make_png_thumbnail_for_instance,
    make_png_thumbnails,
    strip_metadata_from_path,
)
from doctor.tasks import (
    convert_tiff_to_pdf_bytes,
    convert_to_mp3,
    convert_to_ogg,
    download_images,
    extract_from_doc,
    extract_from_docx,
    extract_from_html,
    extract_from_pdf,
    extract_from_txt,
    extract_from_wpd,
    extract_recap_pdf,
    get_document_number_from_pdf,
    get_page_count,
    get_xray,
    make_pdftotext_process,
    rasterize_pdf,
    set_mp3_meta_data,
    strip_metadata_from_bytes,
)

logger = logging.getLogger(__name__)


def heartbeat(request) -> HttpResponse:
    """Heartbeat endpoint

    :param request: The request object
    :return: Heartbeat
    """
    return HttpResponse("Heartbeat detected.")


def image_to_pdf(request) -> HttpResponse:
    """Convert an uploaded TIFF image into a metadata-stripped PDF.

    :param request: The request object carrying the uploaded image file
    :return: The PDF bytes, or a 400 response on failed validation
    """
    form = DocumentForm(request.POST, request.FILES)
    if not form.is_valid():
        return HttpResponse("Failed validation", status=BAD_REQUEST)
    image = Image.open(form.cleaned_data["fp"])
    pdf_bytes = convert_tiff_to_pdf_bytes(image)
    cleaned_pdf_bytes = strip_metadata_from_bytes(pdf_bytes)
    # NOTE(review): the temp file below is written but never read or
    # returned — the response is built from cleaned_pdf_bytes directly.
    # Looks like dead code; confirm before removing.
    with NamedTemporaryFile(suffix=".pdf") as output:
        with open(output.name, "wb") as f:
            f.write(cleaned_pdf_bytes)
        cleanup_form(form)
    return HttpResponse(cleaned_pdf_bytes)


def extract_recap_document(request) -> JsonResponse:
    """Extract Recap Documents

    :param request: The request object
    :return: JsonResponse
""" 91 | form = DocumentForm(request.GET, request.FILES) 92 | if not form.is_valid(): 93 | return JsonResponse( 94 | { 95 | "err": "Failed validation", 96 | }, 97 | status=BAD_REQUEST, 98 | ) 99 | filepath = form.cleaned_data["fp"] 100 | strip_margin = form.cleaned_data["strip_margin"] 101 | content, extracted_by_ocr = extract_recap_pdf( 102 | filepath=filepath, 103 | strip_margin=strip_margin, 104 | ) 105 | cleanup_form(form) 106 | return JsonResponse( 107 | { 108 | "content": content, 109 | "extracted_by_ocr": extracted_by_ocr, 110 | } 111 | ) 112 | 113 | 114 | def extract_doc_content(request) -> JsonResponse | HttpResponse: 115 | """Extract txt from different document types. 116 | 117 | :return: The content of a document/error message. 118 | :type: json object 119 | """ 120 | form = DocumentForm(request.GET, request.FILES) 121 | if not form.is_valid(): 122 | return HttpResponse("Failed validation", status=BAD_REQUEST) 123 | ocr_available = form.cleaned_data["ocr_available"] 124 | extension = form.cleaned_data["extension"] 125 | fp = form.cleaned_data["fp"] 126 | extracted_by_ocr = False 127 | err = "" 128 | # We keep the original file name to use it for debugging purposes, you can find it in local_path (Opinion) field 129 | # or filepath_local (AbstractPDF). 
130 | original_filename = form.cleaned_data["original_filename"] 131 | try: 132 | if extension == "pdf": 133 | content, err, returncode, extracted_by_ocr = extract_from_pdf( 134 | fp, original_filename, ocr_available 135 | ) 136 | elif extension == "doc": 137 | content, err, returncode = extract_from_doc(fp) 138 | elif extension == "docx": 139 | content, err, returncode = extract_from_docx(fp) 140 | elif extension == "html": 141 | content, err, returncode = extract_from_html(fp) 142 | elif extension == "txt": 143 | content, err, returncode = extract_from_txt(fp) 144 | elif extension == "wpd": 145 | content, err, returncode = extract_from_wpd(fp) 146 | else: 147 | returncode = 1 148 | err = "Unable to extract content due to unknown extension" 149 | content = "" 150 | 151 | if returncode != 0: 152 | log_sentry_event( 153 | logger=logger, 154 | level=logging.ERROR, 155 | message="Unable to extract document content", 156 | extra={ 157 | "file_name": original_filename, 158 | "err": err, 159 | }, 160 | exc_info=True, 161 | ) 162 | pass 163 | 164 | except (XMLSyntaxError, ParserError) as e: 165 | error_message = "HTML cleaning failed due to ParserError." 166 | if isinstance(e, XMLSyntaxError): 167 | error_message = "HTML cleaning failed due to XMLSyntaxError." 168 | 169 | log_sentry_event( 170 | logger=logger, 171 | level=logging.ERROR, 172 | message=error_message, 173 | extra={ 174 | "file_name": original_filename, 175 | "exception_type": type(e).__name__, 176 | "exception_message": str(e), 177 | }, 178 | exc_info=True, 179 | ) 180 | content = "Unable to extract the content from this file. Please try reading the original." 
181 | 182 | # Get page count if you can 183 | page_count = get_page_count(fp, extension) 184 | cleanup_form(form) 185 | return JsonResponse( 186 | { 187 | "content": content, 188 | "err": err, 189 | "extension": extension, 190 | "extracted_by_ocr": extracted_by_ocr, 191 | "page_count": page_count, 192 | } 193 | ) 194 | 195 | 196 | def make_png_thumbnail(request) -> HttpResponse: 197 | """Make a thumbnail of the first page of a PDF and return it. 198 | 199 | :return: A response containing our file and any errors 200 | :type: HTTPS response 201 | """ 202 | form = ThumbnailForm(request.POST, request.FILES) 203 | if not form.is_valid(): 204 | return HttpResponse("Failed validation", status=BAD_REQUEST) 205 | document = form.cleaned_data["file"] 206 | with NamedTemporaryFile(suffix=".pdf") as tmp: 207 | with open(tmp.name, "wb") as f: 208 | f.write(document.read()) 209 | thumbnail, _, _ = make_png_thumbnail_for_instance( 210 | tmp.name, form.cleaned_data["max_dimension"] 211 | ) 212 | return HttpResponse(thumbnail) 213 | 214 | 215 | def make_png_thumbnails_from_range(request) -> HttpResponse: 216 | """Make a zip file that contains a thumbnail for each page requested. 
217 | 218 | :return: A response containing our zip and any errors 219 | :type: HTTPS response 220 | """ 221 | form = ThumbnailForm(request.POST, request.FILES) 222 | if not form.is_valid(): 223 | return HttpResponse("Failed validation", status=BAD_REQUEST) 224 | 225 | directory = TemporaryDirectory() 226 | with NamedTemporaryFile(suffix=".pdf", mode="r+b") as temp_pdf: 227 | temp_pdf.write(form.cleaned_data["file"].read()) 228 | 229 | make_png_thumbnails( 230 | temp_pdf.name, 231 | form.cleaned_data["max_dimension"], 232 | form.cleaned_data["pages"], 233 | directory, 234 | ) 235 | 236 | with NamedTemporaryFile(suffix=".zip") as tmp_zip: 237 | filename = shutil.make_archive( 238 | f"{tmp_zip.name[:-4]}", "zip", directory.name 239 | ) 240 | return FileResponse(open(filename, "rb")) 241 | 242 | 243 | def xray(request) -> JsonResponse: 244 | """Check PDF for bad redactions 245 | 246 | :return: json with bounding boxes and text 247 | """ 248 | try: 249 | form = DocumentForm(request.POST, request.FILES) 250 | if not form.is_valid(): 251 | return JsonResponse( 252 | {"error": True, "msg": "Failed validation"}, status=BAD_REQUEST 253 | ) 254 | extension = form.cleaned_data["extension"] 255 | if extension.casefold() != "pdf": 256 | return JsonResponse( 257 | {"error": True, "msg": "Failed file type"}, status=BAD_REQUEST 258 | ) 259 | results = get_xray(form.cleaned_data["fp"]) 260 | if results.get("error", False): 261 | return JsonResponse(results, status=BAD_REQUEST) 262 | except Exception: 263 | pass 264 | finally: 265 | cleanup_form(form) 266 | return JsonResponse({"error": False, "results": results}) 267 | 268 | 269 | def page_count(request) -> HttpResponse: 270 | """Get page count from PDF 271 | 272 | :return: Page count 273 | """ 274 | form = DocumentForm(request.POST, request.FILES) 275 | if not form.is_valid(): 276 | return HttpResponse("Failed validation", status=BAD_REQUEST) 277 | extension = form.cleaned_data["extension"] 278 | pg_count = 
get_page_count(form.cleaned_data["fp"], extension) 279 | cleanup_form(form) 280 | return HttpResponse(pg_count) 281 | 282 | 283 | def extract_mime_type(request) -> JsonResponse | HttpResponse: 284 | """Identify the mime type of a document 285 | 286 | :return: Mime type 287 | """ 288 | form = DocumentForm(request.GET, request.FILES) 289 | if not form.is_valid(): 290 | return HttpResponse("Failed validation", status=BAD_REQUEST) 291 | mime = form.cleaned_data["mime"] 292 | mimetype = magic.from_file(form.cleaned_data["fp"], mime=mime) 293 | cleanup_form(form) 294 | return JsonResponse({"mimetype": mimetype}) 295 | 296 | 297 | def extract_extension(request) -> HttpResponse: 298 | """A handful of workarounds for getting extensions we can trust.""" 299 | form = MimeForm(request.GET, request.FILES) 300 | if not form.is_valid(): 301 | return HttpResponse("Failed validation", status=BAD_REQUEST) 302 | content = form.cleaned_data["file"].read() 303 | 304 | file_str = magic.from_buffer(content) 305 | if file_str.startswith("Composite Document File V2 Document"): 306 | # Workaround for issue with libmagic1==5.09-2 in Ubuntu 12.04. Fixed 307 | # in libmagic 5.11-2. 308 | mime = "application/msword" 309 | elif file_str == "(Corel/WP)": 310 | mime = "application/vnd.wordperfect" 311 | elif file_str == "C source, ASCII text": 312 | mime = "text/plain" 313 | elif file_str.startswith("WordPerfect document"): 314 | mime = "application/vnd.wordperfect" 315 | elif re.findall( 316 | r"(Audio file with ID3.*MPEG.*layer III)|(.*Audio Media.*)", file_str 317 | ): 318 | mime = "audio/mpeg" 319 | else: 320 | # No workaround necessary 321 | mime = magic.from_buffer(content, mime=True) 322 | extension = mimetypes.guess_extension(mime) 323 | if extension == ".obj": 324 | # It could be a wpd, if it's not a PDF 325 | if "PDF" in content[0:40]: 326 | # Does 'PDF' appear in the beginning of the content? 
327 | extension = ".pdf" 328 | else: 329 | extension = ".wpd" 330 | 331 | # The extension is .bin, look in the content if we can infer the 332 | # content type as pdf. See: https://bugs.astron.com/view.php?id=446 333 | if extension == ".bin": 334 | # Check if %PDF-X.X is in the first 1024 bytes of content 335 | pattern = rb"%PDF-[0-9]+(\.[0-9]+)?" 336 | matches = re.search(pattern, content[:1024]) 337 | if matches: 338 | # Document contains a pdf version, so the file must be a pdf 339 | extension = ".pdf" 340 | 341 | fixes = { 342 | ".htm": ".html", 343 | ".xml": ".html", 344 | ".wsdl": ".html", 345 | ".ksh": ".txt", 346 | ".asf": ".wma", 347 | ".dot": ".doc", 348 | } 349 | return HttpResponse(fixes.get(extension, extension).lower()) 350 | 351 | 352 | def pdf_to_text(request) -> JsonResponse | HttpResponse: 353 | """Extract text from text based PDFs immediately. 354 | 355 | :return: 356 | """ 357 | form = DocumentForm(request.POST, request.FILES) 358 | if not form.is_valid(): 359 | return HttpResponse("Failed validation", status=BAD_REQUEST) 360 | content, err, _ = make_pdftotext_process(form.cleaned_data["fp"]) 361 | cleanup_form(form) 362 | return JsonResponse( 363 | "content", 364 | content, 365 | "err", 366 | err, 367 | ) 368 | 369 | 370 | def images_to_pdf(request) -> HttpResponse: 371 | """ 372 | 373 | :param request: 374 | :return: 375 | """ 376 | form = ImagePdfForm(request.GET) 377 | if not form.is_valid(): 378 | raise BadRequest("Invalid form") 379 | sorted_urls = form.cleaned_data["sorted_urls"] 380 | 381 | if len(sorted_urls) > 1: 382 | image_list = download_images(sorted_urls) 383 | with NamedTemporaryFile(suffix=".pdf") as tmp: 384 | with open(tmp.name, "wb") as f: 385 | f.write(img2pdf.convert(image_list)) 386 | cleaned_pdf_bytes = strip_metadata_from_path(tmp.name) 387 | else: 388 | tiff_image = Image.open( 389 | requests.get(sorted_urls[0], stream=True, timeout=60 * 5).raw 390 | ) 391 | pdf_bytes = convert_tiff_to_pdf_bytes(tiff_image) 392 | 
cleaned_pdf_bytes = strip_metadata_from_bytes(pdf_bytes) 393 | return HttpResponse(cleaned_pdf_bytes, content_type="application/pdf") 394 | 395 | 396 | def fetch_audio_duration(request) -> HttpResponse: 397 | """Fetch audio duration from file.""" 398 | try: 399 | form = AudioForm(request.GET, request.FILES) 400 | if not form.is_valid(): 401 | return HttpResponse("Failed validation", status=BAD_REQUEST) 402 | with NamedTemporaryFile(suffix=".mp3") as tmp: 403 | with open(tmp.name, "wb") as f: 404 | for chunk in form.cleaned_data["file"].chunks(): 405 | f.write(chunk) 406 | mp3_file = eyed3.load(tmp.name) 407 | return HttpResponse(mp3_file.info.time_secs) 408 | except Exception as e: 409 | return HttpResponse(str(e)) 410 | 411 | 412 | def convert_audio(request, output_format: str) -> FileResponse | HttpResponse: 413 | """Converts an uploaded audio file to the specified output format and 414 | updates its metadata. 415 | 416 | :return: Converted audio 417 | """ 418 | form = AudioForm(request.GET, request.FILES) 419 | if not form.is_valid(): 420 | return HttpResponse("Failed validation", status=BAD_REQUEST) 421 | filepath = form.cleaned_data["fp"] 422 | media_file = form.cleaned_data["file"] 423 | audio_data = {k: v[0] for k, v in dict(request.GET).items()} 424 | match output_format: 425 | case "mp3": 426 | convert_to_mp3(filepath, media_file) 427 | set_mp3_meta_data(audio_data, filepath) 428 | case "ogg": 429 | convert_to_ogg(filepath, media_file) 430 | case _: 431 | raise NotImplementedError 432 | response = FileResponse( 433 | open(filepath, "rb") # noqa: SIM115 FileResponse closes the file 434 | ) 435 | cleanup_form(form) 436 | return response 437 | 438 | 439 | def embed_text(request) -> FileResponse | HttpResponse: 440 | """Embed text onto an image PDF. 
441 | 442 | :return: Embedded PDF 443 | """ 444 | form = DocumentForm(request.GET, request.FILES) 445 | if not form.is_valid(): 446 | return HttpResponse("Failed validation", status=BAD_REQUEST) 447 | fp = form.cleaned_data["fp"] 448 | with NamedTemporaryFile(suffix=".tiff") as destination: 449 | rasterize_pdf(fp, destination.name) 450 | data = pytesseract.image_to_data( 451 | destination.name, output_type=Output.DICT 452 | ) 453 | image = Image.open(destination.name) 454 | w, h = image.width, image.height 455 | output = PdfWriter() 456 | with open(fp, "rb") as f: 457 | existing_pdf = PdfReader(f) 458 | for page in range(0, len(existing_pdf.pages)): 459 | packet = make_page_with_text(page + 1, data, h, w) 460 | new_pdf = PdfReader(packet) 461 | page = existing_pdf.pages[page] 462 | page.merge_page(new_pdf.pages[0]) 463 | output.add_page(page) 464 | 465 | with NamedTemporaryFile(suffix=".pdf") as pdf_destination: 466 | with open(pdf_destination.name, "wb") as outputStream: 467 | output.write(outputStream) 468 | response = FileResponse( 469 | open( # noqa: SIM115 FileResponse closes the file 470 | pdf_destination.name, "rb" 471 | ) 472 | ) 473 | cleanup_form(form) 474 | return response 475 | 476 | 477 | def get_document_number(request) -> HttpResponse: 478 | """Get PACER document number from PDF 479 | 480 | :param request: The request object 481 | :return: PACER document number 482 | """ 483 | 484 | form = BaseFileForm(request.GET, request.FILES) 485 | if not form.is_valid(): 486 | validation_message = form.errors.get_json_data()["__all__"][0][ 487 | "message" 488 | ] 489 | return HttpResponse(validation_message, status=BAD_REQUEST) 490 | fp = form.cleaned_data["fp"] 491 | document_number = get_document_number_from_pdf(fp) 492 | cleanup_form(form) 493 | return HttpResponse(document_number) 494 | -------------------------------------------------------------------------------- /doctor/wsgi.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | WSGI config for doctor project. 3 | 4 | It exposes the WSGI callable as a module-level variable named ``application``. 5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/4.0/howto/deployment/wsgi/ 8 | """ 9 | 10 | import os 11 | 12 | from django.core.wsgi import get_wsgi_application 13 | 14 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "doctor.settings") 15 | 16 | application = get_wsgi_application() 17 | -------------------------------------------------------------------------------- /manage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Django's command-line utility for administrative tasks.""" 3 | 4 | import os 5 | import sys 6 | 7 | 8 | def main(): 9 | """Run administrative tasks.""" 10 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "doctor.settings") 11 | try: 12 | from django.core.management import execute_from_command_line 13 | except ImportError as exc: 14 | raise ImportError( 15 | "Couldn't import Django. Are you sure it's installed and " 16 | "available on your PYTHONPATH environment variable? Did you " 17 | "forget to activate a virtual environment?" 
18 | ) from exc 19 | execute_from_command_line(sys.argv) 20 | 21 | 22 | if __name__ == "__main__": 23 | main() 24 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "doctor" 3 | version = "1" 4 | description = "CourtListener doctor" 5 | requires-python = ">=3.10" 6 | dependencies = [ 7 | "certifi", 8 | "chardet>=3.0.4", 9 | "django>=3.2,<4", 10 | "django-environ>=0.8.1", 11 | "eyed3", 12 | "gunicorn==20.1", 13 | "idna==2.10", 14 | "img2pdf", 15 | "lxml>=4.5.2", 16 | "lxml-html-clean", 17 | "numpy>=1.19.1", 18 | "opencv-python>=4.2.0.32", 19 | "pandas>=1.1.1", 20 | "pdf2image>=1.7.1", 21 | "pdfplumber", 22 | "pillow>=8.0.1", 23 | "pkginfo==1.5.0.1", 24 | "pypdf2[crypto]", 25 | "pytesseract>=0.3.5", 26 | "python-magic", 27 | "reportlab", 28 | "requests>=2.25", 29 | "seal-rookery>=2.2.1", 30 | "sentry-sdk", 31 | "six>=1.15", 32 | "urllib3>=1.25.10", 33 | "x-ray==0.3.3", 34 | ] 35 | 36 | [dependency-groups] 37 | dev = [ 38 | "ipython", 39 | ] 40 | 41 | [tool.ruff] 42 | line-length = 79 43 | lint.select = [ 44 | # flake8-bugbear 45 | "B", 46 | # flake8-comprehensions 47 | "C4", 48 | # pycodestyle 49 | "E", 50 | # Pyflakes errors 51 | "F", 52 | # isort 53 | "I", 54 | # flake8-simplify 55 | "SIM", 56 | # flake8-tidy-imports 57 | "TID", 58 | # pyupgrade 59 | "UP", 60 | # Pyflakes warnings 61 | "W", 62 | ] 63 | lint.ignore = [ 64 | # flake8-bugbear opinionated rules 65 | "B9", 66 | # line-too-long 67 | "E501", 68 | # suppressible-exception 69 | "SIM105", 70 | # if-else-block-instead-of-if-exp 71 | "SIM108", 72 | ] 73 | --------------------------------------------------------------------------------