├── .gitattributes
├── .github
    └── workflows
    │   └── build-sphinx.yml
├── .gitignore
├── .graphics
    ├── all-results-example.png
    ├── azure-create-computer-vision.png
    ├── azure-create-resource.png
    ├── azure-dashboard.png
    ├── azure-get-keys.png
    ├── azure-no-resources.png
    ├── azure-start-free-account.png
    ├── caltech-round.png
    ├── caltech-round.svg
    ├── clara-barton-page.jpg
    ├── clara-barton-page.png
    ├── example-tsv-file.png
    ├── glaser-example-google.jpg
    ├── glaser-example-google.png
    ├── google-api-create-credentials-button.png
    ├── google-api-credentials-menu.png
    ├── google-bounding-box-example.png
    ├── google-cloud-add-api.png
    ├── google-cloud-new-project.png
    ├── google-cloud-platform-api-option.png
    ├── google-cloud-platform.png
    ├── google-create-credentials.png
    ├── google-create-oauth-client-id.png
    ├── google-create-service-account-key.png
    ├── google-create-service-account.png
    ├── google-download-json.png
    ├── google-gcp-add-billing-account.png
    ├── google-gcp-billing.png
    ├── google-gcp-create-project.png
    ├── google-gcp-create-service-account-key.png
    ├── google-gcp-free.png
    ├── google-gcp-new-project.png
    ├── google-gcp-no-billing.png
    ├── google-gcp-role.png
    ├── google-oauth-consent-info.png
    ├── google-oauth-create-client-id.png
    ├── google-select-project.png
    ├── google-service-account-info.png
    ├── handprint-asciinema.png
    ├── handprint-demo.gif
    ├── handprint-preview-image.png
    ├── linux-32.png
    ├── mac-os-32.png
    ├── noun_Hand_733265.png
    ├── noun_Hand_733265.svg
    ├── os-windows-32.png
    └── sample-annotated-image.png
├── CHANGES.md
├── CITATION.cff
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── Makefile
├── Pipfile
├── Pipfile.lock
├── README-PAPER.md
├── README.md
├── SUPPORT.md
├── bin
    └── handprint
├── codemeta.json
├── dev
    ├── icon
    │   ├── README.md
    │   ├── handprint-icon-white.png
    │   ├── handprint-icon-white.svg
    │   ├── handprint-icon.svg
    │   └── noun_Hand_733265.svg
    ├── scripts
    │   └── create-pyz
    └── services
    │   ├── google
    │       ├── Detect handwriting in images - Cloud Vision API - Google Cloud.pdf
    │       ├── Detect text in images - Cloud Vision API - Google Cloud.pdf
    │       ├── Document Text Tutorial - Cloud Vision API - Google Cloud.pdf
    │       ├── Google-vision-notes.md
    │       ├── Package google.cloud.vision.v1p4beta1 - Cloud Vision API.pdf
    │       ├── README.md
    │       ├── Types for Google Cloud Vision v1 API - google-cloud-vision documentation.pdf
    │       ├── google cloud vision api feature types 2018-10-25.pdf
    │       └── google.cloud.vision_v1.types.image_annotator - google-cloud-vision documentation.pdf
    │   └── sample-output
    │       ├── README.md
    │       ├── dag-304-DAG_1_1_8_0029.handprint-all.png
    │       ├── dag-304-DAG_1_1_8_0029.handprint-amazon-rekognition.json
    │       ├── dag-304-DAG_1_1_8_0029.handprint-amazon-rekognition.png
    │       ├── dag-304-DAG_1_1_8_0029.handprint-amazon-rekognition.txt
    │       ├── dag-304-DAG_1_1_8_0029.handprint-amazon-textract.json
    │       ├── dag-304-DAG_1_1_8_0029.handprint-amazon-textract.png
    │       ├── dag-304-DAG_1_1_8_0029.handprint-amazon-textract.txt
    │       ├── dag-304-DAG_1_1_8_0029.handprint-google.json
    │       ├── dag-304-DAG_1_1_8_0029.handprint-google.png
    │       ├── dag-304-DAG_1_1_8_0029.handprint-google.txt
    │       ├── dag-304-DAG_1_1_8_0029.handprint-microsoft.json
    │       ├── dag-304-DAG_1_1_8_0029.handprint-microsoft.png
    │       ├── dag-304-DAG_1_1_8_0029.handprint-microsoft.txt
    │       └── dag-304-DAG_1_1_8_0029.jp2
├── docs
    ├── .nojekyll
    ├── Makefile
    ├── README.md
    ├── _static
    │   ├── css
    │   │   └── custom.css
    │   ├── media
    │   │   ├── all-results-example.png
    │   │   ├── clara-barton-page.jpg
    │   │   ├── favicon.ico
    │   │   ├── glaser-example-google.jpg
    │   │   ├── handprint-icon-white.png
    │   │   ├── handprint-icon-white.svg
    │   │   └── handprint-icon.svg
    │   └── versions.json
    ├── advanced-usage.md
    ├── basic-usage.md
    ├── colophon.md
    ├── command-summary.md
    ├── conf.py
    ├── configuration.md
    ├── index.md
    ├── installation.md
    └── known-issues.md
├── handprint
    ├── __init__.py
    ├── __main__.py
    ├── comparison.py
    ├── credentials
    │   ├── __init__.py
    │   ├── amazon_auth.py
    │   ├── base.py
    │   ├── credentials_files.py
    │   ├── google_auth.py
    │   └── microsoft_auth.py
    ├── exceptions.py
    ├── exit_codes.py
    ├── images.py
    ├── main_body.py
    ├── manager.py
    └── services
    │   ├── __init__.py
    │   ├── amazon.py
    │   ├── base.py
    │   ├── google.py
    │   └── microsoft.py
├── pubs
    └── joss
    │   ├── .gitignore
    │   ├── H96566k.handprint-all.png
    │   ├── Makefile
    │   ├── README.md
    │   ├── notes.txt
    │   ├── paper.bib
    │   └── paper.md
├── requirements-dev.txt
├── requirements.txt
├── setup.cfg
├── setup.py
└── tests
    ├── data
        ├── README.md
        ├── bad-images
        │   └── corrupted-image.png
        ├── caltech-archives
        │   ├── caltech-archives-urls.txt
        │   └── glaser
        │   │   ├── DAG_5_1_6 1952-1957 Notebook VI p2.jpg
        │   │   ├── DAG_5_2_1 1950-1953 notebook VIII p7.jpg
        │   │   ├── README.md
        │   │   ├── dag-285-DAG_1_1_8_0003.jp2
        │   │   └── dag-304-DAG_1_1_8_0029.jp2
        ├── fragments
        │   ├── f1.png
        │   ├── f2.png
        │   ├── f6.png
        │   └── f7.png
        └── public-domain
        │   ├── LOC-urls.txt
        │   └── images
        │       ├── AGBell_Notebook.jpg
        │       ├── H96566k.jpg
        │       ├── README.md
        │       ├── clara-barton-life-of-my-childhood-p90.jpg
        │       └── mabel-h-bell-to-eliza-s-bell-sept-28-1879-1-1600.jpg
    ├── test_comparison.py
    ├── test_exceptions.py
    ├── test_exit_codes.py
    └── test_images.py


/.gitattributes:
--------------------------------------------------------------------------------
 1 | # -*- mode: sh; -*-
 2 | 
 3 | # Set the default behavior, in case people don't have core.autocrlf set.
 4 | # .............................................................................
 5 | 
 6 | * text=auto
 7 | 
 8 | # Specify what's text and should be normalized.
 9 | # .............................................................................
10 | 
11 | *.py text
12 | *.in text
13 | *.rst text
14 | *.cfg text
15 | *.ini text
16 | *.yml text
17 | *.json text
18 | *.bat text
19 | *.sh text
20 | LICENSE text
21 | CONTRIBUTING text
22 | 
23 | # Denote all files that are truly binary and should not be modified.
24 | # .............................................................................
25 | 
26 | *.png binary
27 | *.jpg binary
28 | *.xls binary
29 | *.doc binary
30 | 
31 | # This next one is because in other projects, we've had problems with git
32 | # getting confused about line endings when people using Windows and Mac edit
33 | # the same files.
34 | # .............................................................................
35 | 
36 | *.csv binary diff=csv
37 | 


--------------------------------------------------------------------------------
/.github/workflows/build-sphinx.yml:
--------------------------------------------------------------------------------
 1 | # =============================================================================
 2 | # @file    build-sphinx.yml
 3 | # @brief   GitHub Actions workflow to build Handprint docs using Sphinx + MyST
 4 | # @author  Michael Hucka <mhucka@caltech.edu>
 5 | # @license Please see the file named LICENSE in the project directory
 6 | # @website https://github.com/caltechlibrary/handprint
 7 | #
 8 | # This workflow file was originally based on work by GitHub user "peaceiris":
 9 | # https://github.com/peaceiris/actions-gh-pages#%EF%B8%8F-static-site-generators-with-python
10 | # =============================================================================
11 | 
12 | name: Update docs on GitHub
13 | 
14 | on:
15 |   push:
16 |     branches:
17 |       - main
18 | 
19 | jobs:
20 |   deploy:
21 |     runs-on: ubuntu-18.04
22 |     steps:
23 |       - uses: actions/checkout@v2
24 | 
25 |       - name: Setup Python
26 |         uses: actions/setup-python@v2
27 |         with:
28 |           python-version: '3.8'
29 | 
30 |       - name: Upgrade pip
31 |         run: |
32 |           # Install pip >= 20.1, which is needed for the "pip cache dir" command
33 |           python3 -m pip install --upgrade pip
34 | 
35 |       - name: Get pip cache dir
36 |         id: pip-cache
37 |         run: echo "::set-output name=dir::$(pip cache dir)"
38 | 
39 |       - name: Cache dependencies
40 |         uses: actions/cache@v2
41 |         with:
42 |           path: ${{ steps.pip-cache.outputs.dir }}
43 |           key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }}
44 |           restore-keys: |
45 |             ${{ runner.os }}-pip-
46 | 
47 |       - name: Install dependencies
48 |         run: |
49 |           python3 -m pip install "myst-parser[linkify]"
50 |           python3 -m pip install sphinx-material
51 |           python3 -m pip install sphinxcontrib-mermaid
52 | 
53 |       - name: Build
54 |         run: |
55 |           cd docs
56 |           make html
57 | 
58 |       - name: Deploy
59 |         uses: peaceiris/actions-gh-pages@v3
60 |         with:
61 |           github_token: ${{ secrets.GITHUB_TOKEN }}
62 |           publish_dir: ./docs/_build/html
63 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # -*- mode: sh-mode; -*-
 2 | 
 3 | # Compiled source
 4 | # .............................................................................
 5 | 
 6 | *.com
 7 | *.class
 8 | *.dll
 9 | *.exe
10 | *.o
11 | *.so
12 | 
13 | # Compressed archives & package files
14 | # .............................................................................
15 | 
16 | *.7z
17 | *.dmg
18 | *.gz
19 | *.iso
20 | *.jar
21 | *.rar
22 | *.tar
23 | *.zip
24 | 
25 | # OS-specific things to ignore:
26 | # .............................................................................
27 | 
28 | .DS_Store
29 | .DS_Store?
30 | ._*
31 | .Spotlight-V100
32 | .Trashes
33 | ehthumbs.db
34 | Thumbs.db
35 | 
36 | # Emacs-specific things to ignore:
37 | # .............................................................................
38 | 
39 | *~
40 | .#*
41 | .git/COMMIT_EDITMSG
42 | auto
43 | *.synctex.gz
44 | TAGS
45 | 
46 | # LaTeX-specific things to ignore:
47 | # .............................................................................
48 | 
49 | *.aux
50 | *.log
51 | *.bbl
52 | *.blg
53 | *.out
54 | *.toc
55 | *.loc
56 | *.mp
57 | 
58 | # Python-specific things to ignore:
59 | # .............................................................................
60 | 
61 | *.pyc
62 | __pycache__
63 | *.egg-info
64 | .eggs
65 | 
66 | # Project-specific things to ignore:
67 | # .............................................................................
68 | 
69 | ./build
70 | ./dist
71 | *-reduced.jpg
72 | *-reduced.png
73 | *.all-results.jpg
74 | *.all-results.png
75 | *.amazon-rekognition.jpg
76 | *.amazon-textract.jpg
77 | *.microsoft.jpg
78 | *.google.jpg
79 | *.amazon-rekognition.png
80 | *.amazon-textract.png
81 | *.microsoft.png
82 | *.google.png
83 | *.bak
84 | .README.md.tmp
85 | docs/_build
86 | 


--------------------------------------------------------------------------------
/.graphics/all-results-example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/.graphics/all-results-example.png


--------------------------------------------------------------------------------
/.graphics/azure-create-computer-vision.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/.graphics/azure-create-computer-vision.png


--------------------------------------------------------------------------------
/.graphics/azure-create-resource.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/.graphics/azure-create-resource.png


--------------------------------------------------------------------------------
/.graphics/azure-dashboard.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/.graphics/azure-dashboard.png


--------------------------------------------------------------------------------
/.graphics/azure-get-keys.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/.graphics/azure-get-keys.png


--------------------------------------------------------------------------------
/.graphics/azure-no-resources.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/.graphics/azure-no-resources.png


--------------------------------------------------------------------------------
/.graphics/azure-start-free-account.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/.graphics/azure-start-free-account.png


--------------------------------------------------------------------------------
/.graphics/caltech-round.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/.graphics/caltech-round.png


--------------------------------------------------------------------------------
/.graphics/clara-barton-page.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/.graphics/clara-barton-page.jpg


--------------------------------------------------------------------------------
/.graphics/clara-barton-page.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/.graphics/clara-barton-page.png


--------------------------------------------------------------------------------
/.graphics/example-tsv-file.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/.graphics/example-tsv-file.png


--------------------------------------------------------------------------------
/.graphics/glaser-example-google.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/.graphics/glaser-example-google.jpg


--------------------------------------------------------------------------------
/.graphics/glaser-example-google.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/.graphics/glaser-example-google.png


--------------------------------------------------------------------------------
/.graphics/google-api-create-credentials-button.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/.graphics/google-api-create-credentials-button.png


--------------------------------------------------------------------------------
/.graphics/google-api-credentials-menu.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/.graphics/google-api-credentials-menu.png


--------------------------------------------------------------------------------
/.graphics/google-bounding-box-example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/.graphics/google-bounding-box-example.png


--------------------------------------------------------------------------------
/.graphics/google-cloud-add-api.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/.graphics/google-cloud-add-api.png


--------------------------------------------------------------------------------
/.graphics/google-cloud-new-project.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/.graphics/google-cloud-new-project.png


--------------------------------------------------------------------------------
/.graphics/google-cloud-platform-api-option.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/.graphics/google-cloud-platform-api-option.png


--------------------------------------------------------------------------------
/.graphics/google-cloud-platform.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/.graphics/google-cloud-platform.png


--------------------------------------------------------------------------------
/.graphics/google-create-credentials.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/.graphics/google-create-credentials.png


--------------------------------------------------------------------------------
/.graphics/google-create-oauth-client-id.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/.graphics/google-create-oauth-client-id.png


--------------------------------------------------------------------------------
/.graphics/google-create-service-account-key.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/.graphics/google-create-service-account-key.png


--------------------------------------------------------------------------------
/.graphics/google-create-service-account.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/.graphics/google-create-service-account.png


--------------------------------------------------------------------------------
/.graphics/google-download-json.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/.graphics/google-download-json.png


--------------------------------------------------------------------------------
/.graphics/google-gcp-add-billing-account.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/.graphics/google-gcp-add-billing-account.png


--------------------------------------------------------------------------------
/.graphics/google-gcp-billing.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/.graphics/google-gcp-billing.png


--------------------------------------------------------------------------------
/.graphics/google-gcp-create-project.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/.graphics/google-gcp-create-project.png


--------------------------------------------------------------------------------
/.graphics/google-gcp-create-service-account-key.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/.graphics/google-gcp-create-service-account-key.png


--------------------------------------------------------------------------------
/.graphics/google-gcp-free.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/.graphics/google-gcp-free.png


--------------------------------------------------------------------------------
/.graphics/google-gcp-new-project.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/.graphics/google-gcp-new-project.png


--------------------------------------------------------------------------------
/.graphics/google-gcp-no-billing.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/.graphics/google-gcp-no-billing.png


--------------------------------------------------------------------------------
/.graphics/google-gcp-role.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/.graphics/google-gcp-role.png


--------------------------------------------------------------------------------
/.graphics/google-oauth-consent-info.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/.graphics/google-oauth-consent-info.png


--------------------------------------------------------------------------------
/.graphics/google-oauth-create-client-id.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/.graphics/google-oauth-create-client-id.png


--------------------------------------------------------------------------------
/.graphics/google-select-project.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/.graphics/google-select-project.png


--------------------------------------------------------------------------------
/.graphics/google-service-account-info.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/.graphics/google-service-account-info.png


--------------------------------------------------------------------------------
/.graphics/handprint-asciinema.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/.graphics/handprint-asciinema.png


--------------------------------------------------------------------------------
/.graphics/handprint-demo.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/.graphics/handprint-demo.gif


--------------------------------------------------------------------------------
/.graphics/handprint-preview-image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/.graphics/handprint-preview-image.png


--------------------------------------------------------------------------------
/.graphics/linux-32.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/.graphics/linux-32.png


--------------------------------------------------------------------------------
/.graphics/mac-os-32.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/.graphics/mac-os-32.png


--------------------------------------------------------------------------------
/.graphics/noun_Hand_733265.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/.graphics/noun_Hand_733265.png


--------------------------------------------------------------------------------
/.graphics/noun_Hand_733265.svg:
--------------------------------------------------------------------------------
1 | <svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" viewBox="0 0 85 113.75" version="1.1" x="0px" y="0px"><title>Shape</title><desc>Created with Sketch.</desc><g stroke="none" stroke-width="1" fill="none" fill-rule="evenodd"><g transform="translate(-926.000000, -136.000000)" fill="#000000"><path d="M999.441632,151.463774 C1000.84163,152.833774 1000.77163,155.243774 1000.44163,157.273774 C999.111632,164.993774 995.441632,171.963774 991.921632,178.763774 L996.071632,174.313774 C999.161632,170.993774 1002.51163,167.513774 1006.75163,166.623774 C1008.87163,166.173774 1010.75163,168.623774 1010.84163,171.023774 C1010.93163,173.423774 1009.76163,175.613774 1008.62163,177.643774 C1001.25442,190.784992 992.939051,203.371679 983.741632,215.303774 C981.953714,217.80073 979.872115,220.073702 977.541632,222.073774 C974.683751,224.377881 971.200171,225.771314 967.541632,226.073774 C966.321632,226.163774 966.021632,223.953774 966.941632,223.073774 C967.861632,222.193774 969.211632,222.173774 970.421632,221.943774 C974.361632,221.193774 977.351632,217.723774 979.971632,214.393774 C989.874527,202.090504 998.549228,188.84735 1005.87163,174.853774 C1006.44239,173.960683 1006.72191,172.912473 1006.67163,171.853774 C1006.50163,170.793774 1005.30163,169.913774 1004.52163,170.563774 C998.31578,175.680276 992.802745,181.583257 988.121632,188.123774 C987.535005,189.108676 986.731247,189.946637 985.771632,190.573774 C984.776678,191.165575 983.49299,190.880797 982.841632,189.923774 C982.161632,188.733774 982.841632,187.193774 983.541632,186.003774 C987.034965,179.903774 990.234965,173.613774 993.141632,167.133774 C994.611632,163.853774 996.031632,160.423774 996.141632,156.773774 C996.225494,156.224016 996.088928,155.663378 995.761632,155.213774 C995.091632,154.533774 994.031632,155.213774 993.361632,155.873774 C986.42095,162.866251 980.76772,171.027891 976.661632,179.983774 C975.931632,181.573774 975.151632,183.293774 973.661632,184.093774 
C972.171632,184.893774 969.991632,184.043774 969.971632,182.253774 C970.024104,181.792522 970.152503,181.343124 970.351632,180.923774 C974.071632,169.663774 979.471632,158.773774 980.231632,146.823774 C980.351632,145.003774 979.631632,142.453774 978.011632,142.823774 C977.191632,143.013774 976.711632,143.943774 976.351632,144.823774 C971.778115,155.328649 967.808317,166.086167 964.461632,177.043774 C964.162203,178.335071 963.611759,179.554881 962.841632,180.633774 C962.11317,181.700993 960.739961,182.117117 959.541632,181.633774 C957.851632,180.703774 958.021632,178.013774 958.421632,175.973774 C960.402168,165.886207 961.827645,155.697556 962.691632,145.453774 C962.840929,144.530758 962.772441,143.585623 962.491632,142.693774 C962.141632,141.843774 961.231632,141.193774 960.491632,141.533774 C959.863602,141.950285 959.430772,142.603137 959.291632,143.343774 C955.466995,154.899813 952.610475,166.754036 950.751632,178.783774 C950.321632,181.293774 949.971632,183.923774 948.581632,185.923774 C947.191632,187.923774 944.581632,189.243774 942.581632,187.993774 C940.581632,186.743774 940.151632,183.733774 939.731632,181.183774 C938.860936,175.998674 936.852868,171.070712 933.851632,166.753774 C933.332198,165.800688 932.465493,165.084403 931.431632,164.753774 C930.893231,164.680201 930.352699,164.873186 929.982634,165.271106 C929.612568,165.669025 929.459249,166.222118 929.571632,166.753774 C931.709951,173.332901 932.806525,180.205892 932.821632,187.123774 C932.663403,190.836357 932.69678,194.55463 932.921632,198.263774 C933.311632,201.953774 934.491632,205.703774 936.921632,208.263774 C937.388451,208.621152 937.708535,209.136844 937.821632,209.713774 C937.891632,211.103774 935.901632,211.173774 934.821632,210.413774 C932.321632,208.613774 930.901632,205.413774 930.321632,202.173774 C929.893155,198.905726 929.739283,195.60752 929.861632,192.313774 C929.804684,183.893523 928.555617,175.523763 926.151632,167.453774 C925.489078,164.625744 927.20845,161.786781 
930.021632,161.063774 C932.601632,160.523774 935.251632,162.003774 937.021632,164.153774 C938.69237,166.452476 939.982201,169.005126 940.841632,171.713774 L945.501632,184.013774 L949.701632,163.463774 C951.081632,156.713774 952.471632,149.923774 955.121632,143.613774 C955.821778,141.669762 956.937742,139.901966 958.391632,138.433774 C959.800664,137.020069 961.915501,136.579805 963.771632,137.313774 C966.971632,138.923774 966.971632,143.853774 966.451632,147.683774 L963.971632,166.333774 L970.971632,147.013774 C971.661725,144.729453 972.722526,142.574067 974.111632,140.633774 C975.581632,138.823774 977.821632,137.573774 979.971632,138.073774 C982.541632,138.663774 984.241632,141.573774 984.551632,144.453774 C984.861632,147.333774 984.091632,150.203774 983.321632,152.973774 L979.391632,167.173774 C983.391632,161.003774 987.611632,154.573774 993.841632,151.433774 C995.651632,150.533774 997.971632,150.013774 999.441632,151.463774 L999.441632,151.463774 Z" style="fill: rgb(255, 110, 30);"/></g></g></svg>


--------------------------------------------------------------------------------
/.graphics/os-windows-32.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/.graphics/os-windows-32.png


--------------------------------------------------------------------------------
/.graphics/sample-annotated-image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/.graphics/sample-annotated-image.png


--------------------------------------------------------------------------------
/CITATION.cff:
--------------------------------------------------------------------------------
 1 | # YAML 1.2
 2 | ---
 3 | authors: 
 4 |   -
 5 |     affiliation: "Caltech Library"
 6 |     given-names: Michael
 7 |     family-names: Hucka
 8 |     orcid: "0000-0001-9105-5960"
 9 | cff-version: "1.1.0"
10 | message: "If you use this software, please cite it using these metadata."
11 | repository-code: "https://github.com/caltechlibrary/handprint"
12 | title: "Handprint: Run handwritten text recognition services on images of documents"
13 | date-released: 2022-06-24
14 | version: "1.6.0"
15 | doi: 10.22002/20207
16 | keywords: 
17 |   - handwritten text recognition
18 |   - optical character recognition
19 |   - machine learning
20 |   - artificial intelligence
21 |   - cloud services
22 |   - document processing
23 | ...
24 | 


--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
 1 | Contributor Covenant Code of Conduct
 2 | ====================================
 3 | 
 4 | ## Our Pledge
 5 | 
 6 | In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation.
 7 | 
 8 | ## Our Standards
 9 | 
10 | Examples of behavior that contributes to creating a positive environment include:
11 | 
12 | * Using welcoming and inclusive language
13 | * Being respectful of differing viewpoints and experiences
14 | * Gracefully accepting constructive criticism
15 | * Focusing on what is best for the community
16 | * Showing empathy towards other community members
17 | 
18 | Examples of unacceptable behavior by participants include:
19 | 
20 | * The use of sexualized language or imagery and unwelcome sexual attention or advances
21 | * Trolling, insulting/derogatory comments, and personal or political attacks
22 | * Public or private harassment
23 | * Publishing others' private information, such as a physical or electronic address, without explicit permission
24 | * Other conduct which could reasonably be considered inappropriate in a professional setting
25 | 
26 | ## Our Responsibilities
27 | 
28 | Project contributors are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior.
29 | 
30 | Project contributors have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful.
31 | 
32 | ## Scope
33 | 
34 | This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project contributors.
35 | 
36 | ## Enforcement
37 | 
38 | If a contributor engages in harassing behavior, the project organizers may take any action they deem appropriate, including warning the offender or expelling them from online forums, online project resources, face-to-face meetings, or any other project-related activity or resource.
39 | 
40 | If you are being harassed, notice that someone else is being harassed, or have any other concerns, please contact a member of the project team immediately.  Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team.  All complaints will be reviewed and investigated and will result in a response that is deemed necessary and appropriate to the circumstances.  The project team is obligated to maintain confidentiality with regard to the reporter of an incident.  Further details of specific enforcement policies may be posted separately.
41 | 
42 | ## Attribution
43 | 
44 | Portions of this Code of Conduct were adapted from Electron's [Contributor Covenant Code of Conduct](https://github.com/electron/electron/blob/master/CODE_OF_CONDUCT.md), which itself was adapted from the [Contributor Covenant](http://contributor-covenant.org/version/1/4), version 1.4.
45 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # Guidelines for contributing to this project
 2 | 
 3 | Any constructive contributions &ndash; bug reports, pull requests (code or documentation), suggestions for improvements, and more &ndash; are welcome.
 4 | 
 5 | ## Conduct
 6 | 
 7 | Everyone is asked to read and respect the [code of conduct](CODE_OF_CONDUCT.md) before participating in this project.
 8 | 
 9 | ## Coordinating work
10 | 
11 | A quick way to find out what is currently on people's plates and our near-term plans is to look at the [GitHub issue tracker](https://github.com/caltechlibrary/handprint/issues) for this project, but the possibilities are not limited to what you see there &ndash; if you have ideas for new features and enhancements, please feel free to write them up as a new issue or contact the developers directly!
12 | 
13 | ## Submitting contributions
14 | 
15 | Please feel free to contact the author directly, or even better, jump right in and use the standard GitHub approach of forking the repo and creating a pull request.  When committing code changes and submitting pull requests, please write a clear log message for your commits.  One-line messages are fine for small changes, but bigger changes should look like this:
16 | 
17 |     $ git commit -m "A brief summary of the commit
18 |     > 
19 |     > A paragraph describing what changed and its impact."
20 | 
21 | 
22 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2018-2022, Caltech
 2 | 
 3 | Redistribution and use in source and binary forms, with or without
 4 | modification, are permitted provided that the following conditions are met:
 5 | 
 6 | 1. Redistributions of source code must retain the above copyright notice,
 7 | this list of conditions and the following disclaimer.
 8 | 
 9 | 2. Redistributions in binary form must reproduce the above copyright notice,
10 | this list of conditions and the following disclaimer in the documentation
11 | and/or other materials provided with the distribution.
12 | 
13 | 3. Neither the name of the copyright holder nor the names of its contributors
14 | may be used to endorse or promote products derived from this software without
15 | specific prior written permission.
16 | 
17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
21 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 | POSSIBILITY OF SUCH DAMAGE.
28 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
  1 | # =============================================================================
  2 | # @file    Makefile
  3 | # @brief   Makefile for some steps in creating new releases on GitHub
  4 | # @author  Michael Hucka
  5 | # @date    2020-08-11
  6 | # @license Please see the file named LICENSE in the project directory
  7 | # @website https://github.com/caltechlibrary/handprint
  8 | # =============================================================================
  9 | 
 10 | .ONESHELL: 				# Run all commands in the same shell.
 11 | .SHELLFLAGS += -e			# Exit at the first error.
 12 | 
 13 | # Before we go any further, test if certain programs are available.
 14 | # The following is based on the approach posted by Jonathan Ben-Avraham to
 15 | # Stack Overflow in 2014 at https://stackoverflow.com/a/25668869
 16 | 
 17 | PROGRAMS_NEEDED = curl gh git jq sed
 18 | TEST := $(foreach p,$(PROGRAMS_NEEDED),\
 19 | 	  $(if $(shell which $(p)),_,$(error Cannot find program "$(p)")))
 20 | 
 21 | 
 22 | # Gather values that we need ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 23 | 
 24 | $(info Gathering data -- this takes a few moments ...)
 25 | 
    | # Package metadata: the first matching "key = value" line in setup.cfg.
 26 | name	:= $(strip $(shell grep -m 1 'name\s*=' setup.cfg	  | cut -f2 -d'='))
 27 | version	:= $(strip $(shell grep -m 1 'version\s*=' setup.cfg	  | cut -f2 -d'='))
 28 | url	:= $(strip $(shell grep -m 1 'url\s*=' setup.cfg	  | cut -f2 -d'='))
 29 | desc	:= $(strip $(shell grep -m 1 'description\s*=' setup.cfg  | cut -f2 -d'='))
 30 | author	:= $(strip $(shell grep -m 1 'author\s*=' setup.cfg	  | cut -f2 -d'='))
 31 | email	:= $(strip $(shell grep -m 1 'author_email\s*=' setup.cfg | cut -f2 -d'='))
 32 | license	:= $(strip $(shell grep -m 1 'license\s*=' setup.cfg      | cut -f2 -d'='))
 33 | 
 34 | branch	  := $(shell git rev-parse --abbrev-ref HEAD)
 35 | repo	  := $(strip $(shell gh repo view | head -1 | cut -f2 -d':'))
 36 | id	  := $(shell curl -s https://api.github.com/repos/$(repo) | jq '.id')
 37 | id_url	  := https://data.caltech.edu/badge/latestdoi/$(id)
    | # Match the redirect header case-insensitively: HTTP/2 responses carry
    | # lowercase header names, which a case-sensitive "Locat" pattern misses.
 38 | doi_url	  := $(shell curl -sILk $(id_url) | grep -i '^location:' | cut -f2 -d' ')
 39 | doi	  := $(subst https://doi.org/,,$(doi_url))
 40 | doi_tail  := $(lastword $(subst ., ,$(doi)))
 41 | init_file := $(name)/__init__.py
 42 | 
 43 | $(info Gathering data ... Done.)
 44 | 
 45 | 
 46 | # make release ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 47 | 
 48 | release: | test-branch release-on-github print-instructions
 49 | 
 50 | test-branch:
 51 | ifneq ($(branch),main)
 52 | 	$(error Current git branch != main. Merge changes into main first)
 53 | endif
 54 | 
 55 | update-init:;
 56 | 	@sed -i .bak -e "s|^\(__version__ *=\).*|\1 '$(version)'|"  $(init_file)
 57 | 	@sed -i .bak -e "s|^\(__description__ *=\).*|\1 '$(desc)'|" $(init_file)
 58 | 	@sed -i .bak -e "s|^\(__url__ *=\).*|\1 '$(url)'|"	    $(init_file)
 59 | 	@sed -i .bak -e "s|^\(__author__ *=\).*|\1 '$(author)'|"    $(init_file)
 60 | 	@sed -i .bak -e "s|^\(__email__ *=\).*|\1 '$(email)'|"	    $(init_file)
 61 | 	@sed -i .bak -e "s|^\(__license__ *=\).*|\1 '$(license)'|"  $(init_file)
 62 | 
 63 | update-codemeta:;
 64 | 	@sed -i .bak -e "/version/ s/[0-9].[0-9][0-9]*.[0-9][0-9]*/$(version)/" codemeta.json
 65 | 
 66 | update-citation:;
 67 | 	$(eval date  := $(shell date "+%F"))
 68 | 	@sed -i .bak -e "/^date-released/ s/[0-9][0-9-]*/$(date)/" CITATION.cff
 69 | 	@sed -i .bak -e "/^version/ s/[0-9].[0-9][0-9]*.[0-9][0-9]*/$(version)/" CITATION.cff
 70 | 
 71 | edited := codemeta.json $(init_file) CITATION.cff
 72 | 
 73 | commit-updates:;
 74 | 	git add $(edited)
 75 | 	git diff-index --quiet HEAD $(edited) || \
 76 | 	    git commit -m"Update stored version number" $(edited)
 77 | 
 78 | release-on-github: | update-init update-codemeta commit-updates
 79 | 	$(eval tmp_file  := $(shell mktemp /tmp/release-notes-$(name).XXXX))
 80 | 	git push -v --all
 81 | 	git push -v --tags
 82 | 	$(info ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓)
 83 | 	$(info ┃ Write release notes in the file that gets opened in your  ┃)
 84 | 	$(info ┃ editor. Close the editor to complete the release process. ┃)
 85 | 	$(info ┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛)
 86 | 	sleep 2
 87 | 	$(EDITOR) $(tmp_file)
 88 | 	gh release create v$(version) -t "Release $(version)" -F $(tmp_file)
 89 | 
 90 | print-instructions:;
 91 | 	$(info ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓)
 92 | 	$(info ┃ Next steps:                                             ┃)
 93 | 	$(info ┃ 1. Visit https://github.com/$(repo)/releases )
 94 | 	$(info ┃ 2. Check the release                                    ┃)
 95 | 	$(info ┃ 3. Wait a few seconds to let web services do their work ┃)
 96 | 	$(info ┃ 4. Run "make update-doi" to update the DOI in README.md ┃)
 97 | 	$(info ┃ 5. Run "make packages" & check the results              ┃)
 98 | 	$(info ┃ 6. Run "make test-pypi" to push to test.pypi.org        ┃)
 99 | 	$(info ┃ 7. Check https://test.pypi.org/$(repo) )
100 | 	$(info ┃ 8. Run "make pypi" to push to pypi for real             ┃)
101 | 	$(info ┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛)
102 | 	@echo ""
103 | 
104 | update-doi: 
105 | 	sed -i .bak -e 's|/api/record/[0-9]\{1,\}|/api/record/$(doi_tail)|' README.md
106 | 	sed -i .bak -e 's|edu/records/[0-9]\{1,\}|edu/records/$(doi_tail)|' README.md
107 | 	sed -i .bak -e '/doi:/ s|10.22002/[0-9]\{1,\}|10.22002/$(doi_tail)|' CITATION.cff
108 | 	git add README.md CITATION.cff
109 | 	git diff-index --quiet HEAD README.md || \
110 | 	    (git commit -m"Update DOI" README.md && git push -v --all)
111 | 	git diff-index --quiet HEAD CITATION.cff || \
112 | 	    (git commit -m"Update DOI" CITATION.cff && git push -v --all)
113 | 
114 | packages: clean
115 | 	python3 setup.py sdist bdist_wheel
116 | 	python3 -m twine check dist/*
117 | 
118 | test-pypi: packages
119 | 	python3 -m twine upload --repository testpypi dist/$(name)-$(version)*.{whl,gz}
120 | 
121 | pypi: packages
122 | 	python3 -m twine upload dist/*
123 | 
124 | 
125 | # make executables ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
126 | 
127 | binaries binary: 
128 | 	mkdir -p dist/binary
129 | 	dev/scripts/create-pyz dist/binary 3.8.2
130 | 	dev/scripts/create-pyz dist/binary 3.9.2
131 | 	dev/scripts/create-pyz dist/binary 3.10.0
132 | 
133 | 
134 | # Cleanup and miscellaneous directives ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
135 | 
136 | clean: clean-dist clean-build clean-release clean-other
137 | 
138 | clean-dist:;
139 | 	-rm -fr dist/$(name) dist/$(name)-$(version).tar.gz \
140 | 	    dist/$(name)-$(version)-py3-none-any.whl dist/binary \
141 |             __pycache__ .eggs
142 | 
143 | clean-build:;
144 | 	-rm -rf build
145 | 
146 | clean-release:;
147 | 	-rm -rf $(name).egg-info codemeta.json.bak $(init_file).bak README.md.bak
148 | 
149 | clean-other:;
150 | 	-rm -fr $(name)/__pycache__
151 | 
152 | .PHONY: release release-on-github update-init update-codemeta \
153 | 	print-instructions packages clean test-pypi pypi
154 | 


--------------------------------------------------------------------------------
/Pipfile:
--------------------------------------------------------------------------------
 1 | [[source]]
 2 | url = "https://pypi.org/simple"
 3 | verify_ssl = true
 4 | name = "pypi"
 5 | 
 6 | [packages]
 7 | aenum = "==3.1.0"
 8 | appdirs = "==1.4.4"
 9 | boltons = "==21.0.0"
10 | boto3 = "==1.17.91"
11 | bun = "==0.0.7"
12 | commonpy = ">=1.9.0"
13 | fastnumbers = "==3.1.0"
14 | google-api-core = "==1.30.0"
15 | google-api-python-client = "==2.8.0"
16 | google-auth = "==1.30.2"
17 | google-auth-httplib2 = "==0.1.0"
18 | google-cloud = "==0.34.0"
19 | google-cloud-vision = "==2.3.1"
20 | googleapis-common-protos = "==1.53.0"
21 | grpcio = "==1.38.0"
22 | humanize = ">=3.7.1"
23 | imagesize = "==1.2.0"
24 | matplotlib = "==3.4.2"
25 | numpy = "==1.22.2"
26 | plac = "==1.3.3"
27 | psutil = "==5.8.0"
28 | requests = "==2.25.0"
29 | rich = "==10.1.0"
30 | sidetrack = "==2.0.0"
31 | textdistance = "==4.2.2"
32 | urllib3 = "==1.26.5"
33 | validator-collection = "==1.5.0"
34 | Pillow = "==9.0.1"
35 | PyMuPDF = "==1.19.6"
36 | StringDist = "==1.0.9"
37 | 
38 | [dev-packages]
39 | pytest-mock = "==3.7.0"
40 | 


--------------------------------------------------------------------------------
/README-PAPER.md:
--------------------------------------------------------------------------------
1 | # About the JOSS paper
2 | 
3 | This repository uses a separate branch named `joss-paper` to store the files for the Handprint JOSS paper submission made in March 2022. That branch has a separate change history and different file structure. Changes to the paper are not reflected in the change history of the `main` or `develop` branches of the repository.
4 | 
5 | 


--------------------------------------------------------------------------------
/SUPPORT.md:
--------------------------------------------------------------------------------
1 | Support
2 | =======
3 | 
4 | Thank you for your interest in this project.  If you are experiencing problems or have questions, the following are the preferred methods of reaching someone:
5 | 
6 | 1. Report a new issue using the [issue tracker](https://github.com/caltechlibrary/handprint/issues).
7 | 2. Send email to the Caltech Library: [helpdesk@library.caltech.edu](mailto:helpdesk@library.caltech.edu).
8 | 3. Send email to an individual involved in the project. People's names appear in the top-level `README.md` file in the source code repository.
9 | 


--------------------------------------------------------------------------------
/bin/handprint:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # =============================================================================
 3 | # @file    handprint
 4 | # @brief   Simple interface to run handprint, for testing and exploration
 5 | # @author  Michael Hucka <mhucka@caltech.edu>
 6 | # @license Please see the file named LICENSE in the project directory
 7 | # @website https://github.com/caltechlibrary/handprint
 8 | # =============================================================================
 9 | 
10 | # Modules needed to set up the import path and run the command-line interface.
11 | import os
12 | import sys
13 | import plac
14 | 
15 | # Allow this program to be executed directly from the 'bin' directory.
16 | try:
17 |     thisdir = os.path.dirname(os.path.abspath(__file__))
18 |     sys.path.append(os.path.join(thisdir, '..'))
19 | except:
20 |     sys.path.append('..')
21 | 
22 | # Hand over to the command line interface.
23 | import handprint
24 | from handprint.__main__ import main as main
25 | 
26 | if __name__ == "__main__":
27 |     plac.call(main)
28 | 


--------------------------------------------------------------------------------
/codemeta.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
 3 |   "@type": "SoftwareSourceCode",
 4 |   "description": "Apply different handwritten text recognition services and algorithms to handwritten documents.",
 5 |   "name": "Handprint",
 6 |   "codeRepository": "https://github.com/caltechlibrary/handprint",
 7 |   "issueTracker": "https://github.com/caltechlibrary/handprint/issues",
 8 |   "license": "https://github.com/caltechlibrary/handprint/blob/master/LICENSE",
 9 |   "version": "1.6.0",
10 |   "author": [
11 |     {
12 |       "@type": "Person",
13 |       "givenName": "Michael",
14 |       "familyName": "Hucka",
15 |       "affiliation": "Caltech Library",
16 |       "email": "mhucka@caltech.edu",
17 |       "@id": "https://orcid.org/0000-0001-9105-5960"
18 |     }],
19 |   "developmentStatus": "active",
20 |   "keywords": [
21 |     "handwritten text recognition",
22 |     "HTR",
23 |     "OCR",
24 |     "machine learning"
25 |   ],
26 |   "maintainer": "https://orcid.org/0000-0001-9105-5960",
27 |   "programmingLanguage": "Python"
28 | }
29 | 


--------------------------------------------------------------------------------
/dev/icon/README.md:
--------------------------------------------------------------------------------
1 | The [vector artwork](https://thenounproject.com/search/?q=733265&i=733265) used as a starting point for the logo for this repository was created by ["Kevin"](https://thenounproject.com/kevn/) for the [Noun Project](https://thenounproject.com).  It is licensed under the Creative Commons [Attribution 3.0 Unported](https://creativecommons.org/licenses/by/3.0/deed.en) license.  The vector graphic was modified by Mike Hucka to change the color.
2 | 


--------------------------------------------------------------------------------
/dev/icon/handprint-icon-white.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/dev/icon/handprint-icon-white.png


--------------------------------------------------------------------------------
/dev/icon/handprint-icon-white.svg:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="utf-8"?>
 2 | <svg viewBox="0 0 85 113.75" version="1.1" x="0px" y="0px" xmlns="http://www.w3.org/2000/svg">
 3 |   <title>Shape</title>
 4 |   <desc>Created with Sketch.</desc>
 5 |   <g stroke="none" stroke-width="1" fill="none" fill-rule="evenodd">
 6 |     <g transform="translate(-926.000000, -136.000000)" fill="#000000">
 7 |       <path d="M999.441632,151.463774 C1000.84163,152.833774 1000.77163,155.243774 1000.44163,157.273774 C999.111632,164.993774 995.441632,171.963774 991.921632,178.763774 L996.071632,174.313774 C999.161632,170.993774 1002.51163,167.513774 1006.75163,166.623774 C1008.87163,166.173774 1010.75163,168.623774 1010.84163,171.023774 C1010.93163,173.423774 1009.76163,175.613774 1008.62163,177.643774 C1001.25442,190.784992 992.939051,203.371679 983.741632,215.303774 C981.953714,217.80073 979.872115,220.073702 977.541632,222.073774 C974.683751,224.377881 971.200171,225.771314 967.541632,226.073774 C966.321632,226.163774 966.021632,223.953774 966.941632,223.073774 C967.861632,222.193774 969.211632,222.173774 970.421632,221.943774 C974.361632,221.193774 977.351632,217.723774 979.971632,214.393774 C989.874527,202.090504 998.549228,188.84735 1005.87163,174.853774 C1006.44239,173.960683 1006.72191,172.912473 1006.67163,171.853774 C1006.50163,170.793774 1005.30163,169.913774 1004.52163,170.563774 C998.31578,175.680276 992.802745,181.583257 988.121632,188.123774 C987.535005,189.108676 986.731247,189.946637 985.771632,190.573774 C984.776678,191.165575 983.49299,190.880797 982.841632,189.923774 C982.161632,188.733774 982.841632,187.193774 983.541632,186.003774 C987.034965,179.903774 990.234965,173.613774 993.141632,167.133774 C994.611632,163.853774 996.031632,160.423774 996.141632,156.773774 C996.225494,156.224016 996.088928,155.663378 995.761632,155.213774 C995.091632,154.533774 994.031632,155.213774 993.361632,155.873774 C986.42095,162.866251 980.76772,171.027891 976.661632,179.983774 C975.931632,181.573774 975.151632,183.293774 973.661632,184.093774 C972.171632,184.893774 969.991632,184.043774 969.971632,182.253774 C970.024104,181.792522 970.152503,181.343124 970.351632,180.923774 C974.071632,169.663774 979.471632,158.773774 980.231632,146.823774 C980.351632,145.003774 979.631632,142.453774 978.011632,142.823774 C977.191632,143.013774 976.711632,143.943774 
976.351632,144.823774 C971.778115,155.328649 967.808317,166.086167 964.461632,177.043774 C964.162203,178.335071 963.611759,179.554881 962.841632,180.633774 C962.11317,181.700993 960.739961,182.117117 959.541632,181.633774 C957.851632,180.703774 958.021632,178.013774 958.421632,175.973774 C960.402168,165.886207 961.827645,155.697556 962.691632,145.453774 C962.840929,144.530758 962.772441,143.585623 962.491632,142.693774 C962.141632,141.843774 961.231632,141.193774 960.491632,141.533774 C959.863602,141.950285 959.430772,142.603137 959.291632,143.343774 C955.466995,154.899813 952.610475,166.754036 950.751632,178.783774 C950.321632,181.293774 949.971632,183.923774 948.581632,185.923774 C947.191632,187.923774 944.581632,189.243774 942.581632,187.993774 C940.581632,186.743774 940.151632,183.733774 939.731632,181.183774 C938.860936,175.998674 936.852868,171.070712 933.851632,166.753774 C933.332198,165.800688 932.465493,165.084403 931.431632,164.753774 C930.893231,164.680201 930.352699,164.873186 929.982634,165.271106 C929.612568,165.669025 929.459249,166.222118 929.571632,166.753774 C931.709951,173.332901 932.806525,180.205892 932.821632,187.123774 C932.663403,190.836357 932.69678,194.55463 932.921632,198.263774 C933.311632,201.953774 934.491632,205.703774 936.921632,208.263774 C937.388451,208.621152 937.708535,209.136844 937.821632,209.713774 C937.891632,211.103774 935.901632,211.173774 934.821632,210.413774 C932.321632,208.613774 930.901632,205.413774 930.321632,202.173774 C929.893155,198.905726 929.739283,195.60752 929.861632,192.313774 C929.804684,183.893523 928.555617,175.523763 926.151632,167.453774 C925.489078,164.625744 927.20845,161.786781 930.021632,161.063774 C932.601632,160.523774 935.251632,162.003774 937.021632,164.153774 C938.69237,166.452476 939.982201,169.005126 940.841632,171.713774 L945.501632,184.013774 L949.701632,163.463774 C951.081632,156.713774 952.471632,149.923774 955.121632,143.613774 C955.821778,141.669762 956.937742,139.901966 
958.391632,138.433774 C959.800664,137.020069 961.915501,136.579805 963.771632,137.313774 C966.971632,138.923774 966.971632,143.853774 966.451632,147.683774 L963.971632,166.333774 L970.971632,147.013774 C971.661725,144.729453 972.722526,142.574067 974.111632,140.633774 C975.581632,138.823774 977.821632,137.573774 979.971632,138.073774 C982.541632,138.663774 984.241632,141.573774 984.551632,144.453774 C984.861632,147.333774 984.091632,150.203774 983.321632,152.973774 L979.391632,167.173774 C983.391632,161.003774 987.611632,154.573774 993.841632,151.433774 C995.651632,150.533774 997.971632,150.013774 999.441632,151.463774 L999.441632,151.463774 Z" style="fill: rgb(255, 255, 255);"/>
 8 |     </g>
 9 |   </g>
10 | </svg>


--------------------------------------------------------------------------------
/dev/icon/handprint-icon.svg:
--------------------------------------------------------------------------------
1 | <svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" viewBox="0 0 85 113.75" version="1.1" x="0px" y="0px"><title>Shape</title><desc>Created with Sketch.</desc><g stroke="none" stroke-width="1" fill="none" fill-rule="evenodd"><g transform="translate(-926.000000, -136.000000)" fill="#000000"><path d="M999.441632,151.463774 C1000.84163,152.833774 1000.77163,155.243774 1000.44163,157.273774 C999.111632,164.993774 995.441632,171.963774 991.921632,178.763774 L996.071632,174.313774 C999.161632,170.993774 1002.51163,167.513774 1006.75163,166.623774 C1008.87163,166.173774 1010.75163,168.623774 1010.84163,171.023774 C1010.93163,173.423774 1009.76163,175.613774 1008.62163,177.643774 C1001.25442,190.784992 992.939051,203.371679 983.741632,215.303774 C981.953714,217.80073 979.872115,220.073702 977.541632,222.073774 C974.683751,224.377881 971.200171,225.771314 967.541632,226.073774 C966.321632,226.163774 966.021632,223.953774 966.941632,223.073774 C967.861632,222.193774 969.211632,222.173774 970.421632,221.943774 C974.361632,221.193774 977.351632,217.723774 979.971632,214.393774 C989.874527,202.090504 998.549228,188.84735 1005.87163,174.853774 C1006.44239,173.960683 1006.72191,172.912473 1006.67163,171.853774 C1006.50163,170.793774 1005.30163,169.913774 1004.52163,170.563774 C998.31578,175.680276 992.802745,181.583257 988.121632,188.123774 C987.535005,189.108676 986.731247,189.946637 985.771632,190.573774 C984.776678,191.165575 983.49299,190.880797 982.841632,189.923774 C982.161632,188.733774 982.841632,187.193774 983.541632,186.003774 C987.034965,179.903774 990.234965,173.613774 993.141632,167.133774 C994.611632,163.853774 996.031632,160.423774 996.141632,156.773774 C996.225494,156.224016 996.088928,155.663378 995.761632,155.213774 C995.091632,154.533774 994.031632,155.213774 993.361632,155.873774 C986.42095,162.866251 980.76772,171.027891 976.661632,179.983774 C975.931632,181.573774 975.151632,183.293774 973.661632,184.093774 
C972.171632,184.893774 969.991632,184.043774 969.971632,182.253774 C970.024104,181.792522 970.152503,181.343124 970.351632,180.923774 C974.071632,169.663774 979.471632,158.773774 980.231632,146.823774 C980.351632,145.003774 979.631632,142.453774 978.011632,142.823774 C977.191632,143.013774 976.711632,143.943774 976.351632,144.823774 C971.778115,155.328649 967.808317,166.086167 964.461632,177.043774 C964.162203,178.335071 963.611759,179.554881 962.841632,180.633774 C962.11317,181.700993 960.739961,182.117117 959.541632,181.633774 C957.851632,180.703774 958.021632,178.013774 958.421632,175.973774 C960.402168,165.886207 961.827645,155.697556 962.691632,145.453774 C962.840929,144.530758 962.772441,143.585623 962.491632,142.693774 C962.141632,141.843774 961.231632,141.193774 960.491632,141.533774 C959.863602,141.950285 959.430772,142.603137 959.291632,143.343774 C955.466995,154.899813 952.610475,166.754036 950.751632,178.783774 C950.321632,181.293774 949.971632,183.923774 948.581632,185.923774 C947.191632,187.923774 944.581632,189.243774 942.581632,187.993774 C940.581632,186.743774 940.151632,183.733774 939.731632,181.183774 C938.860936,175.998674 936.852868,171.070712 933.851632,166.753774 C933.332198,165.800688 932.465493,165.084403 931.431632,164.753774 C930.893231,164.680201 930.352699,164.873186 929.982634,165.271106 C929.612568,165.669025 929.459249,166.222118 929.571632,166.753774 C931.709951,173.332901 932.806525,180.205892 932.821632,187.123774 C932.663403,190.836357 932.69678,194.55463 932.921632,198.263774 C933.311632,201.953774 934.491632,205.703774 936.921632,208.263774 C937.388451,208.621152 937.708535,209.136844 937.821632,209.713774 C937.891632,211.103774 935.901632,211.173774 934.821632,210.413774 C932.321632,208.613774 930.901632,205.413774 930.321632,202.173774 C929.893155,198.905726 929.739283,195.60752 929.861632,192.313774 C929.804684,183.893523 928.555617,175.523763 926.151632,167.453774 C925.489078,164.625744 927.20845,161.786781 
930.021632,161.063774 C932.601632,160.523774 935.251632,162.003774 937.021632,164.153774 C938.69237,166.452476 939.982201,169.005126 940.841632,171.713774 L945.501632,184.013774 L949.701632,163.463774 C951.081632,156.713774 952.471632,149.923774 955.121632,143.613774 C955.821778,141.669762 956.937742,139.901966 958.391632,138.433774 C959.800664,137.020069 961.915501,136.579805 963.771632,137.313774 C966.971632,138.923774 966.971632,143.853774 966.451632,147.683774 L963.971632,166.333774 L970.971632,147.013774 C971.661725,144.729453 972.722526,142.574067 974.111632,140.633774 C975.581632,138.823774 977.821632,137.573774 979.971632,138.073774 C982.541632,138.663774 984.241632,141.573774 984.551632,144.453774 C984.861632,147.333774 984.091632,150.203774 983.321632,152.973774 L979.391632,167.173774 C983.391632,161.003774 987.611632,154.573774 993.841632,151.433774 C995.651632,150.533774 997.971632,150.013774 999.441632,151.463774 L999.441632,151.463774 Z" style="fill: rgb(255, 110, 30);"/></g></g></svg>


--------------------------------------------------------------------------------
/dev/icon/noun_Hand_733265.svg:
--------------------------------------------------------------------------------
1 | <svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" viewBox="0 0 85 113.75" version="1.1" x="0px" y="0px"><title>Shape</title><desc>Created with Sketch.</desc><g stroke="none" stroke-width="1" fill="none" fill-rule="evenodd"><g transform="translate(-926.000000, -136.000000)" fill="#000000"><path d="M999.441632,151.463774 C1000.84163,152.833774 1000.77163,155.243774 1000.44163,157.273774 C999.111632,164.993774 995.441632,171.963774 991.921632,178.763774 L996.071632,174.313774 C999.161632,170.993774 1002.51163,167.513774 1006.75163,166.623774 C1008.87163,166.173774 1010.75163,168.623774 1010.84163,171.023774 C1010.93163,173.423774 1009.76163,175.613774 1008.62163,177.643774 C1001.25442,190.784992 992.939051,203.371679 983.741632,215.303774 C981.953714,217.80073 979.872115,220.073702 977.541632,222.073774 C974.683751,224.377881 971.200171,225.771314 967.541632,226.073774 C966.321632,226.163774 966.021632,223.953774 966.941632,223.073774 C967.861632,222.193774 969.211632,222.173774 970.421632,221.943774 C974.361632,221.193774 977.351632,217.723774 979.971632,214.393774 C989.874527,202.090504 998.549228,188.84735 1005.87163,174.853774 C1006.44239,173.960683 1006.72191,172.912473 1006.67163,171.853774 C1006.50163,170.793774 1005.30163,169.913774 1004.52163,170.563774 C998.31578,175.680276 992.802745,181.583257 988.121632,188.123774 C987.535005,189.108676 986.731247,189.946637 985.771632,190.573774 C984.776678,191.165575 983.49299,190.880797 982.841632,189.923774 C982.161632,188.733774 982.841632,187.193774 983.541632,186.003774 C987.034965,179.903774 990.234965,173.613774 993.141632,167.133774 C994.611632,163.853774 996.031632,160.423774 996.141632,156.773774 C996.225494,156.224016 996.088928,155.663378 995.761632,155.213774 C995.091632,154.533774 994.031632,155.213774 993.361632,155.873774 C986.42095,162.866251 980.76772,171.027891 976.661632,179.983774 C975.931632,181.573774 975.151632,183.293774 973.661632,184.093774 
C972.171632,184.893774 969.991632,184.043774 969.971632,182.253774 C970.024104,181.792522 970.152503,181.343124 970.351632,180.923774 C974.071632,169.663774 979.471632,158.773774 980.231632,146.823774 C980.351632,145.003774 979.631632,142.453774 978.011632,142.823774 C977.191632,143.013774 976.711632,143.943774 976.351632,144.823774 C971.778115,155.328649 967.808317,166.086167 964.461632,177.043774 C964.162203,178.335071 963.611759,179.554881 962.841632,180.633774 C962.11317,181.700993 960.739961,182.117117 959.541632,181.633774 C957.851632,180.703774 958.021632,178.013774 958.421632,175.973774 C960.402168,165.886207 961.827645,155.697556 962.691632,145.453774 C962.840929,144.530758 962.772441,143.585623 962.491632,142.693774 C962.141632,141.843774 961.231632,141.193774 960.491632,141.533774 C959.863602,141.950285 959.430772,142.603137 959.291632,143.343774 C955.466995,154.899813 952.610475,166.754036 950.751632,178.783774 C950.321632,181.293774 949.971632,183.923774 948.581632,185.923774 C947.191632,187.923774 944.581632,189.243774 942.581632,187.993774 C940.581632,186.743774 940.151632,183.733774 939.731632,181.183774 C938.860936,175.998674 936.852868,171.070712 933.851632,166.753774 C933.332198,165.800688 932.465493,165.084403 931.431632,164.753774 C930.893231,164.680201 930.352699,164.873186 929.982634,165.271106 C929.612568,165.669025 929.459249,166.222118 929.571632,166.753774 C931.709951,173.332901 932.806525,180.205892 932.821632,187.123774 C932.663403,190.836357 932.69678,194.55463 932.921632,198.263774 C933.311632,201.953774 934.491632,205.703774 936.921632,208.263774 C937.388451,208.621152 937.708535,209.136844 937.821632,209.713774 C937.891632,211.103774 935.901632,211.173774 934.821632,210.413774 C932.321632,208.613774 930.901632,205.413774 930.321632,202.173774 C929.893155,198.905726 929.739283,195.60752 929.861632,192.313774 C929.804684,183.893523 928.555617,175.523763 926.151632,167.453774 C925.489078,164.625744 927.20845,161.786781 
930.021632,161.063774 C932.601632,160.523774 935.251632,162.003774 937.021632,164.153774 C938.69237,166.452476 939.982201,169.005126 940.841632,171.713774 L945.501632,184.013774 L949.701632,163.463774 C951.081632,156.713774 952.471632,149.923774 955.121632,143.613774 C955.821778,141.669762 956.937742,139.901966 958.391632,138.433774 C959.800664,137.020069 961.915501,136.579805 963.771632,137.313774 C966.971632,138.923774 966.971632,143.853774 966.451632,147.683774 L963.971632,166.333774 L970.971632,147.013774 C971.661725,144.729453 972.722526,142.574067 974.111632,140.633774 C975.581632,138.823774 977.821632,137.573774 979.971632,138.073774 C982.541632,138.663774 984.241632,141.573774 984.551632,144.453774 C984.861632,147.333774 984.091632,150.203774 983.321632,152.973774 L979.391632,167.173774 C983.391632,161.003774 987.611632,154.573774 993.841632,151.433774 C995.651632,150.533774 997.971632,150.013774 999.441632,151.463774 L999.441632,151.463774 Z"/></g></g><text x="0" y="106" fill="#000000" font-size="5px" font-weight="bold" font-family="'Helvetica Neue', Helvetica, Arial-Unicode, Arial, Sans-serif">Created by Kevin</text><text x="0" y="111" fill="#000000" font-size="5px" font-weight="bold" font-family="'Helvetica Neue', Helvetica, Arial-Unicode, Arial, Sans-serif">from the Noun Project</text></svg>


--------------------------------------------------------------------------------
/dev/services/google/Detect handwriting in images - Cloud Vision API - Google Cloud.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/dev/services/google/Detect handwriting in images - Cloud Vision API - Google Cloud.pdf


--------------------------------------------------------------------------------
/dev/services/google/Detect text in images - Cloud Vision API - Google Cloud.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/dev/services/google/Detect text in images - Cloud Vision API - Google Cloud.pdf


--------------------------------------------------------------------------------
/dev/services/google/Document Text Tutorial - Cloud Vision API - Google Cloud.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/dev/services/google/Document Text Tutorial - Cloud Vision API - Google Cloud.pdf


--------------------------------------------------------------------------------
/dev/services/google/Google-vision-notes.md:
--------------------------------------------------------------------------------
 1 | # Notes about the Google Vision text recognition API
 2 | 
 3 | These notes correspond to v1 and v1p4beta1. I obtained the Python package from PyPI. These notes correspond to [version 2.3.1](https://pypi.org/project/google-cloud-vision/2.3.1/), released on 2021-04-13.
 4 | 
 5 | ## Calling the service
 6 | 
 7 | The basic API is obtained by importing `google.cloud.vision_v1`.  For example,
 8 | 
 9 | ```python
10 | from google.cloud import vision_v1 as gv
11 | ```
12 | 
13 | To invoke image recognition, you need a client object of type `ImageAnnotatorClient` and an `ImageContext` object. To set certain parameters, you also need a `TextDetectionParams` object. In principle, the parameters available are described in the [documentation for `TextDetectionParams`](https://cloud.google.com/vision/docs/reference/rpc/google.cloud.vision.v1#google.cloud.vision.v1.TextDetectionParams); however, as of 2021-06-10, **only `enable_text_detection_confidence_score` is supported** by `TextDetectionParams`, which you can verify by looking at the [API documentation for `TextDetectionParams`](https://googleapis.dev/python/vision/latest/vision_v1/types.html?highlight=textdetectionparam#google.cloud.vision_v1.types.TextDetectionParams) as well as the [source code in GitHub](https://googleapis.dev/python/vision/latest/_modules/google/cloud/vision_v1/types/image_annotator.html#TextDetectionParams). `ImageContext` also takes a `language_hints` parameter, which, for English handwriting, should be set to `"en-t-i0-handwrit"`. 
14 | 
15 | All together, this leads to the following code to get the client and context objects:
16 | 
17 | ```python
18 | client  = gv.ImageAnnotatorClient()
19 | params  = gv.TextDetectionParams(mapping = { 'enable_text_detection_confidence_score': True })
20 | context = gv.ImageContext(language_hints = ['en-t-i0-handwrit'], text_detection_params = params)
21 | ```
22 | 
23 | If you do not include the parameters for the confidence score, the results come back with _some_ confidence scores, but not as many.
24 | 
25 | Next, you need to create an object containing the image to be uploaded to Google. Assuming the raw bytes of a PNG (or similar) image are stored in the variable `image`, you can do this as follows:
26 | 
27 | ```python
28 | img = gv.Image(content = image)
29 | ```
30 | 
31 | And now you can invoke the text recognition service on the image. There are two flavors: `TEXT_DETECTION` and `DOCUMENT_TEXT_DETECTION`. Both are used for OCR, but as described in the [Google Vision API documentation](https://cloud.google.com/vision/docs/ocr#optical_character_recognition_ocr), the `DOCUMENT_TEXT_DETECTION` service is "optimized for dense text and documents", and thus presumably more suited to handling scanned documents. (My testing on pretty easy text pages bears this out; the results from `TEXT_DETECTION` were worse.)
32 | 
33 | ```python
34 | response = client.document_text_detection(image = img, image_context = context)
35 | ```
36 | 
37 | More information about this service can be found in the section of Google's docs titled [Detect handwriting in images](https://cloud.google.com/vision/docs/handwriting), and in particular the section on [specifying the language](https://cloud.google.com/vision/docs/handwriting#specify_the_language_optional), which has a box explaining how the language hint works.
38 | 
39 | To help figure out how to parse the results, the [Document Text Tutorial](https://cloud.google.com/vision/docs/fulltext-annotations) is worth reading. In the end, I came up with code by inspecting the results and figuring out what the different parts were. See in particular the [sample code for `doctext.py`](https://github.com/googleapis/python-vision/blob/HEAD/samples/snippets/document_text/doctext.py).
40 | 
41 | To get the full text of a page, there are three approaches possible:
42 | * Access `full_text_annotation.text` from the `response` object. 
43 | * Access `text_annotation.description` from the `response` object. In the limited tests I've done on this, the contents were always the same as `full_text_annotation.text`.
44 | * Traverse the hierarchy of objects returned in the list of `full_text_annotation.pages` objects. These will be blocks, paragraphs, words, and symbols. The symbol objects contain in addition indications of breaks in the text. In my testing, assembling words into lines (using the break indicators) produced the same results as the full text annotation text.
45 | 


--------------------------------------------------------------------------------
/dev/services/google/Package google.cloud.vision.v1p4beta1 - Cloud Vision API.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/dev/services/google/Package google.cloud.vision.v1p4beta1 - Cloud Vision API.pdf


--------------------------------------------------------------------------------
/dev/services/google/README.md:
--------------------------------------------------------------------------------
1 | Information about the Google Cloud Vision API
2 | =============================================
3 | 
4 | This directory contains some saved materials providing documentation about the Google Cloud Vision API.  These are local copies made in case the web pages change in the future.  Unless otherwise noted, the documentation content obtained from Google is licensed under the Creative Commons Attribution 3.0 License.  More information about Google developers' documentation can be found in the developers.google.com [Site Policies](https://developers.google.com/terms/site-policies) document.
5 | 


--------------------------------------------------------------------------------
/dev/services/google/Types for Google Cloud Vision v1 API - google-cloud-vision documentation.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/dev/services/google/Types for Google Cloud Vision v1 API - google-cloud-vision documentation.pdf


--------------------------------------------------------------------------------
/dev/services/google/google cloud vision api feature types 2018-10-25.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/dev/services/google/google cloud vision api feature types 2018-10-25.pdf


--------------------------------------------------------------------------------
/dev/services/google/google.cloud.vision_v1.types.image_annotator - google-cloud-vision documentation.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/dev/services/google/google.cloud.vision_v1.types.image_annotator - google-cloud-vision documentation.pdf


--------------------------------------------------------------------------------
/dev/services/sample-output/README.md:
--------------------------------------------------------------------------------
1 | # Sample output
2 | 
3 | This directory contains output from all four services currently supported, for a sample page from one of the documents in the Caltech Archives' [Donald A. Glaser Digital Collection](http://glaser.library.caltech.edu). The following is the command used on 2021-06-09:
4 | 
5 | ```
6 | handprint -e dev/services/sample-output/dag-304-DAG_1_1_8_0029.jp2
7 | ```
8 | 


--------------------------------------------------------------------------------
/dev/services/sample-output/dag-304-DAG_1_1_8_0029.handprint-all.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/dev/services/sample-output/dag-304-DAG_1_1_8_0029.handprint-all.png


--------------------------------------------------------------------------------
/dev/services/sample-output/dag-304-DAG_1_1_8_0029.handprint-amazon-rekognition.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/dev/services/sample-output/dag-304-DAG_1_1_8_0029.handprint-amazon-rekognition.png


--------------------------------------------------------------------------------
/dev/services/sample-output/dag-304-DAG_1_1_8_0029.handprint-amazon-rekognition.txt:
--------------------------------------------------------------------------------
 1 | is the experimental approach though some theoretical
 2 | issues ave probably still cloudy.
 3 | In biology knowledge is weaker still, but I
 4 | must nevertheless outline my present views.
 5 | my 7 June 1962
 6 | Living things influence
 7 | experience
 8 | of
 9 | Enormous complexity organi is one of the
10 | m main features of that set living things apart from inert ones.
11 | As objects scientific interest one is curious to know how
12 | living things originated, how top developed into their present
13 | forms, and how they work. An attractive working hypothesis
14 | is that they are only complex systems of physical compon
15 | governed by the laws of physics (including chemistry
16 | 


--------------------------------------------------------------------------------
/dev/services/sample-output/dag-304-DAG_1_1_8_0029.handprint-amazon-textract.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/dev/services/sample-output/dag-304-DAG_1_1_8_0029.handprint-amazon-textract.png


--------------------------------------------------------------------------------
/dev/services/sample-output/dag-304-DAG_1_1_8_0029.handprint-amazon-textract.txt:
--------------------------------------------------------------------------------
 1 | is the experimental approach, though some theoretical
 2 | issues ave probably still cloudy.
 3 | In biology my knowledge is weaker still, but I
 4 | must nevertheless outline my present views.
 5 | 7 June 1962 29
 6 | Living things constitute a main influence on human
 7 | experience and represent systems of enormous complexity
 8 | Enormous complexity of organization is one of the
 9 | main features that set living things apart from inert ones.
10 | As objects of scientific interest one is curious to
11 | Know
12 | how
13 | living things originated how dep developed into their present
14 | forms, and how they work. An attractive working hypothesis
15 | is that they are only complex systems of physical
16 | components
17 | governed by the laws of physics (including chemistry ), nothing
18 | more and nothing less. On the molecular level this view has
19 | had great successes recently in molecular genetics So far there
20 | has appeared no obstacle contradicting this hypothesis, It must
21 | be admitted, however, that there are not wany detailed
22 | predictions of physics in biology that could furnish a
23 | sensitive test, some people feel that when such tests
24 | are
25 | made, biology will be found to require some new principle
26 | not already present in physics, though not contradicting
27 | physics. In the meantime one goes ontostady the properties
28 | of living things discovering one marvelous mechanism after
29 | another in the complex mechanism that maintains the
30 | delicately ba lauced conditions for healthy life.
31 | What are the questions that seem the most pivotal to
32 | me?
33 | 1) How did the first living thing appear on earth?
34 | 2) Are there living things elsewhere in the universe?
35 | 3) Is evolution a selection among undirected mutations?
36 | 4) Are favorable mutations propagated only within a species,
37 | or can one species "learn" genetic tricks from another
38 | by analoques of bacterial transformation , transduction, and
39 | conjugation? On even by cannibilization or eating totherspacies?
40 | 5) Are there any cases in which learned or otherwise
41 | a required characteristics become genetically buitt-in?
42 | 6) Do the Known processes account quantitatively for the
43 | rate at which evolution has proceeded?
44 | 


--------------------------------------------------------------------------------
/dev/services/sample-output/dag-304-DAG_1_1_8_0029.handprint-google.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/dev/services/sample-output/dag-304-DAG_1_1_8_0029.handprint-google.png


--------------------------------------------------------------------------------
/dev/services/sample-output/dag-304-DAG_1_1_8_0029.handprint-google.txt:
--------------------------------------------------------------------------------
 1 | issues
 2 | 2?
 3 | more
 4 | are
 5 | is the experimental approach, though some theoretical
 6 | are probably still cloudy.
 7 | In biology my knowledge is weaker still, but I
 8 | must never the less outline my present views.
 9 | 7 June 1962
10 | Living things constitute a main influence
11 | on human
12 | experience and represent systems of enormous complexity
13 | Enormous complexity of organization is one of the
14 | main features that set living things apart from inert ones.
15 | As objects of scientific interest one is curious to know how
16 | living things originated, how dep developed into their present
17 | forms, and how they work. An attractive working hypothesis
18 | is that they only complex systems of physical components
19 | governed by the laws of physics (including chemistry ), nothing
20 | and nothing less. On the molecular level this view ha's
21 | had great successes recently in molecular genetics. So far there
22 | has appeared no obstacle contradicting this hypothesis, It must
23 | be admi Hed, however, that there not
24 | wany
25 | detailed
26 | predictions of physics in biology that could
27 | furnish a
28 | sensitive test; some people feel that when such
29 | tests are
30 | made, biology will be found to require some new principle
31 | not already present in physics, though not contradicting
32 | physics. In the meantime one goes on to study the properties
33 | of living things discovering one marvelous mechanism after
34 | another in the complex mechanism that maintains the
35 | delicately balauced 'conditions for healthy, life,
36 | What are the questions that seem the most pivotal to
37 | me ?
38 | 1. How did the first living thing appear on earth?
39 | z) Are there living things elsewhere in the suiverse ?
40 | 3) Is evolution á selection among on directed mutations?
41 | 4) Are favorable mutations propagated only within a species,
42 | species "learn genetic tricks from another
43 | by analogues of bacterial transformation, transduction, and
44 | conjugation? Or even by cannibilization or eating other species.?
45 | 5) Are there
46 | any
47 | cases in which learned or otherwise
48 | acquired characteristics become genetically built-in?
49 | 6) Do the known
50 | account quan
51 | antitatively for the
52 | rate at which evolution nas proceded?
53 | or
54 | can
55 | cue
56 | processes
57 | 


--------------------------------------------------------------------------------
/dev/services/sample-output/dag-304-DAG_1_1_8_0029.handprint-microsoft.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/dev/services/sample-output/dag-304-DAG_1_1_8_0029.handprint-microsoft.png


--------------------------------------------------------------------------------
/dev/services/sample-output/dag-304-DAG_1_1_8_0029.handprint-microsoft.txt:
--------------------------------------------------------------------------------
 1 | is the experimental approach, though some theoretical
 2 | are probably still cloudy .
 3 | issues
 4 | In biology my knowledge is weaker still, but I
 5 | must nevertheless outline
 6 | my present views .
 7 | 7 June 1962 2P
 8 | Living things constitute a main influence on human
 9 | represent systems of enormous complexity.
10 | expericate and
11 | complexity of organization is one of the
12 | Enormous
13 | features that set living things apart from inent ones.
14 | main
15 | As objects of scientific interest one is curious to know how
16 | living things originated , how dep developed into their present
17 | forms, and how they work. An attractive working hypothesis
18 | is that they are only complex systems of physical components
19 | by the laws of physics ( including chemistry ) , nothing
20 | governed
21 | more and nothing less . On the molecular level this view ha's
22 | had great successes recently in molecular genetics. So far there
23 | has appeared no obstacle contradicting this hypothesis . It must
24 | be admitted , however , that there are not many detailed
25 | predictions of physics in biology that could furnish a
26 | sensitive test ; some people feel that when such tests are
27 | made, biology will be found to require some new principle.
28 | not already present in physics, though not contradicting
29 | physics . In the meantime one goes ontostudy the properties
30 | living things discovering one marvelous mechanism after
31 | of
32 | another in the complex mechanism that maintains the
33 | delicately balanced conditions for healthy life.
34 | the most pivotal to
35 | What are the questions that seem
36 | me ?
37 | 1 ) How did the first living thing appear on earth?
38 | 2 ) Are there living things elsewhere in the universe ?
39 | 3 ) Is evolution a selection among undirected mutations ?
40 | 4) Are favorable mutations propagated only within a species,
41 | or can one species "learn" genetic tricks from another
42 | by analogues of bacterial transformation, transduction, and
43 | conjugation? Or even by cannibilization or eating other species?
44 | 5 ) Are there any cases in which learned or otherwise
45 | acquired characteristics become genetically built-in?
46 | 6 ) Do the known processes account quantitatively for the
47 | rate at which evolution was proceeded ?


--------------------------------------------------------------------------------
/dev/services/sample-output/dag-304-DAG_1_1_8_0029.jp2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/dev/services/sample-output/dag-304-DAG_1_1_8_0029.jp2


--------------------------------------------------------------------------------
/docs/.nojekyll:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/docs/.nojekyll


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
 1 | # =============================================================================
 2 | # @file    Makefile
 3 | # @brief   Makefile for building docs using Sphinx and MyST
 4 | # @created 2021-01-25
 5 | # @license Please see the file named LICENSE in the project directory
 6 | # @website https://github.com/caltechlibrary/handprint
 7 | # =============================================================================
 8 | 
 9 | # Before we go any further, test if certain programs are available.
10 | # The following is based on the approach posted by Jonathan Ben-Avraham to
11 | # Stack Overflow in 2014 at https://stackoverflow.com/a/25668869
12 | 
13 | PROGRAMS_NEEDED = sphinx-build
14 | TEST := $(foreach p,$(PROGRAMS_NEEDED),\
15 | 	  $(if $(shell which $(p)),_,$(error Cannot find program "$(p)")))
16 | 
17 | 
18 | # Gather values that we need ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
19 | 
20 | # You can set the following variables from the command line, and also from
21 | # the environment for the first two.
22 | 
23 | SPHINXOPTS  ?=
24 | SPHINXBUILD ?= sphinx-build
25 | SPHINXAUTO  = sphinx-autobuild
26 | SOURCEDIR   = .
27 | BUILDDIR    = _build
28 | 
29 | # Actions ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
30 | 
31 | # Put it first so that "make" without argument is like "make help".
32 | help:
33 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
34 | 
35 | auto autobuild live livehtml:
36 | 	@$(SPHINXAUTO) "$(SOURCEDIR)" "$(BUILDDIR)"/html $(SPHINXOPTS) $(O)
37 | 
38 | # Catch-all target: route all unknown targets to Sphinx using the new
39 | # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
40 | %: Makefile
41 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
42 | 
43 | 
44 | # Cleanup and miscellaneous directives ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
45 | 
46 | .PHONY: help auto autobuild live livehtml Makefile
47 | 


--------------------------------------------------------------------------------
/docs/README.md:
--------------------------------------------------------------------------------
 1 | # About the Handprint docs
 2 | 
 3 | This page describes how to (re)create the formatted Handprint documentation.
 4 | 
 5 | ## Building the docs locally
 6 | 
 7 | First, install [MyST](https://myst-parser.readthedocs.io/en/latest/index.html) and [Sphinx](https://www.sphinx-doc.org):
 8 | 
 9 | ```sh
10 | python3 -m pip install "myst-parser[linkify]"
11 | python3 -m pip install sphinx-material
12 | python3 -m pip install sphinx-autobuild
13 | ```
14 | 
15 | After that, rebuilding the docs is simply a matter of running `make html` in the current directory:
16 | 
17 | ```sh
18 | make html
19 | ```
20 | 
21 | The output will be put in [`_build/html`](_build/html).  Instead of running `make` manually, you can also get auto-rebuilds and a live preview using `sphinx-autobuild` by running the following command in the current directory (preferably in a new terminal window, because it will generate continuous output):
22 | 
23 | ```sh
24 | make auto
25 | ```
26 | 
27 | 
28 | ## Writing documentation
29 | 
30 | This documentation is written in [MyST-flavored Markdown](https://myst-parser.readthedocs.io/en/latest/) and uses the [Napoleon](https://www.sphinx-doc.org/en/master/usage/extensions/napoleon.html) extension to Sphinx.  What this means is that the documentation is written in Markdown instead of reStructuredText, with essentially all the features of Sphinx and reStructuredText having MyST equivalents and some additional features beyond _that_ &ndash; things like [pandoc](https://pandoc.org)-style footnotes, LaTeX math, and more.
31 | 


--------------------------------------------------------------------------------
/docs/_static/css/custom.css:
--------------------------------------------------------------------------------
  1 | @import url('https://fonts.googleapis.com/css2?family=Atkinson+Hyperlegible:ital,wght@0,400;0,700;1,400;1,700&display=swap');
  2 | 
  3 | body, input {
  4 |     color: rgba(0,0,0,.87);
  5 |     -webkit-font-feature-settings: "kern","liga";
  6 |     font-feature-settings: "kern","liga";
  7 |     font-family: "Atkinson Hyperlegible", "Roboto", "Helvetica Neue", Helvetica, Arial, sans-serif;
  8 | }
  9 | 
 10 | .md-typeset {
 11 |   line-height: 1.5;
 12 |   font-size: 0.9rem;
 13 | }
 14 | 
 15 | .md-typeset p, .md-typeset ol {
 16 |     margin-top: 0.75em;
 17 |     margin-bottom: 0.75em;
 18 | }
 19 | 
 20 | .md-typeset ol li,.md-typeset ul li{
 21 |     margin-bottom: .35em;
 22 | }
 23 | 
 24 | .md-typeset ol li ol,.md-typeset ol li ul,.md-typeset ul li ol,.md-typeset ul li ul{
 25 |     margin:.35em 0 .35em .625em;
 26 | }
 27 | 
 28 | .md-typeset ol li blockquote,.md-typeset ol li p,.md-typeset ul li blockquote,.md-typeset ul li p{
 29 |     margin: .35em 0 !important;
 30 | }
 31 | 
 32 | .md-sidebar--secondary {
 33 |   display: none !important;
 34 | }
 35 | 
 36 | .md-content {
 37 |   margin-right: 0;
 38 | }
 39 | 
 40 | .md-header-nav__source {
 41 |   width: 8rem !important;
 42 | }
 43 | 
 44 | .md-typeset h1{
 45 |     margin: 0 0 1.5rem;
 46 | }
 47 | 
 48 | .md-typeset h2 {
 49 |     margin: 1.5rem 0 .8rem;
 50 | }
 51 | 
 52 | .md-typeset h3{
 53 |     margin: 1.35rem 0 .8rem;
 54 | }
 55 | 
 56 | .md-typeset table:not([class]) {
 57 |   font-size: 0.8rem;
 58 | }
 59 | 
 60 | .md-typeset__scrollwrap {
 61 |   text-align: center;
 62 | }
 63 | 
 64 | nav.md-tabs {
 65 |   display: none;
 66 | }
 67 | 
 68 | .md-nav__list ul {
 69 |   margin-left: 10px !important;
 70 | }
 71 | 
 72 | figure {
 73 |   margin-top: 1.5em;
 74 |   text-align: center;
 75 | }
 76 | 
 77 | figure img {
 78 |   max-width: 75% !important;
 79 | }
 80 | 
 81 | figure figcaption {
 82 |   font-style: italic;
 83 | }
 84 | 
 85 | dt {
 86 |   font-weight: bold;
 87 | }
 88 | 
 89 | .button {
 90 |   -moz-border-radius: 2px;
 91 |   -webkit-border-radius: 2px;
 92 |   border-radius: 2px;
 93 |   display: inline-block;
 94 |   font-size: 90%;
 95 |   line-height: 1.2;
 96 |   margin: 0 .1em;
 97 |   padding: .15em .3em;
 98 | }
 99 | 
100 | .color-info {
101 |   background-color: #20a1b6;
102 |   color: #fff;
103 | }
104 | 
105 | .color-danger {
106 |   background-color: #db3a4a;
107 |   color: #fff;
108 | }
109 | 
110 | .color-primary {
111 |   background-color: #357bf6;
112 |   color: #fff;
113 | }
114 | 
115 | .color-secondary {
116 |   background-color: #6c6c6c;
117 |   color: #fff;
118 | }
119 | 
120 | .color-not-available {
121 |   background-color: #a6a6a6;
122 |   color: #fff;
123 | }
124 | 
125 | .color-outline-info {
126 |   border: 1px solid #20a1b6;
127 |   border-radius: 3px;
128 |   color: #20a1b6;
129 | }
130 | 
131 | .color-outline-danger {
132 |   border: 1px solid #db3a4a;
133 |   border-radius: 3px;
134 |   color: #db3a4a;
135 | }
136 | 
137 | .color-outline-primary {
138 |   border: 1px solid #357bf6;
139 |   border-radius: 3px;
140 |   color: #357bf6;
141 | }
142 | 
143 | .color-outline-secondary {
144 |   border: 1px solid #6c6c6c;
145 |   border-radius: 3px;
146 |   color: #6c6c6c;
147 | }
148 | 
149 | .color-outline-not-available {
150 |   border: 1px solid #a6a6a6;
151 |   border-radius: 3px;
152 |   color: #a6a6a6;
153 | }
154 | 
155 | h1 {
156 |   font-weight: 500 !important;
157 |   color: #000 !important;
158 | }
159 | 
160 | h2 {
161 |   font-weight: 500 !important;
162 |   color: #000;
163 | }
164 | 
165 | h3 {
166 |   color: #000;
167 | }
168 | 
169 | h4 {
170 |   font-style: italic;
171 | }
172 | 
173 | img.shadowed {
174 |   box-shadow: inset 0 1px 0 rgba(255, 255, 255, .4), 0 22px 50px 4px rgba(0, 0, 0, 0.4), 0 0 0 1px rgba(0, 0, 0, 0.0);
175 | }
176 | 


--------------------------------------------------------------------------------
/docs/_static/media/all-results-example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/docs/_static/media/all-results-example.png


--------------------------------------------------------------------------------
/docs/_static/media/clara-barton-page.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/docs/_static/media/clara-barton-page.jpg


--------------------------------------------------------------------------------
/docs/_static/media/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/docs/_static/media/favicon.ico


--------------------------------------------------------------------------------
/docs/_static/media/glaser-example-google.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/docs/_static/media/glaser-example-google.jpg


--------------------------------------------------------------------------------
/docs/_static/media/handprint-icon-white.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/docs/_static/media/handprint-icon-white.png


--------------------------------------------------------------------------------
/docs/_static/media/handprint-icon-white.svg:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="utf-8"?>
 2 | <svg viewBox="0 0 85 113.75" version="1.1" x="0px" y="0px" xmlns="http://www.w3.org/2000/svg">
 3 |   <title>Shape</title>
 4 |   <desc>Created with Sketch.</desc>
 5 |   <g stroke="none" stroke-width="1" fill="none" fill-rule="evenodd">
 6 |     <g transform="translate(-926.000000, -136.000000)" fill="#000000">
 7 |       <path d="M999.441632,151.463774 C1000.84163,152.833774 1000.77163,155.243774 1000.44163,157.273774 C999.111632,164.993774 995.441632,171.963774 991.921632,178.763774 L996.071632,174.313774 C999.161632,170.993774 1002.51163,167.513774 1006.75163,166.623774 C1008.87163,166.173774 1010.75163,168.623774 1010.84163,171.023774 C1010.93163,173.423774 1009.76163,175.613774 1008.62163,177.643774 C1001.25442,190.784992 992.939051,203.371679 983.741632,215.303774 C981.953714,217.80073 979.872115,220.073702 977.541632,222.073774 C974.683751,224.377881 971.200171,225.771314 967.541632,226.073774 C966.321632,226.163774 966.021632,223.953774 966.941632,223.073774 C967.861632,222.193774 969.211632,222.173774 970.421632,221.943774 C974.361632,221.193774 977.351632,217.723774 979.971632,214.393774 C989.874527,202.090504 998.549228,188.84735 1005.87163,174.853774 C1006.44239,173.960683 1006.72191,172.912473 1006.67163,171.853774 C1006.50163,170.793774 1005.30163,169.913774 1004.52163,170.563774 C998.31578,175.680276 992.802745,181.583257 988.121632,188.123774 C987.535005,189.108676 986.731247,189.946637 985.771632,190.573774 C984.776678,191.165575 983.49299,190.880797 982.841632,189.923774 C982.161632,188.733774 982.841632,187.193774 983.541632,186.003774 C987.034965,179.903774 990.234965,173.613774 993.141632,167.133774 C994.611632,163.853774 996.031632,160.423774 996.141632,156.773774 C996.225494,156.224016 996.088928,155.663378 995.761632,155.213774 C995.091632,154.533774 994.031632,155.213774 993.361632,155.873774 C986.42095,162.866251 980.76772,171.027891 976.661632,179.983774 C975.931632,181.573774 975.151632,183.293774 973.661632,184.093774 C972.171632,184.893774 969.991632,184.043774 969.971632,182.253774 C970.024104,181.792522 970.152503,181.343124 970.351632,180.923774 C974.071632,169.663774 979.471632,158.773774 980.231632,146.823774 C980.351632,145.003774 979.631632,142.453774 978.011632,142.823774 C977.191632,143.013774 976.711632,143.943774 
976.351632,144.823774 C971.778115,155.328649 967.808317,166.086167 964.461632,177.043774 C964.162203,178.335071 963.611759,179.554881 962.841632,180.633774 C962.11317,181.700993 960.739961,182.117117 959.541632,181.633774 C957.851632,180.703774 958.021632,178.013774 958.421632,175.973774 C960.402168,165.886207 961.827645,155.697556 962.691632,145.453774 C962.840929,144.530758 962.772441,143.585623 962.491632,142.693774 C962.141632,141.843774 961.231632,141.193774 960.491632,141.533774 C959.863602,141.950285 959.430772,142.603137 959.291632,143.343774 C955.466995,154.899813 952.610475,166.754036 950.751632,178.783774 C950.321632,181.293774 949.971632,183.923774 948.581632,185.923774 C947.191632,187.923774 944.581632,189.243774 942.581632,187.993774 C940.581632,186.743774 940.151632,183.733774 939.731632,181.183774 C938.860936,175.998674 936.852868,171.070712 933.851632,166.753774 C933.332198,165.800688 932.465493,165.084403 931.431632,164.753774 C930.893231,164.680201 930.352699,164.873186 929.982634,165.271106 C929.612568,165.669025 929.459249,166.222118 929.571632,166.753774 C931.709951,173.332901 932.806525,180.205892 932.821632,187.123774 C932.663403,190.836357 932.69678,194.55463 932.921632,198.263774 C933.311632,201.953774 934.491632,205.703774 936.921632,208.263774 C937.388451,208.621152 937.708535,209.136844 937.821632,209.713774 C937.891632,211.103774 935.901632,211.173774 934.821632,210.413774 C932.321632,208.613774 930.901632,205.413774 930.321632,202.173774 C929.893155,198.905726 929.739283,195.60752 929.861632,192.313774 C929.804684,183.893523 928.555617,175.523763 926.151632,167.453774 C925.489078,164.625744 927.20845,161.786781 930.021632,161.063774 C932.601632,160.523774 935.251632,162.003774 937.021632,164.153774 C938.69237,166.452476 939.982201,169.005126 940.841632,171.713774 L945.501632,184.013774 L949.701632,163.463774 C951.081632,156.713774 952.471632,149.923774 955.121632,143.613774 C955.821778,141.669762 956.937742,139.901966 
958.391632,138.433774 C959.800664,137.020069 961.915501,136.579805 963.771632,137.313774 C966.971632,138.923774 966.971632,143.853774 966.451632,147.683774 L963.971632,166.333774 L970.971632,147.013774 C971.661725,144.729453 972.722526,142.574067 974.111632,140.633774 C975.581632,138.823774 977.821632,137.573774 979.971632,138.073774 C982.541632,138.663774 984.241632,141.573774 984.551632,144.453774 C984.861632,147.333774 984.091632,150.203774 983.321632,152.973774 L979.391632,167.173774 C983.391632,161.003774 987.611632,154.573774 993.841632,151.433774 C995.651632,150.533774 997.971632,150.013774 999.441632,151.463774 L999.441632,151.463774 Z" style="fill: rgb(255, 255, 255);"/>
 8 |     </g>
 9 |   </g>
10 | </svg>


--------------------------------------------------------------------------------
/docs/_static/media/handprint-icon.svg:
--------------------------------------------------------------------------------
1 | <svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" viewBox="0 0 85 113.75" version="1.1" x="0px" y="0px"><title>Shape</title><desc>Created with Sketch.</desc><g stroke="none" stroke-width="1" fill="none" fill-rule="evenodd"><g transform="translate(-926.000000, -136.000000)" fill="#000000"><path d="M999.441632,151.463774 C1000.84163,152.833774 1000.77163,155.243774 1000.44163,157.273774 C999.111632,164.993774 995.441632,171.963774 991.921632,178.763774 L996.071632,174.313774 C999.161632,170.993774 1002.51163,167.513774 1006.75163,166.623774 C1008.87163,166.173774 1010.75163,168.623774 1010.84163,171.023774 C1010.93163,173.423774 1009.76163,175.613774 1008.62163,177.643774 C1001.25442,190.784992 992.939051,203.371679 983.741632,215.303774 C981.953714,217.80073 979.872115,220.073702 977.541632,222.073774 C974.683751,224.377881 971.200171,225.771314 967.541632,226.073774 C966.321632,226.163774 966.021632,223.953774 966.941632,223.073774 C967.861632,222.193774 969.211632,222.173774 970.421632,221.943774 C974.361632,221.193774 977.351632,217.723774 979.971632,214.393774 C989.874527,202.090504 998.549228,188.84735 1005.87163,174.853774 C1006.44239,173.960683 1006.72191,172.912473 1006.67163,171.853774 C1006.50163,170.793774 1005.30163,169.913774 1004.52163,170.563774 C998.31578,175.680276 992.802745,181.583257 988.121632,188.123774 C987.535005,189.108676 986.731247,189.946637 985.771632,190.573774 C984.776678,191.165575 983.49299,190.880797 982.841632,189.923774 C982.161632,188.733774 982.841632,187.193774 983.541632,186.003774 C987.034965,179.903774 990.234965,173.613774 993.141632,167.133774 C994.611632,163.853774 996.031632,160.423774 996.141632,156.773774 C996.225494,156.224016 996.088928,155.663378 995.761632,155.213774 C995.091632,154.533774 994.031632,155.213774 993.361632,155.873774 C986.42095,162.866251 980.76772,171.027891 976.661632,179.983774 C975.931632,181.573774 975.151632,183.293774 973.661632,184.093774 
C972.171632,184.893774 969.991632,184.043774 969.971632,182.253774 C970.024104,181.792522 970.152503,181.343124 970.351632,180.923774 C974.071632,169.663774 979.471632,158.773774 980.231632,146.823774 C980.351632,145.003774 979.631632,142.453774 978.011632,142.823774 C977.191632,143.013774 976.711632,143.943774 976.351632,144.823774 C971.778115,155.328649 967.808317,166.086167 964.461632,177.043774 C964.162203,178.335071 963.611759,179.554881 962.841632,180.633774 C962.11317,181.700993 960.739961,182.117117 959.541632,181.633774 C957.851632,180.703774 958.021632,178.013774 958.421632,175.973774 C960.402168,165.886207 961.827645,155.697556 962.691632,145.453774 C962.840929,144.530758 962.772441,143.585623 962.491632,142.693774 C962.141632,141.843774 961.231632,141.193774 960.491632,141.533774 C959.863602,141.950285 959.430772,142.603137 959.291632,143.343774 C955.466995,154.899813 952.610475,166.754036 950.751632,178.783774 C950.321632,181.293774 949.971632,183.923774 948.581632,185.923774 C947.191632,187.923774 944.581632,189.243774 942.581632,187.993774 C940.581632,186.743774 940.151632,183.733774 939.731632,181.183774 C938.860936,175.998674 936.852868,171.070712 933.851632,166.753774 C933.332198,165.800688 932.465493,165.084403 931.431632,164.753774 C930.893231,164.680201 930.352699,164.873186 929.982634,165.271106 C929.612568,165.669025 929.459249,166.222118 929.571632,166.753774 C931.709951,173.332901 932.806525,180.205892 932.821632,187.123774 C932.663403,190.836357 932.69678,194.55463 932.921632,198.263774 C933.311632,201.953774 934.491632,205.703774 936.921632,208.263774 C937.388451,208.621152 937.708535,209.136844 937.821632,209.713774 C937.891632,211.103774 935.901632,211.173774 934.821632,210.413774 C932.321632,208.613774 930.901632,205.413774 930.321632,202.173774 C929.893155,198.905726 929.739283,195.60752 929.861632,192.313774 C929.804684,183.893523 928.555617,175.523763 926.151632,167.453774 C925.489078,164.625744 927.20845,161.786781 
930.021632,161.063774 C932.601632,160.523774 935.251632,162.003774 937.021632,164.153774 C938.69237,166.452476 939.982201,169.005126 940.841632,171.713774 L945.501632,184.013774 L949.701632,163.463774 C951.081632,156.713774 952.471632,149.923774 955.121632,143.613774 C955.821778,141.669762 956.937742,139.901966 958.391632,138.433774 C959.800664,137.020069 961.915501,136.579805 963.771632,137.313774 C966.971632,138.923774 966.971632,143.853774 966.451632,147.683774 L963.971632,166.333774 L970.971632,147.013774 C971.661725,144.729453 972.722526,142.574067 974.111632,140.633774 C975.581632,138.823774 977.821632,137.573774 979.971632,138.073774 C982.541632,138.663774 984.241632,141.573774 984.551632,144.453774 C984.861632,147.333774 984.091632,150.203774 983.321632,152.973774 L979.391632,167.173774 C983.391632,161.003774 987.611632,154.573774 993.841632,151.433774 C995.651632,150.533774 997.971632,150.013774 999.441632,151.463774 L999.441632,151.463774 Z" style="fill: rgb(255, 110, 30);"/></g></g></svg>


--------------------------------------------------------------------------------
/docs/_static/versions.json:
--------------------------------------------------------------------------------
1 | {"release": "", "development": "devel"}
2 | 


--------------------------------------------------------------------------------
/docs/basic-usage.md:
--------------------------------------------------------------------------------
 1 | # Basic Handprint usage
 2 | 
 3 | If the installation steps described in the previous section proceed successfully, a command-line program named `handprint` should end up installed in a location where software is normally installed on your computer.  Running `handprint` from a terminal shell then should be as simple as running any other shell command on your system:
 4 | 
 5 | ```bash
 6 | handprint -h
 7 | ```
 8 | 
 9 | If that fails for some reason, you should be able to run `handprint` from anywhere using the normal approach for running Python modules:
10 | 
11 | ```bash
12 | python3 -m handprint -h
13 | ```
14 | 
15 | The `-h` option (`/h` on Windows) will make `handprint` display some help information and exit immediately.  To make Handprint do more, you can supply other arguments that instruct `handprint` to process image files (or alternatively, URLs pointing to image files at a network location) and run text recognition algorithms on them, as explained below.
16 | 
17 | 
18 | ## Installing credentials
19 | 
20 | Handprint includes several adapters for working with cloud-based HTR services from Amazon, Google, and Microsoft, but _does not include credentials for using the services_.  Before you can use Handprint, you must supply credentials for accessing the cloud services you want to use. The process for doing this involves using the `-a` option (`/a` on Windows) and is described in detail in the section [Configuration for cloud services](configuration.md).
21 | 
22 | 
23 | ## Input files and URLs
24 | 
25 | After credentials are installed, running Handprint _without_ the `-a` option will invoke one or more services on files, directories of files, or URLs pointing to files. Here is an example of running Handprint on a directory containing images:
26 | ```sh
27 | handprint tests/data/caltech-archives/glaser/
28 | ```
29 | 
30 | Image paths or URLs can be supplied to Handprint in any of the following ways:
31 | * One or more directory paths or one or more image file paths on the local disk, which will be interpreted as images (either individually or in directories) to be processed
32 | * One or more URLs, which will be interpreted as network locations of image files to be processed
33 | * If given the `-f` option (`/f` on Windows), a file containing either image paths or image URLs to be processed
34 | 
35 | For every input given as a URL, Handprint will first download the image found at the URL to a directory indicated by the option `-o` (`/o` on Windows), or the current directory if option `-o` is not used.
36 | 
37 | No matter whether files or URLs, each input item should be a single image of a document page in which text should be recognized.  Handprint reads a number of common formats: JP2, JPEG, PDF, PNG, GIF, BMP, and TIFF.  However, for simplicity and maximum compatibility with all cloud services, Handprint always **converts all input files to PNG** if they are not already in that format, no matter if a given service can accept other formats.  Handprint also **reduces the size** of input images to the smallest size accepted by any of the services invoked if an image exceeds that size.  (For example, when sending a file to services A and B at the same time, if service A accepts files up to 10 MB in size and service B accepts files up to 4 MB, Handprint will resize the file to 4 MB before sending it to _both_ A and B, even if A could accept a higher-resolution image.)  Finally, if the input contains more than one page (e.g., in a PDF file), Handprint will **only use the first page of the input** and ignore the remaining pages.
38 | 
39 | Be aware that **downsizing images can change the text recognition results returned by some services** compared to the results obtained using the original full-size input image.  If your images are larger when converted to PNG than the smallest size accepted by one of the destination services (currently 4 MB, for Microsoft), then you may wish to compare the results of using multiple services at once versus one at a time (i.e., one destination at a time in separate invocations of Handprint).
40 | 
41 | Finally, note that providing URLs on the command line can be problematic due to how terminal shells interpret certain characters, and so when supplying URLs, it's usually better to store the URLs in a file in combination with the `-f` option (`/f` on Windows).
42 | 
43 | 
44 | ## Selecting destination services
45 | 
46 | You can use the `-l` option (`/l` on Windows) to make Handprint display a list of the services it currently supports:
47 | 
48 | ```sh
49 | # handprint -l
50 | Known services: amazon-rekognition, amazon-textract, google, microsoft
51 | ```
52 | 
53 | By default, Handprint will send images to all of the known services, creating annotated images to represent the results of each individual service.  To invoke only specific services, use the `-s` option (`/s` on Windows) followed by a service name or a list of names separated by commas (e.g., `google,microsoft`).  For example, the following command will invoke Microsoft's text recognition service on a page from [Clara Barton's unpublished draft book "The Life of My Childhood"](https://picryl.com/media/clara-barton-papers-speeches-and-writings-file-1849-1947-books-the-life-of-71), available in Handprint's source directory:
54 | ```sh
55 | handprint -s microsoft tests/data/public-domain/images/clara-barton-life-of-my-childhood-p90.jpg
56 | ```
57 | 
58 | Here is the result of that command:
59 | <p align="center">
60 | <img width="90%" src="_static/media/clara-barton-page.jpg" alt="Example of running Microsoft's service on a page from Clara Barton's unpublished draft book, The Life of My Childhood.">
61 | </p>
62 | 
63 | 
64 | ## Visual display of results
65 | 
66 | After gathering the results of each service for a given input, Handprint will create a single compound image consisting of the results for each service arranged in a _N_&times;_N_ grid.  This overview image is intended to make it easier to compare the results of multiple services against each other.  The grid image will have the suffix `.handprint-all.png`.  Here is a sample output image to illustrate:
67 | 
68 | <p align="center">
69 | <img width="90%" src="_static/media/all-results-example.png" alt="Example annotated results output image">
70 | </p>
71 | 
72 | The 2&times;2 image above was produced by running the following command from the Handprint `tests/data/caltech-archives/glaser` directory:
73 | ```csh
74 | handprint --text-size 20  "DAG_5_1_6 1952-1957 Notebook VI p2.jpg"
75 | ```
76 | 
77 | To move the position of the text annotations overlaid over the input image, you can use the option `-m` (or `/m` on Windows).  This takes two numbers separated by a comma in the form `x,y`.  Positive numbers move the text rightward and upward, respectively, relative to the default position.  The default position of each text annotation in the annotated output is such that the _left edge of the word_ starts at the location of the _upper left corner of the bounding box_ returned by the service; this has the effect of putting the annotation near, but above, the location of the (actual) word in the input image by default.  For example, if the word in the image is _strawberry_, the bounding box returned by the service will enclose _strawberry_, and the upper left corner of that bounding box will be somewhere above the letter _s_. Then, the default position of the text annotation will put the left edge of the word "strawberry" at that point above the letter _s_. Using the move-text option allows you to move the annotation if desired. A value such as `0,-5` will move it downward five pixels.
78 | 
79 | To change the color of the text annotations overlaid over the input image, you can use the option `-x` (or `/x` on Windows).  You can use hex color codes such as `"#ff0000"` (make sure to enclose the value with quotes, or the shell will interpret the pound sign as a comment character), or X11/CSS4 color names with no spaces such as `purple` or `darkgreen`.
80 | 
81 | To change the size of the text annotations overlaid over the input image, you can use the option `-z` (or `/z` on Windows).  The value is in units of points.  The default size is 12.
82 | 
83 | Finally, the individual results, as well as individual annotated images corresponding to the results from each service, will not be retained unless the `-e` extended results option (`/e` on Windows) is invoked (described in more detail below).  The production of the overview grid image can be skipped by using the `-G` option (`/G` on Windows).
84 | 


--------------------------------------------------------------------------------
/docs/colophon.md:
--------------------------------------------------------------------------------
1 | # Colophon
2 | 
3 | This documentation was written by [Michael Hucka](https://www.cds.caltech.edu/~mhucka/) using the [Sphinx](https://www.sphinx-doc.org) document generator together with [MyST-flavored Markdown](https://myst-parser.readthedocs.io/en/latest/). The theme is the [Material theme for Sphinx](https://bashtage.github.io/sphinx-material/), with light customizations such as the use of Google's [Atkinson Hyperlegible](https://fonts.google.com/specimen/Atkinson+Hyperlegible) font. A [GitHub Action](https://github.com/caltechlibrary/handprint/blob/main/.github/workflows/build-sphinx.yml) takes care of creating the formatted version of the documentation and hosting it on GitHub.io at https://caltechlibrary.github.io/handprint. The formatted output can also be produced manually using commands implemented in the `Makefile` located in the [`docs/`](https://github.com/caltechlibrary/handprint/tree/main/docs) subdirectory of the Handprint source code repository.
4 | 
5 | The [vector artwork](https://thenounproject.com/search/?q=hand&i=733265) of a hand used as a logo for Handprint was created by [Kevin](https://thenounproject.com/kevn/) for the [Noun Project](https://thenounproject.com).  It is licensed under the Creative Commons [CC-BY 3.0](https://creativecommons.org/licenses/by/3.0/) license.
6 | 
7 | Unless indicated otherwise, all other artwork in this documentation was created using [OmniGraffle Pro](https://www.omnigroup.com/omnigraffle) on a macOS computer. SVG versions of the diagrams were produced with the help of [svg-buddy](https://github.com/phauer/svg-buddy) to embed fonts into the SVG files and overcome a limitation of OmniGraffle's SVG output.
8 | 


--------------------------------------------------------------------------------
/docs/command-summary.md:
--------------------------------------------------------------------------------
 1 | # Handprint command summary
 2 | 
 3 | ## Command-line options
 4 | 
 5 | The following table summarizes all the command line options available. (Note: on Windows computers, `/` must be used as the prefix character instead of the `-` dash character):
 6 | 
 7 | | Short&nbsp;&nbsp;&nbsp;&nbsp;    | Long&nbsp;form | Meaning | Default |  |
 8 | |----------------------------------|----------------|---------|---------|--|
 9 | | `-a` _A_   | `--add-creds` _A_   | Add credentials for service _A_ and exit | | |
10 | | `-b` _B_   | `--base-name` _B_   | Write outputs to files named _B_-n | Use base names of image files | ⚑ |
11 | | `-C`       | `--no-color`        | Don't color-code info messages | Color-code terminal output |
12 | | `-c`       | `--compare`         | Compare to ground truth; see `-r` too | |
13 | | `-d` _D_   | `--display` _D_     | Display annotation types _D_ | Display text annotations | ★ |
14 | | `-e`       | `--extended`        | Produce extended results | Produce only summary image | |
15 | | `-f` _F_   | `--from-file` _F_   | Read file names or URLs from file _F_ | Use args on the command line |
16 | | `-G`       | `--no-grid`         | Don't create summary image | Create an _N_&times;_N_ grid image| |
17 | | `-h`       | `--help`            | Display help, then exit | | |
18 | | `-j`       | `--reuse-json`      | Reuse prior JSON results if found | Ignore any existing results | | 
19 | | `-l`       | `--list`            | Display known services and exit | | | 
20 | | `-m` _x,y_ | `--text-move` _x,y_ | Move each text annotation by x,y | `0,0` | |
21 | | `-n` _N_   | `--confidence` _N_  | Use confidence score threshold _N_ | `0` | |
22 | | `-o` _O_   | `--output` _O_      | Write all outputs to directory _O_ | Write to images' directories | |
23 | | `-q`       | `--quiet`           | Don't write messages while working | Be chatty while working |
24 | | `-r`       | `--relaxed`         | Use looser criteria for `--compare` | |
25 | | `-s` _S_   | `--service` _S_     | Use recognition service _S_; see `-l` | Use all services | |
26 | | `-t` _T_   | `--threads` _T_     | Use _T_ number of threads | Use (#cores)/2 threads | |
27 | | `-V`       | `--version`         | Write program version info and exit | | |
28 | | `-x` _X_   | `--text-color` _X_  | Use color _X_ for text annotations | Red | |
29 | | `-z` _Z_   | `--text-size` _Z_   | Use font size _Z_ for text annotations | Use font size 12 | |
30 | | `-@` _OUT_ | `--debug` _OUT_     | Write detailed execution info to _OUT_ | Normal mode | ⬥ |
31 | 
32 | ⚑ &nbsp; If URLs are given, then the outputs will be written by default to names of the form `document-n`, where n is an integer.  Examples: `document-1.jpg`, `document-1.handprint-google.txt`, etc.  This is because images located in network content management systems may not have any clear names in their URLs.<br>
33 | ★ &nbsp; The possible values of _D_ are: `text`, `bb`, `bb-word`, `bb-line`, `bb-para`. Multiple values must be separated with commas. The value `bb` is a shorthand for `bb-word,bb-line,bb-para`. The default is `text`.<br>
34 | ⬥ &nbsp; To write to the console, use the character `-` as the value of _OUT_; otherwise, _OUT_ must be the name of a file where the output should be written.
35 | 
36 | 
37 | ## Return values
38 | 
39 | This program exits with a return code of 0 if no problems are encountered.  It returns a nonzero value otherwise. The following table lists the possible return values:
40 | 
41 | | Code | Meaning                                                  |
42 | |:----:|----------------------------------------------------------|
43 | | 0    | success &ndash; program completed normally               |
44 | | 1    | the user interrupted the program's execution             |
45 | | 2    | encountered a bad or missing value for an option         |
46 | | 3    | no network detected &ndash; cannot proceed               |
47 | | 4    | file error &ndash; encountered a problem with a file     |
48 | | 5    | server error &ndash; encountered a problem with a server |
49 | | 6    | an exception or fatal error occurred                     |
50 | 
51 | 
52 | ## Additional notes
53 | 
54 | The debug logging functionality is implemented using [Sidetrack](https://github.com/caltechlibrary/sidetrack) and all calls to the debug code are conditionalized on the Python symbol `__debug__`.  It is carefully written so that you can cause the calls to be _optimized out completely_ if you run Python with [optimization turned on](https://docs.python.org/3/using/cmdline.html#cmdoption-o) (e.g., using the `-O` command-line option).
55 | 


--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
  1 | # =============================================================================
  2 | # @file    conf.py
  3 | # @brief   Configuration file for Sphinx + MyST based documentation
  4 | # @created 2021-01-25
  5 | # @license Please see the file named LICENSE in the project directory
  6 | # @website https://github.com/caltechlibrary/handprint
  7 | #
  8 | # This file only contains a selection of the most common options. For a full
  9 | # list, refer to https://www.sphinx-doc.org/en/master/usage/configuration.html
 10 | # =============================================================================
 11 | 
 12 | project = 'Handprint'
 13 | copyright = '2022, Caltech Library'
 14 | author = 'Michael Hucka @ Caltech Library'
 15 | 
 16 | 
 17 | # -- General configuration ----------------------------------------------------
 18 | 
 19 | extensions = [
 20 |     'myst_parser',
 21 |     'sphinx.ext.autodoc',
 22 |     'sphinx.ext.autosectionlabel',
 23 |     'sphinx.ext.napoleon',
 24 |     'sphinxcontrib.mermaid'
 25 | ]
 26 | 
 27 | # Add any paths that contain templates here, relative to this directory.
 28 | templates_path = ['_templates']
 29 | 
 30 | # List of patterns, relative to source directory, that match files and
 31 | # directories to ignore when looking for source files.
 32 | # This pattern also affects html_static_path and html_extra_path.
 33 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store', 'README.md',
 34 |                     '**/README.md']
 35 | 
 36 | 
 37 | # -- Sphinx options for HTML output -------------------------------------------
 38 | 
 39 | html_title = 'Handprint'
 40 | html_short_title = "Home"
 41 | 
 42 | html_logo = "_static/media/handprint-icon-white.png"
 43 | html_favicon = "_static/media/favicon.ico"
 44 | 
 45 | # The theme to use for HTML and HTML Help pages.
 46 | html_theme = 'sphinx_material'
 47 | 
 48 | # Add any paths that contain custom static files (such as style sheets) here,
 49 | # relative to this directory. They are copied after the builtin static files,
 50 | # so a file named "default.css" will overwrite the builtin "default.css".
 51 | html_static_path = ['_static']
 52 | 
 53 | # Our additional CSS.
 54 | html_css_files   = ['css/custom.css']
 55 | 
 56 | html_show_sourcelink = False
 57 | html_sidebars = {
 58 |     "**": ["globaltoc.html", "searchbox.html"]
 59 | }
 60 | 
 61 | # Show the last updated date in the footer using the default format.
 62 | html_last_updated_fmt = ""
 63 | 
 64 | 
 65 | # -- Options for the Material theme -------------------------------------------
 66 | # C.f. https://github.com/bashtage/sphinx-material/blob/master/docs/conf.py
 67 | 
 68 | # Material theme options (see theme.conf for more information)
 69 | html_theme_options = {
 70 | 
 71 |     # Set the name of the project to appear in the navigation.
 72 |     'nav_title': 'Handprint',
 73 | 
 74 |     # Set your GA account ID to enable tracking
 75 |     'google_analytics_account': '',
 76 | 
 77 |     # Specify a base_url used to generate sitemap.xml. If not
 78 |     # specified, then no sitemap will be built.
 79 |     'base_url': 'https://caltechlibrary.github.io/handprint',
 80 | 
 81 |     # Set the colors. I found a list here:
 82 |     # https://squidfunk.github.io/mkdocs-material/setup/changing-the-colors/
 83 |     "theme_color": 'blue-grey',
 84 |     'color_primary': 'deep-orange',
 85 |     'color_accent': 'teal',
 86 | 
 87 |     # Set the repo location to get a badge with stats
 88 |     'repo_url': 'https://github.com/caltechlibrary/handprint/',
 89 |     'repo_name': 'Handprint',
 90 | 
 91 |     # Visible levels of the global TOC; -1 means unlimited
 92 |     'globaltoc_depth': 2,
 93 |     # If False, expand all TOC entries
 94 |     'globaltoc_collapse': False,
 95 |     # If True, show hidden TOC entries
 96 |     'globaltoc_includehidden': False,
 97 | 
 98 |     "html_minify": False,
 99 |     "html_prettify": False,
100 | 
101 |     "version_dropdown": False,
102 |     "version_json": "_static/versions.json",
103 |     # "version_info": {
104 |     #     "Release": "https://bashtage.github.io/sphinx-material/",
105 |     #     "Development": "https://bashtage.github.io/sphinx-material/devel/",
106 |     #     "Release (rel)": "/sphinx-material/",
107 |     #     "Development (rel)": "/sphinx-material/devel/",
108 |     # },
109 | }
110 | 
111 | 
112 | # -- Options for the MyST parser ----------------------------------------------
113 | 
114 | myst_enable_extensions = [
115 |     "colon_fence",
116 |     "html_image",
117 |     "linkify",
118 |     "smartquotes",
119 |     "substitution"
120 | ]
121 | 


--------------------------------------------------------------------------------
/docs/configuration.md:
--------------------------------------------------------------------------------
 1 | # Configuration for cloud services
 2 | 
 3 | Handprint includes several adapters for working with cloud-based HTR services from Amazon, Google, and Microsoft, but _does not include credentials for using the services_.  To be able to use Handprint, you must **both** install a copy of Handprint on your computer **and** supply your copy with credentials for accessing the cloud services you want to use.
 4 | 
 5 | Installing credentials involves a one-time configuration step for **each** cloud-based HTR service after you install Handprint on a computer.  In each case, the same command format is used:
 6 | ```sh
 7 | handprint -a SERVICENAME CREDENTIALSFILE.json
 8 | ```
 9 | 
10 | _SERVICENAME_ must be one of the service names printed by running `handprint -l`, and `CREDENTIALSFILE.json` must have one of the formats discussed below.  When you run this command, Handprint copies `CREDENTIALSFILE.json` to a private location, and thereafter uses the credentials to access _SERVICENAME_.  (The private location is different on different systems; for example, on macOS it is `~/Library/Application Support/Handprint/`.)  Examples are given below.
11 | 
12 | 
13 | ## Microsoft
14 | 
15 | Microsoft's approach to credentials in Azure involves the use of [subscription keys](https://docs.microsoft.com/en-us/azure/cognitive-services/computer-vision/vision-api-how-to-topics/howtosubscribe).  The format of the credentials file for Handprint needs to contain two fields:
16 | 
17 | ```json
18 | {
19 |  "subscription_key": "YOURKEYHERE",
20 |  "endpoint": "https://ENDPOINT"
21 | }
22 | ```
23 | 
24 | The value `"YOURKEYHERE"` will be a string such as `"18de248475134eb49ae4a4e94b93461c"`, and it will be associated with an endpoint URI such as `"https://westus.api.cognitive.microsoft.com"`.  To obtain a key and the corresponding endpoint URI, visit [https://portal.azure.com](https://portal.azure.com) and sign in using your account login.  (Note: you will need to turn off browser security plugins such as Ad&nbsp;Block and uMatrix if you have them, or else the site will not work.)  Once you are authenticated to the Azure portal, you can create credentials for using Azure's machine-learning services.  Some notes about all this can be found in the [Handprint project Wiki pages on GitHub](https://github.com/caltechlibrary/handprint/wiki/Getting-Microsoft-Azure-credentials).
25 | 
26 | Once you have obtained both a key and an endpoint URI, use a text editor to create a JSON file in the simple format shown above, save that file somewhere on your computer (for the sake of this example, assume it is `myazurecredentials.json`), and use the command discussed above to make Handprint copy the credentials file:
27 | ```sh
28 | handprint -a microsoft myazurecredentials.json
29 | ```
30 | 
31 | 
32 | ## Google
33 | 
34 | Credentials for using a Google service account need to be stored in a JSON file that contains many fields.  The overall format looks like this:
35 | 
36 | ```json
37 | {
38 |   "type": "service_account",
39 |   "project_id": "theid",
40 |   "private_key_id": "thekey",
41 |   "private_key": "-----BEGIN PRIVATE KEY-----anotherkey-----END PRIVATE KEY-----\n",
42 |   "client_email": "emailaddress",
43 |   "client_id": "id",
44 |   "auth_uri": "https://accounts.google.com/o/oauth2/auth",
45 |   "token_uri": "https://oauth2.googleapis.com/token",
46 |   "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
47 |   "client_x509_cert_url": "someurl"
48 | }
49 | ```
50 | 
51 | The process for getting one of these is summarized in the Google Cloud docs for [Creating a service account](https://cloud.google.com/docs/authentication/), but more explicit instructions can be found in the [Handprint project Wiki pages on GitHub](https://github.com/caltechlibrary/handprint/wiki/Getting-Google-Cloud-credentials).  Once you have downloaded a Google credentials file from Google, save the file somewhere on your computer (for the sake of this example, assume it is `mygooglecredentials.json`), and use the command discussed above to make Handprint copy the credentials file:
52 | ```sh
53 | handprint -a google mygooglecredentials.json
54 | ```
55 | 
56 | 
57 | ## Amazon
58 | 
59 | Amazon credentials for AWS take the form of two alphanumeric strings: a _key id_ string and a _secret access key_ string.  In addition, the service needs to be invoked with a region identifier.  For the purposes of Handprint, these should be stored in a JSON file with the following format:
60 | 
61 | ```json
62 | {
63 |     "aws_access_key_id": "YOUR_KEY_ID_HERE",
64 |     "aws_secret_access_key": "YOUR_ACCESS_KEY_HERE",
65 |     "region_name": "YOUR_REGION_NAME_HERE"
66 | }
67 | ```
68 | 
69 | Getting this information is, thankfully, a relatively simple process for Amazon's services. Instructions can be found in the [Handprint project Wiki pages on GitHub](https://github.com/caltechlibrary/handprint/wiki/Creating-credentials-for-use-with-Amazon-Rekognition).  Once you have obtained the two alphanumeric keys and a region identifier string, use a text editor to create a JSON file in the simple format shown above, save that file somewhere on your computer (for the sake of this example, assume it is `myamazoncredentials.json`), and use _two_ commands to make Handprint copy the credentials file for the two different Amazon services currently supported by Handprint:
70 | ```sh
71 | handprint -a amazon-textract myamazoncredentials.json
72 | handprint -a amazon-rekognition myamazoncredentials.json
73 | ```
74 | 


--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
 1 | # Handprint<img align="right" width="400px" src="_static/media/glaser-example-google.jpg">
 2 | 
 3 | The _**Hand**written **p**age **r**ecognit**i**o**n** **t**est_ is a command-line program that invokes HTR (handwritten text recognition) services on images of document pages.  It can produce annotated images showing the results, compare the recognized text to expected text, save the HTR service results as JSON and text files, and more. It can be applied to any images of text documents. The images can come from your local computer's disk or from web locations.
 4 | 
 5 | Handprint currently supports Google's [Google Cloud Vision API](https://cloud.google.com/vision/docs/ocr), Microsoft's Azure [Computer Vision API](https://azure.microsoft.com/en-us/services/cognitive-services/computer-vision/), and Amazon's [Textract](https://aws.amazon.com/textract/) and [Rekognition](https://aws.amazon.com/rekognition/). (To use the services, you need to provide suitable account credentials.)
 6 | 
 7 | Handprint is written in Python. Ready-to-run executables are available for some operating systems; you can also install Handprint using common Python installation frameworks such as `pipx` and `pip`.
 8 | 
 9 | ## Sections
10 | 
11 | ```{toctree}
12 | ---
13 | maxdepth: 2
14 | ---
15 | installation.md
16 | configuration.md
17 | basic-usage.md
18 | advanced-usage.md
19 | command-summary.md
20 | known-issues.md
21 | colophon.md
22 | ```
23 | 


--------------------------------------------------------------------------------
/docs/installation.md:
--------------------------------------------------------------------------------
 1 | # Installation
 2 | 
 3 | This page describes how you can install Handprint on your computer. After installation, you will also need to perform some [configuration steps described elsewhere](configuration.md).
 4 | 
 5 | 
 6 | ## Preliminary requirements
 7 | 
 8 | Handprint is written in [Python 3](https://www.python.org) and makes use of some additional Python software libraries that are installed automatically during the [installation steps](#installation). It also assumes a macOS, Windows or Linux environment, and a working Internet connection.
 9 | 
10 | The instructions below assume you have a Python interpreter version 3.8 or higher installed on your computer; if that's not the case, please first install Python and familiarize yourself with running Python programs on your system. If you are unsure of which version of Python you have, you can find out by running the following command in a terminal and inspecting the results:
11 | ```sh
12 | # Note: on Windows, you may have to use "python" instead of "python3"
13 | python3 --version
14 | ```
15 | 
16 | Note for Mac users: if you are using macOS Catalina (10.15) or later and have never run `python3`, then the first time you do, macOS will ask you if you want to install the macOS command-line developer tools.  Go ahead and do so, as this is the easiest way to get a recent-enough Python&nbsp;3 on those systems.
17 | 
18 | 
19 | ## Installation instructions
20 | 
21 | There are multiple ways of installing Handprint, ranging from downloading a self-contained, single-file, ready-to-run program, to installing it as a typical Python program using `pip`.  Please choose the alternative that suits you.
22 | 
23 | ### Approach 1: using the standalone Handprint executables
24 | 
25 | Beginning with version 1.5.1, runnable self-contained single-file executables are available for select operating system and Python version combinations &ndash; to use them, you **only** need a Python&nbsp;3 interpreter and a copy of Handprint, but **do not** need to run `pip install` or other steps.
26 | 
27 | #### MacOS
28 | 
29 | Visit the [Handprint releases page](https://github.com/caltechlibrary/handprint/releases) and look for the ZIP files with names such as (e.g.) `handprint-1.5.4-macos-python3.8.zip`. Then:
30 | 1. Download the one matching your version of Python
31 | 2. Unzip the file (if your browser did not automatically unzip it for you)
32 | 3. Open the folder thus created (it will have a name like `handprint-1.5.4-macos-python3.8`)
33 | 4. Look inside for `handprint` and move it to a location where you put other command-line programs (e.g., `/usr/local/bin`)
34 | 
35 | #### Linux
36 | 
37 | Visit the [Handprint releases page](https://github.com/caltechlibrary/handprint/releases) and look for the ZIP files with names such as (e.g.) `handprint-1.5.4-linux-python3.8.zip`. Then:
38 | 1. Download the one matching your version of Python
39 | 2. Unzip the file (if your browser did not automatically unzip it for you)
40 | 3. Open the folder thus created (it will have a name like `handprint-1.5.4-linux-python3.8`)
41 | 4. Look inside for `handprint` and move it to a location where you put other command-line programs (e.g., `/usr/local/bin`)
42 | 
43 | #### Windows
44 | 
45 | Standalone executables for Windows are not available at this time. If you are running Windows, please use one of the other methods described below.
46 | 
47 | 
48 | ### Approach 2: using `pipx`
49 | 
50 | You can use [pipx](https://pypa.github.io/pipx/) to install Handprint. Pipx will install it into a separate Python environment that isolates the dependencies needed by Handprint from other Python programs on your system, and yet the resulting `handprint` command will be executable from any shell &ndash; like any normal application on your computer. If you do not already have `pipx` on your system, it can be installed in a variety of easy ways and it is best to consult [Pipx's installation guide](https://pypa.github.io/pipx/installation/) for instructions. Once you have pipx on your system, you can install Handprint with the following command:
51 | ```sh
52 | pipx install handprint
53 | ```
54 | 
55 | Pipx can also let you run Handprint directly using `pipx run handprint`, although in that case, you must always prefix every Handprint command with `pipx run`.  Consult the [documentation for `pipx run`](https://github.com/pypa/pipx#walkthrough-running-an-application-in-a-temporary-virtual-environment) for more information.
56 | 
57 | 
58 | ### Approach 3: using `pip`
59 | 
60 | If you prefer, you can install Handprint with [pip](https://pip.pypa.io/en/stable/installing/).  If you don't have the `pip` package or are uncertain whether you do, please consult the [pip installation instructions](https://pip.pypa.io/en/stable/installation/). Then, to install or upgrade Handprint from the Python package repository, run the following command:
61 | ```sh
62 | python3 -m pip install handprint --upgrade
63 | ```
64 | 


--------------------------------------------------------------------------------
/docs/known-issues.md:
--------------------------------------------------------------------------------
1 | # Known issues and limitations
2 | 
3 | Here are some known limitations in the current version of Handprint:
4 | * If the input has multiple pages, only the first page/image is used; the rest (if any) are ignored.
5 | * The Amazon Rekognition API will return [at most 50 words in an image](https://docs.aws.amazon.com/rekognition/latest/dg/limits.html).
6 | * The Microsoft Azure API will only detect a maximum of [300 lines of text per page](https://docs.microsoft.com/en-us/azure/cognitive-services/computer-vision/concept-recognizing-text).
7 | * Some services have different file size restrictions depending on the format of the file, but Handprint always uses the same limit for all files for a given service.  This is a code simplification.
8 | 


--------------------------------------------------------------------------------
/handprint/__init__.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Handprint: experiment with handwritten text recognition on Caltech Archives.
 3 | 
 4 | Authors
 5 | -------
 6 | 
 7 | Michael Hucka <mhucka@caltech.edu> -- Caltech Library
 8 | 
 9 | Copyright
10 | ---------
11 | 
12 | Copyright (c) 2018-2022 by the California Institute of Technology.  This code
13 | is open-source software released under a 3-clause BSD license.  Please see the
14 | file "LICENSE" for more information.
15 | '''
16 | 
17 | # Package metadata ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
18 | #
19 | #  ╭────────────────────── Notice ── Notice ── Notice ─────────────────────╮
20 | #  |    The following values are automatically updated at every release    |
21 | #  |    by the Makefile. Manual changes to these values will be lost.      |
22 | #  ╰────────────────────── Notice ── Notice ── Notice ─────────────────────╯
23 | 
24 | __version__     = '1.6.0'
25 | __description__ = 'Run handwritten text recognition services on images of documents'
26 | __url__         = 'https://github.com/caltechlibrary/handprint'
27 | __author__      = 'Michael Hucka'
28 | __email__       = 'mhucka@caltech.edu'
29 | __license__     = 'BSD 3-clause'
30 | 
31 | 
32 | # Miscellaneous utilities.
33 | # .............................................................................
34 | 
def print_version():
    '''Print the package name, version, author, URL and license to stdout.'''
    # One entry per output line, printed in the order listed here.
    summary = (
        f'{__name__} version {__version__}',
        f'Authors: {__author__}',
        f'URL: {__url__}',
        f'License: {__license__}',
    )
    for entry in summary:
        print(entry)
40 | 
41 | 
42 | # Miscellaneous constants.
43 | # .............................................................................
44 | 
45 | # Output format for the files we write.
46 | _OUTPUT_FORMAT = 'png'
47 | _OUTPUT_EXT = '.png'
48 | 


--------------------------------------------------------------------------------
/handprint/comparison.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | comparison.py: compare results to ground truth
  3 | 
  4 | Authors
  5 | -------
  6 | 
  7 | Michael Hucka <mhucka@caltech.edu> -- Caltech Library
  8 | 
  9 | Copyright
 10 | ---------
 11 | 
 12 | Copyright (c) 2019-2022 by the California Institute of Technology.  This code
 13 | is open-source software released under a 3-clause BSD license.  Please see the
 14 | file "LICENSE" for more information.
 15 | '''
 16 | 
 17 | from collections import namedtuple
 18 | 
 19 | import handprint
 20 | from handprint.exceptions import *
 21 | 
 22 | 
 23 | # Data structures.
 24 | # .............................................................................
 25 | 
 26 | Line = namedtuple('Line', 'number distance cer gt_text htr_text')
 27 | Line.__doc__ = '''Data about one line in the comparison results.
 28 |   'number' is the line number in the HTR text results
 29 |   'distance' is the Levenshtein distance between the HTR text and the g.t.
 30 |   'cer' is the character error for the HTR text line
 31 |   'gt_text' is the ground truth text line
 32 |   'htr_text' is the HTR text line
 33 | '''
 34 | 
 35 | 
 36 | # Constants.
 37 | # .............................................................................
 38 | 
 39 | _SIMILARITY_THRESHOLD = 0.5
 40 | 
 41 | _PUNCTUATION_REMOVER = str.maketrans('', '', '.,:;')
 42 | 
 43 | 
 44 | # Main functions.
 45 | # .............................................................................
 46 | 
 47 | def text_comparison(htr_text, gt_text, relaxed = False):
 48 |     '''Compare the HTR result text in "htr_text" with the expected ground truth
 49 |     text in "gt_text".  Returns a tab-separated table describing the results.
 50 | 
 51 |     This function accounts for the possibility that the HTR results may not
 52 |     contain a line of text for every line of ground truth, and conversely,
 53 |     may also contain lines of text that are not supposed to appear.  The
 54 |     approach uses a novel algorithm to compare the texts line-by-line using
 55 |     longest common subsequence similarity (as implemented by the LCSSEQ
 56 |     function in the Python "textdistance" package), to try to match up
 57 |     corresponding lines in the two texts before calculating Levenshtein
 58 |     distance and CER for each line individually.
 59 |     '''
 60 | 
 61 |     # This works by building up an intermediate data structure that consists
 62 |     # of a list of tuples of type "Line" (a named tuple).  Each has this form:
 63 |     #
 64 |     #    (htr line #,  Levenshtein error,  CER,  gt line text,  htr line text)
 65 |     #
 66 |     # If a line is missing from the htr text (relative to the gt text), its
 67 |     # line number is written as None.  If a line is missing from the gt text
 68 |     # (relative to the htr text), its value (in the "gt line text" column) is
 69 |     # written as ''.  The order of the list of tuples is important; in the
 70 |     # end it represents the entire list of text lines present in either the
 71 |     # gt text or htr text.  Here is an example showing missing lines:
 72 |     #
 73 |     #    htr line #   gt text             htr text
 74 |     #    ----------   -------            --------
 75 |     #        1        ""                  doc 01
 76 |     #        2        April 25, 2019      Avril 25, 2019
 77 |     #        3        My darling,         My darling,
 78 |     #      None       what a wonderful    ""
 79 |     #        4        day today was.      bay today vas.
 80 |     #        5        ""                  rooujjlh
 81 |     #
 82 |     # The final outcome for the above will have 6 lines, even though the ground
 83 |     # truth has 4 lines, in order to describe the fact that the HTR text
 84 |     # contains extra lines text at the beginning and end.  The HTR text is also
 85 |     # missing a line in the middle.
 86 | 
 87 |     # Algorithm:
 88 |     # 1) Go through the gt text lines one at a time in linear order, and
 89 |     #    compare each line to each line of the HTR text using LCSSEQ.  If
 90 |     #    a line in the gt text does not appear in the HTR text (judged by
 91 |     #    the LCSSEQ score not crossing a certain threshold), mark that line
 92 |     #    as missing; otherwise, store the Levenshtein distance and CER scores
 93 |     #    for that line in a tuple.
 94 |     #
 95 |     # 2) Go through the list of tuples and find all lines in the HTR text
 96 |     #    that do not exist in the gt text.
 97 |     #
 98 |     # 3) Go through this list of extra HTR lines and insert tuples in the
 99 |     #    correct locations in the main list of tuples.
100 |     #
101 |     # 4) Go through the list of tuples, add up error scores and other things
102 |     #    and produce the final output string.
103 | 
104 |     # Textdistance takes a long time to load.  Delay loading it until we need
105 |     # it so that the overall application startup times can be faster.
106 |     from textdistance import lcsseq
107 |     # Shorten this name for easier reading in the code below.
108 |     lcsseq_score = lcsseq.normalized_similarity
109 | 
110 |     gt_lines  = gt_text.strip().splitlines()
111 |     htr_lines = htr_text.strip().splitlines()
112 |     htr_index = 0
113 |     results   = []
114 | 
115 |     if relaxed:
116 |         gt_lines  = [text.lower() for text in gt_lines]
117 |         gt_lines  = [text.translate(_PUNCTUATION_REMOVER) for text in gt_lines]
118 |         htr_lines = [text.lower() for text in htr_lines]
119 |         htr_lines = [text.translate(_PUNCTUATION_REMOVER) for text in htr_lines]
120 | 
121 |     for gt_line in gt_lines:
122 |         htr_line = htr_lines[htr_index]
123 |         if lcsseq_score(gt_line, htr_line) >= _SIMILARITY_THRESHOLD:
124 |             results.append(line_data(gt_line, htr_line, htr_index))
125 |             htr_index += 1
126 |         else:
127 |             # LCSSEQ score too low => lines don't correspond.  Also means the
128 |             # line in the HTR text is something not found in the gt text.
129 |             # Check if any line later in the HTR text matches any better.
130 |             for other_index, other_line in enumerate(htr_lines[htr_index + 1:], 1):
131 |                 if lcsseq_score(gt_line, other_line) >= _SIMILARITY_THRESHOLD:
132 |                     # We found a matching line.
133 |                     htr_index += other_index
134 |                     results.append(line_data(gt_line, other_line, htr_index))
135 |                     break
136 |             else: # "else" for the for loop, not the if stmt!
137 |                 # Nothing sufficiently close. Treat as missing.
138 |                 results.append(line_data(gt_line, '', None))
139 | 
140 |     # Are there any lines in htr_text after the end of the lines in gt_text?
141 |     # If so, add them (as errors) to the results.
142 |     if len(htr_lines) - (htr_index + 1) > 0:
143 |         for index, line in enumerate(htr_lines[htr_index + 1:], htr_index + 1):
144 |             results.append(line_data('', line, index))
145 | 
146 |     # At this point, if there are gaps in the htr line numbers that we
147 |     # stored, it means those are extra lines in the beginning or middle of
148 |     # the htr text.  Find and insert those lines into the results list.
149 |     matched = [line.number for line in results if line.gt_text != '']
150 |     extra_lines = [i for i in range(0, len(htr_lines)) if i not in matched]
151 |     for index in extra_lines:
152 |         # Find the previous location in the results list.  We will insert a
153 |         # new tuple after it.
154 |         for pos, line in enumerate(results):
155 |             if line.number is not None and index < line.number:
156 |                 results.insert(pos, line_data('', htr_lines[index], index))
157 |                 break
158 | 
159 |     # We return data as 4 columns.
160 |     output = ['Errors\tCER (%)\tExpected text\tReceived text']
161 |     total_errors = 0
162 |     for line in results:
163 |         total_errors += line.distance
164 |         output.append('{}\t{}\t{}\t{}'.format(
165 |             line.distance, line.cer, line.gt_text, line.htr_text))
166 |     # Append total errors count, and we're done.
167 |     output.append('Total errors\t\t\t')
168 |     output.append(str(total_errors) + '\t\t\t')
169 |     return '\n'.join(output)
170 | 
171 | 
172 | # Helper functions.
173 | # ......................................................................
174 | 
def line_data(gt_line, htr_line, htr_index):
    '''Build the "Line" tuple comparing one ground truth line to one HTR line.

    Both lines are whitespace-normalized (leading and trailing whitespace is
    dropped, inner runs are collapsed to single spaces) before the
    Levenshtein distance between them is computed.  The CER is that distance
    as a percentage of the normalized ground truth length, formatted with two
    decimals; for an empty ground truth line it is reported as '100.00'.
    The value of "htr_index" is stored unchanged as the line number.
    '''
    from stringdist import levenshtein

    gt_clean  = ' '.join(gt_line.split())
    htr_clean = ' '.join(htr_line.split())
    edits = levenshtein(gt_clean, htr_clean)
    # The stringdist package definition of levenshtein_norm() divides
    # by the longest of the two strings, but it is more conventional in
    # OCR papers and software to divide by the length of the reference.
    if not gt_clean:
        error_rate = '100.00'
    else:
        error_rate = f'{100 * float(edits)/len(gt_clean):.2f}'
    return Line(htr_index, edits, error_rate, gt_clean, htr_clean)
189 | 


--------------------------------------------------------------------------------
/handprint/credentials/__init__.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Handprint module for handling credentials.
 3 | 
 4 | Authors
 5 | -------
 6 | 
 7 | Michael Hucka <mhucka@caltech.edu> -- Caltech Library
 8 | 
 9 | Copyright
10 | ---------
11 | 
12 | Copyright (c) 2018-2022 by the California Institute of Technology.  This code
13 | is open-source software released under a 3-clause BSD license.  Please see the
14 | file "LICENSE" for more information.
15 | '''
16 | 
17 | from .base import Credentials
18 | from .amazon_auth import AmazonCredentials
19 | from .google_auth import GoogleCredentials
20 | from .microsoft_auth import MicrosoftCredentials
21 | 


--------------------------------------------------------------------------------
/handprint/credentials/amazon_auth.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | amazon_auth.py: subclass of handprint.credentials.base
 3 | 
 4 | Authors
 5 | -------
 6 | 
 7 | Michael Hucka <mhucka@caltech.edu> -- Caltech Library
 8 | 
 9 | Copyright
10 | ---------
11 | 
12 | Copyright (c) 2018-2022 by the California Institute of Technology.  This code
13 | is open-source software released under a 3-clause BSD license.  Please see the
14 | file "LICENSE" for more information.
15 | '''
16 | 
17 | from   commonpy.file_utils import readable
18 | import json
19 | import os
20 | from   os import path
21 | from   sidetrack import log
22 | 
23 | import handprint
24 | from handprint.exceptions import *
25 | 
26 | from .base import Credentials
27 | from .credentials_files import credentials_filename
28 | 
29 | 
30 | # Main class.
31 | # .............................................................................
32 | 
class AmazonCredentials(Credentials):
    '''Credentials for the Amazon services, read from the installed JSON file.

    The file is looked up in the directory returned by credentials_dir()
    under the name given by credentials_filename('amazon'); its parsed JSON
    content is stored in self.credentials.
    '''

    def __init__(self):
        '''Load the installed Amazon credentials file.

        Raises AuthFailure if the file does not exist, is not readable, or
        cannot be read and parsed as JSON.
        '''
        super().__init__()
        cfile = path.join(self.credentials_dir(), credentials_filename('amazon'))
        if __debug__: log(f'credentials file for amazon is {cfile}')
        if not path.exists(cfile):
            raise AuthFailure('Credentials for Amazon have not been installed')
        if not readable(cfile):
            raise AuthFailure(f'Amazon credentials file unreadable: {cfile}')

        try:
            # JSON text is UTF-8; don't depend on the platform's default.
            with open(cfile, 'r', encoding = 'utf-8') as file:
                self.credentials = json.load(file)
        except Exception as ex:
            # Every read or parse failure is reported as AuthFailure, with
            # the original exception kept as the cause for debugging.
            raise AuthFailure(f'Unable to parse Amazon credentials file: {str(ex)}') from ex
47 | 


--------------------------------------------------------------------------------
/handprint/credentials/base.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | base.py: credentials base class
 3 | 
 4 | Authors
 5 | -------
 6 | 
 7 | Michael Hucka <mhucka@caltech.edu> -- Caltech Library
 8 | 
 9 | Copyright
10 | ---------
11 | 
12 | Copyright (c) 2018-2022 by the California Institute of Technology.  This code
13 | is open-source software released under a 3-clause BSD license.  Please see the
14 | file "LICENSE" for more information.
15 | '''
16 | 
17 | from   appdirs import user_config_dir
18 | from   commonpy.file_utils import copy_file
19 | import os
20 | from   os import path, makedirs
21 | from   sidetrack import log
22 | 
23 | import handprint
24 | 
25 | from .credentials_files import credentials_filename
26 | 
27 | 
28 | # Main class.
29 | # .............................................................................
30 | 
class Credentials(object):
    '''Base class for service credentials stored in the user's config area.

    Subclasses (e.g., AmazonCredentials) load a service-specific file from
    the shared credentials directory and store its content in the
    "credentials" attribute.
    '''

    # Directory where installed credentials files are kept, as reported by
    # appdirs' user_config_dir() for this application.
    creds_dir = user_config_dir('Handprint', 'CaltechLibrary')

    def __init__(self):
        # No credentials are loaded by the base class itself.
        self.credentials = None


    def creds(self):
        '''Return the credentials held by this object (None if not loaded).'''
        return self.credentials


    @classmethod
    def credentials_dir(cls):
        '''Return the directory in which credentials files are stored.'''
        return Credentials.creds_dir


    @classmethod
    def save_credentials(cls, service, supplied_file):
        '''Copy "supplied_file" into the credentials directory for "service".

        The destination file name is the one credentials_filename() gives
        for "service".  The credentials directory is created if needed.
        '''
        if not path.isdir(Credentials.creds_dir):
            if __debug__: log(f'creating credentials dir: {Credentials.creds_dir}.')
            # exist_ok: don't fail if the directory gets created by someone
            # else between the check above and this call.
            makedirs(Credentials.creds_dir, exist_ok = True)
        dest_file = path.join(Credentials.creds_dir, credentials_filename(service))
        copy_file(supplied_file, dest_file)
54 | 


--------------------------------------------------------------------------------
/handprint/credentials/credentials_files.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | credentials_files.py: mapping of services to credentials files
 3 | 
 4 | Authors
 5 | -------
 6 | 
 7 | Michael Hucka <mhucka@caltech.edu> -- Caltech Library
 8 | 
 9 | Copyright
10 | ---------
11 | 
12 | Copyright (c) 2018-2022 by the California Institute of Technology.  This code
13 | is open-source software released under a 3-clause BSD license.  Please see the
14 | file "LICENSE" for more information.
15 | '''
16 | 
# Mapping of service names to the names of their credentials files.  All of
# the Amazon service variants share a single credentials file.
CREDENTIALS_FILES = {
    **dict.fromkeys(('amazon', 'amazon-rekognition', 'amazon-textract'),
                    'amazon_credentials.json'),
    'google'    : 'google_credentials.json',
    'microsoft' : 'microsoft_credentials.json',
}

def credentials_filename(service):
    '''Return the name of the credentials file used for 'service'.

    The value of 'service' must be one of the keys of CREDENTIALS_FILES;
    this is checked with an assertion.
    '''
    is_known = service in CREDENTIALS_FILES
    assert is_known
    return CREDENTIALS_FILES[service]
28 | 


--------------------------------------------------------------------------------
/handprint/credentials/google_auth.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | google_auth.py: subclass of handprint.credentials.base
 3 | 
 4 | Authors
 5 | -------
 6 | 
 7 | Michael Hucka <mhucka@caltech.edu> -- Caltech Library
 8 | 
 9 | Copyright
10 | ---------
11 | 
12 | Copyright (c) 2018-2022 by the California Institute of Technology.  This code
13 | is open-source software released under a 3-clause BSD license.  Please see the
14 | file "LICENSE" for more information.
15 | '''
16 | 
17 | from   commonpy.file_utils import readable
18 | import json
19 | import os
20 | from   os import path
21 | from   sidetrack import log
22 | 
23 | import handprint
24 | from handprint.exceptions import *
25 | 
26 | from .base import Credentials
27 | from .credentials_files import credentials_filename
28 | 
29 | 
30 | # Main class.
31 | # .............................................................................
32 | 
class GoogleCredentials(Credentials):
    '''Credentials handler for the Google service.

    Rather than loading the contents of the credentials file, this class
    points the environment variable GOOGLE_APPLICATION_CREDENTIALS at the
    file, so creds() returns None for this service.
    '''

    def __init__(self):
        '''Locate the Google credentials file and export its path.

        Raises AuthFailure if the credentials file does not exist or is not
        readable.
        '''
        # Initialize the base class so that the "credentials" attribute
        # exists; without this, calling creds() raises AttributeError.
        super().__init__()

        cfile = path.join(self.credentials_dir(), credentials_filename('google'))
        if __debug__: log(f'credentials file for google is {cfile}')
        if not path.exists(cfile):
            raise AuthFailure('Credentials for Google have not been installed')
        elif not readable(cfile):
            raise AuthFailure(f'Google credentials file unreadable: {cfile}')

        # Loading the JSON content of the file into self.credentials has not
        # been made to work; only the environment variable approach has been
        # working, so that is what is used here.
        os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = cfile
48 | 


--------------------------------------------------------------------------------
/handprint/credentials/microsoft_auth.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | microsoft_auth.py: subclass of handprint.credentials.base
 3 | 
 4 | Authors
 5 | -------
 6 | 
 7 | Michael Hucka <mhucka@caltech.edu> -- Caltech Library
 8 | 
 9 | Copyright
10 | ---------
11 | 
12 | Copyright (c) 2018-2022 by the California Institute of Technology.  This code
13 | is open-source software released under a 3-clause BSD license.  Please see the
14 | file "LICENSE" for more information.
15 | '''
16 | 
17 | from   commonpy.file_utils import readable
18 | import json
19 | import os
20 | from   os import path
21 | from   sidetrack import log
22 | 
23 | import handprint
24 | from handprint.exceptions import *
25 | 
26 | from .base import Credentials
27 | from .credentials_files import credentials_filename
28 | 
29 | 
30 | # Constants.
31 | # .............................................................................
32 | 
# Endpoint URL used when the Microsoft credentials file has no 'endpoint' entry.
_DEFAULT_ENDPOINT = 'https://westus.api.cognitive.microsoft.com'
34 | 
35 | 
36 | # Main class.
37 | # .............................................................................
38 | 
class MicrosoftCredentials(Credentials):
    '''Credentials handler for the Microsoft service.

    The credentials are read from a JSON file in the credentials directory.
    The resulting object stored in the "credentials" attribute always has an
    'endpoint' entry: either the (normalized) value found in the file or the
    default endpoint.
    '''

    def __init__(self):
        '''Load the Microsoft credentials file.

        Raises AuthFailure if the file does not exist, is not readable, or
        cannot be parsed.
        '''
        super().__init__()

        cfile = path.join(self.credentials_dir(), credentials_filename('microsoft'))
        if __debug__: log(f'credentials file for microsoft is {cfile}')
        if not path.exists(cfile):
            raise AuthFailure('Credentials for Microsoft have not been installed')
        elif not readable(cfile):
            raise AuthFailure(f'Microsoft credentials file unreadable: {cfile}')

        try:
            with open(cfile, 'r') as file:
                creds = json.load(file)
            if 'endpoint' in creds:
                # Normalize the value: no trailing slash, and always a scheme.
                endpoint = creds['endpoint'].rstrip('/')
                if not endpoint.startswith('http'):
                    endpoint = 'https://' + endpoint
            else:
                if __debug__: log('endpoint not found; using default')
                endpoint = _DEFAULT_ENDPOINT
            creds['endpoint'] = endpoint
            self.credentials = creds
        except Exception as ex:
            # Any failure to read or interpret the file is reported as an
            # authentication problem, keeping the original cause attached.
            msg = f'Unable to parse Microsoft credentials file: {str(ex)}'
            raise AuthFailure(msg) from ex
62 | 


--------------------------------------------------------------------------------
/handprint/exceptions.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | exceptions.py: exceptions defined by Handprint
 3 | 
 4 | Authors
 5 | -------
 6 | 
 7 | Michael Hucka <mhucka@caltech.edu> -- Caltech Library
 8 | 
 9 | Copyright
10 | ---------
11 | 
12 | Copyright (c) 2018-2022 by the California Institute of Technology.  This code
13 | is open-source software released under a 3-clause BSD license.  Please see the
14 | file "LICENSE" for more information.
15 | '''
16 | 
17 | 
18 | # Base class.
19 | # .............................................................................
20 | # The base class makes it possible to use a single test to distinguish between
21 | # exceptions generated by Handprint and exceptions generated by something else.
22 | 
class HandprintException(Exception):
    '''Common ancestor of every exception defined by Handprint.'''


# Exception classes.
# .............................................................................

class CannotProceed(HandprintException):
    '''A recognizable condition forced the program to exit early.'''


class UserCancelled(HandprintException):
    '''The user chose to cancel or quit the program.'''


class NetworkFailure(HandprintException):
    '''An unrecoverable problem occurred during network operations.'''


class NoContent(HandprintException):
    '''Nothing was found at the given location.'''


class CorruptedContent(HandprintException):
    '''Corrupted content was detected.'''


class AuthFailure(HandprintException):
    '''Authentication credentials could not be obtained or used.'''


class ServiceFailure(HandprintException):
    '''An unrecoverable problem occurred with a remote service.'''


class RateLimitExceeded(HandprintException):
    '''The service reports that its rate limits have been exceeded.'''


class InternalError(HandprintException):
    '''An unrecoverable problem occurred within Handprint itself.'''
66 | 


--------------------------------------------------------------------------------
/handprint/exit_codes.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | exit_codes.py: define exit codes for program return values
 3 | 
 4 | Authors
 5 | -------
 6 | 
 7 | Michael Hucka <mhucka@caltech.edu> -- Caltech Library
 8 | 
 9 | Copyright
10 | ---------
11 | 
12 | Copyright (c) 2020-2022 by the California Institute of Technology.  This code
13 | is open-source software released under a 3-clause BSD license.  Please see the
14 | file "LICENSE" for more information.
15 | '''
16 | 
17 | from aenum import Enum, MultiValue
18 | 
19 | # I adapted the clever approach posted by the author of the Python aenum
20 | # package, Ethan Furman, to Stack Overflow on 2016-03-13 at
21 | # https://stackoverflow.com/a/35964875/743730
22 | # The most important bit is realizing you can define __int__().
23 | 
class ExitCode(Enum):
    '''Class of exit codes that this program may return.

    Each member is defined by a pair consisting of an integer code and a
    human-readable explanation of that code.

    The numeric value of a given code can be obtained by using int().  For
    example, int(ExitCode.success) will produce 0.
    '''

    # Settings interpreted by the aenum package.
    # NOTE(review): presumably _init_ names the attributes that receive the
    # elements of each member's tuple below ("value", then "meaning"), and
    # MultiValue lets a member be looked up by either element -- confirm
    # against the aenum documentation.
    _init_ = 'value meaning'
    _settings_ = MultiValue

    success        = 0, "success -- program completed normally"
    user_interrupt = 1, "the user interrupted the program's execution"
    bad_arg        = 2, "encountered a bad or missing value for an option"
    no_network     = 3, "no network detected -- cannot proceed"
    file_error     = 4, "file error -- encountered a problem with a file or directory"
    server_error   = 5, "server error -- encountered a problem with the server"
    exception      = 6, "an exception or fatal error occurred"

    def __int__(self):
        # Defining __int__ is what makes int(ExitCode.xyz) work.
        return self.value
44 | 


--------------------------------------------------------------------------------
/handprint/main_body.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | main_body.py: main loop for Handprint
  3 | 
  4 | Authors
  5 | -------
  6 | 
  7 | Michael Hucka <mhucka@caltech.edu> -- Caltech Library
  8 | 
  9 | Copyright
 10 | ---------
 11 | 
 12 | Copyright (c) 2018-2022 by the California Institute of Technology.  This code
 13 | is open-source software released under a 3-clause BSD license.  Please see the
 14 | file "LICENSE" for more information.
 15 | '''
 16 | 
 17 | from   bun import inform, alert, alert_fatal, warn
 18 | from   commonpy.interrupt import raise_for_interrupts
 19 | from   commonpy.data_utils import pluralized
 20 | from   commonpy.file_utils import filename_extension, filename_basename
 21 | from   commonpy.file_utils import files_in_directory, readable, writable
 22 | from   commonpy.string_utils import antiformat
 23 | import os
 24 | from   os.path import isfile, isdir, exists
 25 | import sys
 26 | 
 27 | if __debug__:
 28 |     from sidetrack import log
 29 | 
 30 | import handprint
 31 | from handprint import _OUTPUT_EXT, _OUTPUT_FORMAT
 32 | from handprint.credentials import Credentials
 33 | from handprint.exceptions import *
 34 | from handprint.exit_codes import ExitCode
 35 | from handprint.services import ACCEPTED_FORMATS, services_list
 36 | 
 37 | 
 38 | # Exported classes.
 39 | # .............................................................................
 40 | 
 41 | class MainBody(object):
 42 |     '''Main body for Handprint.'''
 43 | 
 44 |     def __init__(self, **kwargs):
 45 |         '''Initialize internal state.'''
 46 | 
 47 |         # Assign parameters to self to make them available within this object.
 48 |         for key, value in kwargs.items():
 49 |             if __debug__: log(f'parameter value self.{key} = {value}')
 50 |             setattr(self, key, value)
 51 | 
 52 |         # We expose an attribute "exception" that callers can use to find out
 53 |         # if the thread finished normally or with an exception.
 54 |         self.exception = None
 55 | 
 56 |         # The manager object manages the process of manipulating images and
 57 |         # sending them to the services.
 58 |         from handprint.manager import Manager
 59 |         self._manager = Manager(self.services, self.threads, self.output_dir,
 60 |                                 self.make_grid, self.compare, self.extended,
 61 |                                 self.text_size, self.text_color, self.text_shift,
 62 |                                 self.display, self.confidence, self.reuse_json)
 63 | 
 64 | 
 65 |     def run(self):
 66 |         '''Run the main body.'''
 67 | 
 68 |         if __debug__: log('running MainBody')
 69 |         try:
 70 |             self._do_preflight()
 71 |             self._do_main_work()
 72 |         except Exception as ex:
 73 |             if __debug__: log(f'exception in main body: {antiformat(str(ex))}')
 74 |             self.exception = sys.exc_info()
 75 |         if __debug__: log('finished MainBody')
 76 | 
 77 | 
 78 |     def stop(self):
 79 |         if __debug__: log('stopping ...')
 80 |         self._manager.stop_services()
 81 | 
 82 | 
 83 |     def _do_preflight(self):
 84 |         '''Check the option values given by the user, and do other prep.'''
 85 | 
 86 |         from commonpy.network_utils import network_available
 87 |         if not network_available():
 88 |             alert_fatal('No network connection.')
 89 |             raise CannotProceed(ExitCode.no_network)
 90 | 
 91 |         if self.from_file:
 92 |             if not exists(self.from_file):
 93 |                 alert_fatal(f'File not found: {self.from_file}')
 94 |                 raise CannotProceed(ExitCode.bad_arg)
 95 |             if not readable(self.from_file):
 96 |                 alert_fatal(f'File not readable: {self.from_file}')
 97 |                 raise CannotProceed(ExitCode.file_error)
 98 | 
 99 |         if self.output_dir:
100 |             if isdir(self.output_dir):
101 |                 if not writable(self.output_dir):
102 |                     alert_fatal(f'Directory not writable: {self.output_dir}')
103 |                     raise CannotProceed(ExitCode.file_error)
104 |             else:
105 |                 os.mkdir(self.output_dir)
106 |                 if __debug__: log(f'created output_dir directory {self.output_dir}')
107 | 
108 | 
109 |     def _do_main_work(self):
110 |         # Gather up some things and get prepared.
111 |         targets = self.targets_from_arguments()
112 |         if not targets:
113 |             alert_fatal('No images to process; quitting.')
114 |             raise CannotProceed(ExitCode.bad_arg)
115 |         num_targets = len(targets)
116 | 
117 |         inform(f'Given {pluralized("image", num_targets, True)} to work on.')
118 |         inform('Will apply results of {}: {}'.format(
119 |             pluralized('service', len(self.services), True),
120 |             ', '.join(self.services), num_targets))
121 |         inform(f'Will use credentials stored in {Credentials.credentials_dir()}/.')
122 |         if self.extended:
123 |             inform('Will save extended results.')
124 |         num_threads = min(self.threads, len(self.services))
125 |         inform(f'Will use up to {num_threads} process threads.')
126 | 
127 |         # Get to work.
128 |         if __debug__: log('initializing manager and starting processes')
129 |         import shutil
130 |         print_separators = num_targets > 1
131 |         rule = '─'*(shutil.get_terminal_size().columns or 80)
132 |         for index, item in enumerate(targets, start = 1):
133 |             # Check whether we've been interrupted before doing another item.
134 |             raise_for_interrupts()
135 |             # Process next item.
136 |             if print_separators:
137 |                 inform(rule)
138 |             self._manager.run_services(item, index, self.base_name)
139 |         if print_separators:
140 |             inform(rule)
141 | 
142 | 
143 |     def targets_from_arguments(self):
144 |         # Validator_collection takes a long time to load.  Delay loading it
145 |         # until needed, so that overall application startup time is faster.
146 |         from validator_collection.checkers import is_url
147 | 
148 |         targets = []
149 |         if self.from_file:
150 |             if __debug__: log(f'reading {self.from_file}')
151 |             targets = filter(None, open(self.from_file).read().splitlines())
152 |         else:
153 |             for item in self.files:
154 |                 if is_url(item):
155 |                     targets.append(item)
156 |                 elif isfile(item) and filename_extension(item) in ACCEPTED_FORMATS:
157 |                     targets.append(item)
158 |                 elif isdir(item):
159 |                     # It's a directory, so look for files within.
160 |                     targets += files_in_directory(item, extensions = ACCEPTED_FORMATS)
161 |                 else:
162 |                     warn(f'"{item}" not a file or directory')
163 | 
164 |         # Filter files created in past runs.
165 |         targets = filter(lambda name: '.handprint' not in name, targets)
166 | 
167 |         # If there is both a file in the format we generate and another
168 |         # format of that file, ignore the other formats and just use ours.
169 |         # Note: the value of targets is an iterator, but b/c it's tested inside
170 |         # the loop, a separate list is needed (else get unexpected results).
171 |         targets = list(targets)
172 |         keep = []
173 |         for item in targets:
174 |             ext  = filename_extension(item)
175 |             base = filename_basename(item)
176 |             if ext != _OUTPUT_EXT and (base + _OUTPUT_EXT in targets):
177 |                 # png version of file is also present => skip this other version
178 |                 continue
179 |             keep.append(item)
180 |         return keep
181 | 


--------------------------------------------------------------------------------
/handprint/services/__init__.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Handprint module for interfacing to text recognition cloud services.
 3 | 
 4 | Authors
 5 | -------
 6 | 
 7 | Michael Hucka <mhucka@caltech.edu> -- Caltech Library
 8 | 
 9 | Copyright
10 | ---------
11 | 
12 | Copyright (c) 2018-2022 by the California Institute of Technology.  This code
13 | is open-source software released under a 3-clause BSD license.  Please see the
14 | file "LICENSE" for more information.
15 | '''
16 | 
17 | from .amazon import AmazonRekognitionTR, AmazonTextractTR
18 | from .google import GoogleTR
19 | from .microsoft import MicrosoftTR
20 | 
# File name extensions of the image formats that can be handled.
ACCEPTED_FORMATS = (
    '.jpg',
    '.jpeg',
    '.jp2',
    '.pdf',
    '.png',
    '.gif',
    '.bmp',
    '.tif',
    '.tiff',
)
23 | 
# Mapping of service names to the classes that implement the services.
KNOWN_SERVICES = {
    'amazon-rekognition': AmazonRekognitionTR,
    'amazon-textract': AmazonTextractTR,
    'google': GoogleTR,
    'microsoft': MicrosoftTR,
}
30 | 
# The sorted list of names is computed once, when this module is loaded, so
# that it does not have to be rebuilt on every call to services_list().
SERVICES_LIST = sorted(KNOWN_SERVICES)

def services_list():
    '''Return the sorted list of the names of all known services.'''
    return SERVICES_LIST
36 | 


--------------------------------------------------------------------------------
/handprint/services/amazon.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | amazon.py: interface to Amazon network services Rekognition and Textract
  3 | 
  4 | Authors
  5 | -------
  6 | 
  7 | Michael Hucka <mhucka@caltech.edu> -- Caltech Library
  8 | 
  9 | Copyright
 10 | ---------
 11 | 
 12 | Copyright (c) 2018-2022 by the California Institute of Technology.  This code
 13 | is open-source software released under a 3-clause BSD license.  Please see the
 14 | file "LICENSE" for more information.
 15 | '''
 16 | 
 17 | from   commonpy.file_utils import readable, relative
 18 | from   commonpy.interrupt import raise_for_interrupts
 19 | import imagesize
 20 | import os
 21 | import sys
 22 | 
 23 | if __debug__:
 24 |     from sidetrack import log
 25 | 
 26 | import handprint
 27 | from handprint.credentials.amazon_auth import AmazonCredentials
 28 | from handprint.exceptions import *
 29 | from handprint.services.base import TextRecognition, TRResult, Box
 30 | 
 31 | 
 32 | # Main class.
 33 | # .............................................................................
 34 | 
 35 | class AmazonTR(TextRecognition):
 36 |     '''Base class for Amazon text recognition services.'''
 37 | 
 38 |     def init_credentials(self):
 39 |         '''Initializes the credentials to use for accessing this service.'''
 40 |         try:
 41 |             if __debug__: log('initializing credentials')
 42 |             self._credentials = AmazonCredentials().creds()
 43 |         except Exception as ex:
 44 |             raise AuthFailure(str(ex))
 45 | 
 46 | 
 47 |     def max_rate(self):
 48 |         '''Returns the number of calls allowed per second.'''
 49 |         # https://docs.aws.amazon.com/general/latest/gr/aws_service_limits.html#limits_textract
 50 |         return 0.25
 51 | 
 52 | 
 53 |     def max_size(self):
 54 |         '''Returns the maximum size of an acceptable image, in bytes.'''
 55 |         # https://docs.aws.amazon.com/textract/latest/dg/textract-dg.pdf#limits
 56 |         return 10*1024*1024
 57 | 
 58 | 
 59 |     def max_dimensions(self):
 60 |         '''Maximum image size as a tuple of pixel numbers: (width, height).
 61 |         A value of None indicates the limits are unknown.'''
 62 |         # I can't find a limit stated in the Amazon docs.
 63 |         return None
 64 | 
 65 | 
 66 |     # General scheme of things:
 67 |     #
 68 |     # * Return errors (via TRResult) if a result could not be obtained
 69 |     #   because of an error specific to a particular path/item.  The guiding
 70 |     #   principle here is: if the calling loop is processing multiple items,
 71 |     #   can it be expected to be able to go on to the next item if this error
 72 |     #   occurred?
 73 |     #
 74 |     # * Raises exceptions if a problem occurs that should stop the calling
 75 |     #   code from continuing with this service.  This includes things like
 76 |     #   authentication failures, because authentication failures tend to
 77 |     #   involve all uses of a service and not just a specific item.
 78 |     #
 79 |     # * Otherwise, returns a TRResult if successful.
 80 | 
 81 |     def amazon_result(self, file_path, variant, method, image_keyword,
 82 |                       result_key, value_key, block_key, result):
 83 |         '''Returns the result from calling the service on the 'file_path'.
 84 |         The result is returned as an TRResult named tuple.
 85 |         '''
 86 | 
 87 |         # Delay loading the API packages until needed because they take time to
 88 |         # load.  Doing this speeds up overall application start time.
 89 |         import boto3
 90 |         import botocore
 91 | 
 92 |         if not result:
 93 |             # If any exceptions occur, let them be passed to caller.
 94 |             (image, error) = self._image_from_file(file_path)
 95 |             if error:
 96 |                 return TRResult(path = file_path, data = {}, boxes = [],
 97 |                                 text = '', error = error)
 98 |             try:
 99 |                 if __debug__: log(f'setting up Amazon client function "{variant}"')
100 |                 creds = self._credentials
101 |                 session = boto3.session.Session()
102 |                 client = session.client(variant, region_name = creds['region_name'],
103 |                                         aws_access_key_id = creds['aws_access_key_id'],
104 |                                         aws_secret_access_key = creds['aws_secret_access_key'])
105 |                 if __debug__: log('calling Amazon API function')
106 |                 result = getattr(client, method)( **{ image_keyword : {'Bytes': image} })
107 |                 if __debug__: log(f'received {len(result[result_key])} blocks')
108 |             except botocore.exceptions.EndpointConnectionError as ex:
109 |                 raise AuthFailure(f'Problem with credentials file -- {str(ex)}')
110 |             except KeyboardInterrupt as ex:
111 |                 raise
112 |             except KeyError as ex:
113 |                 msg = f'Amazon credentials file is missing {",".join(ex.args)}'
114 |                 raise AuthFailure(msg)
115 |             except Exception as ex:
116 |                 if getattr(ex, 'response', False) and 'Error' in ex.response:
117 |                     error = ex.response['Error']
118 |                     code = error['Code']
119 |                     text = error['Message']
120 |                     path = relative(file_path)
121 |                     if code in ['UnsupportedDocumentException', 'BadDocumentException']:
122 |                         msg = f'Amazon {variant} reports bad or corrupted image in {path}'
123 |                         raise CorruptedContent(msg)
124 |                     elif code in ['InvalidSignatureException', 'UnrecognizedClientException']:
125 |                         raise AuthFailure(f'Problem with credentials file -- {text}')
126 |                 # Fallback if we can't get details.
127 |                 if __debug__: log(f'Amazon returned exception {str(ex)}')
128 |                 msg = f'Amazon {variant} failure for {path} -- {error["Message"]}'
129 |                 raise ServiceFailure(msg)
130 | 
131 |         raise_for_interrupts()
132 |         full_text = ''
133 |         boxes = []
134 |         width, height = imagesize.get(file_path)
135 |         if __debug__: log(f'parsing Amazon result for {relative(file_path)}')
136 |         for block in result[result_key]:
137 |             if value_key not in block:
138 |                 continue
139 |             kind = block[value_key].lower()
140 |             if kind in ['word', 'line']:
141 |                 text = block[block_key]
142 |                 corners = corner_list(block['Geometry']['Polygon'], width, height)
143 |                 if corners:
144 |                     boxes.append(Box(kind = kind, bb = corners, text = text,
145 |                                      score = block['Confidence'] / 100))
146 |                 else:
147 |                     # Something's wrong with the vertex list. Skip & continue.
148 |                     if __debug__: log(f'bad bb for {text}: {bb}')
149 |             if kind == "line":
150 |                 if 'Text' in block:
151 |                     full_text += block['Text'] + '\n'
152 |                 elif 'DetectedText' in block:
153 |                     full_text += block['DetectedText'] + '\n'
154 |         return TRResult(path = file_path, data = result, boxes = boxes,
155 |                         text = full_text, error = None)
156 | 
157 | 
class AmazonTextractTR(AmazonTR):
    '''Subclass of AmazonTR for the Textract service.'''

    @classmethod
    def name(cls):
        '''Returns the canonical internal name for this service.'''
        return "amazon-textract"


    @classmethod
    def name_color(cls):
        '''Returns a color code for this service.  See the color definitions
        in messages.py.'''
        return 'light_goldenrod2'


    def result(self, file_path, saved_result = None):
        '''Returns the result from calling the service on the 'file_path'.
        The result is returned as an TRResult named tuple.

        If 'saved_result' is given and is not empty, it is used as the
        service's result instead of contacting the service.  The parameter
        is optional, as it is in AmazonRekognitionTR.result().
        '''
        return self.amazon_result(file_path, 'textract',
                                  'detect_document_text',
                                  'Document',
                                  'Blocks',     # result_key
                                  'BlockType',  # value_key
                                  'Text',       # block_key
                                  saved_result)
185 | 
186 | 
class AmazonRekognitionTR(AmazonTR):
    '''Interface to the Rekognition variant of Amazon's services.'''

    @classmethod
    def name(cls):
        '''Return the canonical internal name of this service.'''
        return 'amazon-rekognition'


    @classmethod
    def name_color(cls):
        '''Return the color code of this service.  The color definitions
        are in messages.py.'''
        return 'dark_orange'


    def result(self, file_path, saved_result = None):
        '''Return the outcome of applying the service to 'file_path', in the
        form of a TRResult named tuple.
        '''
        # The values that are specific to Rekognition's API and output.
        settings = dict(variant       = 'rekognition',
                        method        = 'detect_text',
                        image_keyword = 'Image',
                        result_key    = 'TextDetections',
                        value_key     = 'Type',
                        block_key     = 'DetectedText',
                        result        = saved_result)
        return self.amazon_result(file_path, **settings)
214 | 
215 | # Miscellaneous utilities.
216 | # .............................................................................
217 | 
def corner_list(polygon, width, height):
    '''Convert a polygon from Amazon's JSON output into a flat list of pixel
    coordinates of the form [x y x y ...], in the order in which the corners
    appear in 'polygon'.  The input coordinates are fractions of the image
    size, so they are scaled by 'width' and 'height'.  If any corner lacks
    an 'X' or 'Y' value, the result is an empty list.'''
    flattened = []
    for vertex in polygon:
        if not ('X' in vertex and 'Y' in vertex):
            return []
        # Scale the fractional values to pixels.
        x_pixel = int(round(vertex['X'] * width))
        y_pixel = int(round(vertex['Y'] * height))
        flattened.extend((x_pixel, y_pixel))
    return flattened
231 | 


--------------------------------------------------------------------------------
/handprint/services/base.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | base.py: base class definition for text recognition systems.
  3 | 
  4 | Authors
  5 | -------
  6 | 
  7 | Michael Hucka <mhucka@caltech.edu> -- Caltech Library
  8 | 
  9 | Copyright
 10 | ---------
 11 | 
 12 | Copyright (c) 2018-2022 by the California Institute of Technology.  This code
 13 | is open-source software released under a 3-clause BSD license.  Please see the
 14 | file "LICENSE" for more information.
 15 | '''
 16 | 
 17 | from   collections import namedtuple
 18 | from   commonpy.file_utils import readable, relative
 19 | import imagesize
 20 | 
 21 | if __debug__:
 22 |     from sidetrack import log
 23 | 
 24 | 
 25 | # Named tuple definitions.
 26 | # .............................................................................
 27 | 
 28 | TRResult = namedtuple('TRResult', 'path data text boxes error')
 29 | TRResult.__doc__ = '''Results of invoking a text recognition service.
 30 |   'path' is the file path or URL of the item in question
 31 |   'data' is the full data result as a Python dict (or {} in case of error)
 32 |   'text' is the extracted text as a string (or '' in case of error)
 33 |   'boxes' is a list of text boxes
 34 |   'error' is None if no error occurred, or the text of any error messages
 35 | '''
 36 | 
 37 | Box = namedtuple('Box', 'kind bb text score')
 38 | Box.__doc__ = '''Representation of a single box, possibly containing text.
 39 |   'kind' is the type; this can be "word", "line", "paragraph".
 40 |   'bb' is the bounding box, as XY coordinates of corners starting with u.l.
 41 |   'text' is text (when the box contains text).
 42 |   'score' is the confidence score given to this item by the service.
 43 | '''
 44 | 
 45 | 
 46 | # Class definitions.
 47 | # .............................................................................
 48 | # Basics for the __eq__ etc. methods came from
 49 | # https://stackoverflow.com/questions/1061283/lt-instead-of-cmp
 50 | 
 51 | class TextRecognition(object):
 52 |     def __init__(self):
 53 |         pass
 54 | 
 55 | 
 56 |     def __str__(self):
 57 |         return self.name()
 58 | 
 59 | 
 60 |     def __repr__(self):
 61 |         return self.name()
 62 | 
 63 | 
 64 |     def __eq__(self, other):
 65 |         if isinstance(other, type(self)):
 66 |             return self.__dict__ == other.__dict__
 67 |         return NotImplemented
 68 | 
 69 | 
 70 |     def __ne__(self, other):
 71 |         # Based on lengthy Stack Overflow answer by user "Maggyero" posted on
 72 |         # 2018-06-02 at https://stackoverflow.com/a/50661674/743730
 73 |         eq = self.__eq__(other)
 74 |         if eq is not NotImplemented:
 75 |             return not eq
 76 |         return NotImplemented
 77 | 
 78 | 
 79 |     def __lt__(self, other):
 80 |         return self.name() < other.name()
 81 | 
 82 | 
 83 |     def __gt__(self, other):
 84 |         if isinstance(other, type(self)):
 85 |             return other.name() < self.name()
 86 |         return NotImplemented
 87 | 
 88 | 
 89 |     def __le__(self, other):
 90 |         if isinstance(other, type(self)):
 91 |             return not other.name() < self.name()
 92 |         return NotImplemented
 93 | 
 94 | 
 95 |     def __ge__(self, other):
 96 |         if isinstance(other, type(self)):
 97 |             return not self.name() < other.name()
 98 |         return NotImplemented
 99 | 
100 | 
101 |     def init_credentials(self):
102 |         '''Initializes the credentials to use for accessing this service.'''
103 |         pass
104 | 
105 | 
106 |     def name(self):
107 |         '''Returns the canonical internal name for this service.'''
108 |         pass
109 | 
110 | 
111 |     def name_color(self):
112 |         '''Returns a color code for this service.  See the color definitions
113 |         in messages.py.'''
114 |         pass
115 | 
116 | 
117 |     def max_rate(self):
118 |         '''Returns the number of calls allowed per second.'''
119 |         pass
120 | 
121 | 
122 |     def max_size(self):
123 |         '''Returns the maximum size of an acceptable image, in bytes.'''
124 |         pass
125 | 
126 | 
127 |     def max_dimensions(self):
128 |         '''Maximum image size as a tuple of pixel numbers: (width, height).'''
129 |         pass
130 | 
131 | 
132 |     def result(self, path, saved_result = None):
133 |         '''Returns the text recognition results from the service as an
134 |         TRResult named tuple.  If a saved result is supplied, use that.
135 |         '''
136 |         pass
137 | 
138 | 
139 |     def _image_from_file(self, file_path):
140 |         '''Helper function for subclasses to read image files.
141 |         Returns a tuple, (image, error), where "error" is a TRResult with a
142 |         non-empty error field value if an error occurred, and "image" is the
143 |         bytes of the image if it was successfully read.
144 |         '''
145 | 
146 |         def error_result(error_text):
147 |             return (None, TRResult(path = file_path, data = {}, text = '',
148 |                                    error = error_text, boxes = []))
149 | 
150 |         rel_path = relative(file_path)
151 |         if not readable(file_path):
152 |             return error_result(f'Unable to read file: {rel_path}')
153 |         if __debug__: log(f'reading {rel_path} for {self.name()}')
154 |         with open(file_path, 'rb') as image_file:
155 |             image = image_file.read()
156 |         if len(image) == 0:
157 |             return error_result(f'Empty file: {rel_path}')
158 |         if len(image) > self.max_size():
159 |             text = f'Exceeds {self.max_size()} byte limit for service: {rel_path}'
160 |             return error_result(text)
161 |         width, height = imagesize.get(file_path)
162 |         if __debug__: log(f'image size is width = {width}, height = {height}')
163 |         if self.max_dimensions():
164 |             max_width, max_height = self.max_dimensions()
165 |             if width > max_width or height > max_height:
166 |                 text = f'Dimensions {width}x{height} exceed {self.name()} limits: {rel_path}'
167 |                 return error_result(text)
168 |         return (image, None)
169 | 


--------------------------------------------------------------------------------
/handprint/services/google.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | google.py: interface to Google text recognition network service
  3 | 
  4 | Authors
  5 | -------
  6 | 
  7 | Michael Hucka <mhucka@caltech.edu> -- Caltech Library
  8 | 
  9 | Copyright
 10 | ---------
 11 | 
 12 | Copyright (c) 2018-2022 by the California Institute of Technology.  This code
 13 | is open-source software released under a 3-clause BSD license.  Please see the
 14 | file "LICENSE" for more information.
 15 | '''
 16 | 
 17 | from   commonpy.file_utils import relative
 18 | from   commonpy.interrupt import raise_for_interrupts
 19 | import io
 20 | import json
 21 | import math
 22 | import os
 23 | import json
 24 | 
 25 | if __debug__:
 26 |     from sidetrack import log
 27 | 
 28 | import handprint
 29 | from handprint.credentials.google_auth import GoogleCredentials
 30 | from handprint.exceptions import *
 31 | from handprint.services.base import TextRecognition, TRResult, Box
 32 | 
 33 | 
 34 | # Main class.
 35 | # .............................................................................
 36 | 
 37 | class GoogleTR(TextRecognition):
 38 |     # The following is based on the table of Google Cloud Vision features at
 39 |     # https://cloud.google.com/vision/docs/reference/rpc/google.cloud.vision.v1p3beta1#type_1
 40 |     # as of 2018-10-25.
 41 |     _known_features = ['document_text_detection']
 42 | 
 43 | 
 44 |     def init_credentials(self):
 45 |         '''Initializes the credentials to use for accessing this service.'''
 46 |         try:
 47 |             if __debug__: log('initializing credentials')
 48 |             GoogleCredentials()
 49 |         except Exception as ex:
 50 |             raise AuthFailure(str(ex))
 51 | 
 52 | 
 53 |     @classmethod
 54 |     def name(self):
 55 |         '''Returns the canonical internal name for this service.'''
 56 |         return "google"
 57 | 
 58 | 
 59 |     @classmethod
 60 |     def name_color(self):
 61 |         '''Returns a color code for this service.  See the color definitions
 62 |         in messages.py.'''
 63 |         return 'deep_sky_blue1'
 64 | 
 65 | 
 66 |     def max_rate(self):
 67 |         '''Returns the number of calls allowed per second.'''
 68 |         # https://cloud.google.com/vision/quotas
 69 |         return 30
 70 | 
 71 | 
 72 |     def max_size(self):
 73 |         '''Returns the maximum size of an acceptable image, in bytes.'''
 74 |         # https://cloud.google.com/vision/docs/supported-files
 75 |         # Google Cloud Vision API docs state that images can't exceed 20 MB
 76 |         # but the JSON request size limit is 10 MB.  We hit the 10 MB limit
 77 |         # even though we're using the Google API library, which I guess must
 78 |         # be transferring JSON under the hood.
 79 |         return 10*1024*1024
 80 | 
 81 | 
 82 |     def max_dimensions(self):
 83 |         '''Maximum image size as a tuple of pixel numbers: (width, height).'''
 84 |         # No max dimensions are given in the Google docs.
 85 |         return None
 86 | 
 87 | 
 88 |     # General scheme of things:
 89 |     #
 90 |     # * Return errors (via TRResult) if a result could not be obtained
 91 |     #   because of an error specific to a particular path/item.  The guiding
 92 |     #   principle here is: if the calling loop is processing multiple items,
 93 |     #   can it be expected to be able to go on to the next item if this error
 94 |     #   occurred?
 95 |     #
 96 |     # * Raises exceptions if a problem occurs that should stop the calling
 97 |     #   code from continuing with this service.  This includes things like
 98 |     #   authentication failures, because authentication failures tend to
 99 |     #   involve all uses of a service and not just a specific item.
100 |     #
101 |     # * Otherwise, returns a TRResult if successful.
102 | 
103 |     def result(self, path, result = None):
104 |         '''Returns the result from calling the service on the 'file_path'.
105 |         The result is returned as an TRResult named tuple.
106 |         '''
107 | 
108 |         # Delay loading the API packages until needed because they take time to
109 |         # load.  Doing this speeds up overall application start time.
110 |         import google
111 |         from google.cloud import vision_v1 as gv
112 |         from google.api_core.exceptions import PermissionDenied
113 |         from google.protobuf.json_format import MessageToDict
114 | 
115 |         if not result:
116 |             # Read the image and proceed with contacting the service.
117 |             (image, error) = self._image_from_file(path)
118 |             if error:
119 |                 return error
120 | 
121 |             if __debug__: log(f'building Google API object for {relative(path)}')
122 |             try:
123 |                 client  = gv.ImageAnnotatorClient()
124 |                 params  = gv.TextDetectionParams(
125 |                     mapping = { 'enable_text_detection_confidence_score': True })
126 |                 context = gv.ImageContext(language_hints = ['en-t-i0-handwrit'],
127 |                                           text_detection_params = params)
128 |                 img     = gv.Image(content = image)
129 |                 if __debug__: log(f'sending image to Google for {relative(path)} ...')
130 |                 response = client.document_text_detection(image = img,
131 |                                                           image_context = context)
132 |                 if __debug__: log(f'received result from Google for {relative(path)}')
133 |                 result = dict_from_response(response)
134 |             except google.api_core.exceptions.PermissionDenied as ex:
135 |                 text = 'Authentication failure for Google service -- {}'.format(ex)
136 |                 raise AuthFailure(text)
137 |             except google.auth.exceptions.DefaultCredentialsError as ex:
138 |                 text = 'Credentials file error for Google service -- {}'.format(ex)
139 |                 raise AuthFailure(text)
140 |             except google.api_core.exceptions.ServiceUnavailable as ex:
141 |                 text = 'Network, service, or Google configuration error -- {}'.format(ex)
142 |                 raise ServiceFailure(text)
143 |             except KeyboardInterrupt as ex:
144 |                 raise
145 |             except Exception as ex:
146 |                 if isinstance(ex, KeyError):
147 |                     # Can happen if you control-C in the middle of the Google call.
148 |                     # Result is "Exception ignored in: 'grpc._cython.cygrpc._next'"
149 |                     # printed to the terminal and we end up here.
150 |                     raise KeyboardInterrupt
151 |                 else:
152 |                     text = 'Error: {} -- {}'.format(str(ex), path)
153 |                     return TRResult(path = path, data = {}, boxes = [],
154 |                                     text = '', error = text)
155 | 
156 |         raise_for_interrupts()
157 |         boxes = []
158 |         # See this page for more information about the structure:
159 |         # https://cloud.google.com/vision/docs/handwriting#python
160 |         if len(result['full_text_annotation']['pages']) > 1:
161 |             warn('More than one page received from Google; using only first.')
162 |         for block in result['full_text_annotation']['pages'][0]['blocks']:
163 |             for para in block['paragraphs']:
164 |                 corners = corner_list(para['bounding_box']['vertices'])
165 |                 boxes.append(Box(bb = corners, kind = 'para', text = '',
166 |                                  score = para['confidence']))
167 |                 for word in para['words']:
168 |                     text = ''
169 |                     for symbol in word['symbols']:
170 |                         text += symbol['text']
171 |                     corners = corner_list(word['bounding_box']['vertices'])
172 |                     if corners:
173 |                         boxes.append(Box(bb = corners, kind = 'word',
174 |                                          text = text, score = para['confidence']))
175 |                     else:
176 |                         # Something is wrong with the vertex list.
177 |                         # Skip it and continue.
178 |                         if __debug__: log(f'bad bb for {text}: {bb}')
179 |         full_text = result['full_text_annotation']['text']
180 |         return TRResult(path = path, data = result,
181 |                         boxes = boxes, text = full_text, error = None)
182 | 
183 | 
184 | # Miscellaneous utilities
185 | # .............................................................................
186 | 
187 | # Grrrr.  The Google API can return incomplete vertices for a bounding box.
188 | # In one of our sample images ("pbm-2421-PBM_3_1_1_0016"), I get this result:
189 | # [{'x': 2493}, {'x': 2538, 'y': 1}, {'x': 2535, 'y': 154}, {'x': 2490, 'y': 153}]
190 | # So, we have to test to make sure both 'x' and 'y' keys are in every vertex.
191 | 
def corner_list(vertices):
    '''Takes a boundingBox value from Google Vision's output and returns
    a condensed version, in the form [x y x y x y x y], with the first x, y
    pair representing the upper left corner.

    Returns an empty list if there are fewer than 4 vertices or if any
    vertex lacks an 'x' or a 'y' value, so that callers can detect and skip
    unusable bounding boxes.
    '''
    if len(vertices) < 4:
        return []
    corners = []
    for vertex in vertices:
        # Google can return incomplete vertices such as {'x': 2493}.
        if 'x' not in vertex or 'y' not in vertex:
            return []
        corners.append(vertex['x'])
        corners.append(vertex['y'])
    return corners
203 | 
204 | 
205 | # In more recent versions of googleapis-common-protos, MessageToDict is no
206 | # longer directly available as it was before.  See this GitHub issue answer:
207 | # https://github.com/googleapis/python-memcache/issues/19#issuecomment-708516816
208 | # The following builds on an answer by user Tobiasz Kędzierski given at
209 | # https://github.com/googleapis/python-memcache/issues/19#issuecomment-709628506
210 | 
def dict_from_response(response):
    '''Returns the content of a Google API response object as a Python dict.

    AnnotateImageResponse objects are converted using their class's own
    to_dict() method; any other object is handed to protobuf's
    MessageToDict().
    '''
    # Import the submodule explicitly: a bare "import google" only works if
    # some other code has already loaded google.cloud.vision_v1.
    import google.cloud.vision_v1
    if isinstance(response, google.cloud.vision_v1.types.image_annotator.AnnotateImageResponse):
        return response.__class__.to_dict(response)
    else:
        # MessageToDict must be imported here; elsewhere in this file it is
        # only imported inside a method and so is not visible at this level.
        from google.protobuf.json_format import MessageToDict
        return MessageToDict(response)
217 | 
218 | 
def json_from_response(response):
    '''Returns the content of a Google API response as a JSON string.'''
    as_dict = dict_from_response(response)
    return json.dumps(as_dict)
221 | 


--------------------------------------------------------------------------------
/pubs/joss/.gitignore:
--------------------------------------------------------------------------------
1 | auto
2 | whedon
3 | paper.pdf
4 | 


--------------------------------------------------------------------------------
/pubs/joss/H96566k.handprint-all.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/pubs/joss/H96566k.handprint-all.png


--------------------------------------------------------------------------------
/pubs/joss/Makefile:
--------------------------------------------------------------------------------
  1 | # =============================================================================
  2 | # @file    Makefile
  3 | # @brief   Makefile for generating a PDF version of a JOSS paper, to preview it
  4 | # @author  Michael Hucka <mhucka@caltech.edu>
  5 | # @license https://github.com/caltechlibrary/template/blob/main/LICENSE
  6 | # @website https://gist.github.com/mhucka/c0b82778417f38f7ae6ee7d051cec90a
  7 | #
  8 | # The procedure for using this is very simple:
  9 | #   1. Create a directory for your paper
 10 | #   2. Copy your paper's Markdown, bib, & other source files into the directory
 11 | #   3. Copy this Makefile into the directory
 12 | #   4. Update this Makefile's configuration variables (see below)
 13 | #   5. Run "make setup"
 14 | #   6. Run "make paper"
 15 | #
 16 | # To get an auto-refreshed preview while editing, open the PDF file in a PDF
 17 | # viewer that auto-refreshes when it detects changes to the PDF file (e.g.,
 18 | # Skim on macOS), and run "make auto" instead of "make paper".
 19 | # =============================================================================
 20 | 
 21 | # CONFIGURATION VARIABLES ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 22 | 
 23 | # The output PDF file and the bibliography .bib file are based on this name.
 24 | basename  := paper
 25 | 
 26 | # Set the following to the actual DOI for the archive of your software.
 27 | archive_doi := http://dx.doi.org/00.00000/zenodo.0000000
 28 | 
 29 | # The values of the following are irrelevant for a preview, and you can just
 30 | # leave them as-is. If you want to (re)create the PDF using final published
 31 | # values, then go ahead and update these once you know the appropriate values.
 32 | volume	  := 0
 33 | issue	  := 0
 34 | pages	  := 0
 35 | year	  := $(shell date +'%Y')
 36 | submitted := 00 January $(year)
 37 | accepted  := 00 January $(year)
 38 | 
 39 | # End of configuration section. You shouldn't have to change anything below.
 40 | 
 41 | # DEFAULT ACTION ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 42 | 
 43 | help:
 44 | 	@printf 'Available commands:\n\n'
 45 | 	@printf 'make\n'
 46 | 	@printf 'make help\n'
 47 | 	@printf '  Print this summary of available commands.\n\n'
 48 | 	@printf 'make setup\n'
 49 | 	@printf '  Download a copy of Whedon and do some other setup work.\n\n'
 50 | 	@printf 'make paper\n'
 51 | 	@printf '  Run pandoc to create a PDF file to preview the paper.\n\n'
 52 | 	@printf 'make auto\n'
 53 | 	@printf '  Watch input files for changes and auto-remake the paper.\n'
 54 | 
 55 | # MAIN CODE ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 56 | 
 57 | setup:
 58 | 	git clone --recursive git@github.com:openjournals/whedon.git
 59 | 	@which pandoc > /dev/null || echo \
 60 | 	  "\n\033[31m*** Cannot find pandoc -- please install it ***\033[0m\n"
 61 | 
 62 | input   := $(basename).md
 63 | output  := $(basename).pdf
 64 | bib     := $(basename).bib
 65 | title	:= $(shell grep title: $(input) | sed 's/title: *//' | tr -d "'")
 66 | authors := $(shell sed -n '/authors:/,/affiliations:/p' $(input) | grep name: |\
 67 | 	      sed 's/- name: *//' | paste -d, -s - | sed 's/,/, /g')
 68 | repo	:= $(shell git remote get-url origin |\
 69 | 	      sed 's|git@github.com:|https://github.com/|' | sed 's/\.git//')
 70 | 
 71 | paper $(output): check-have-pandoc check-whedon-exists $(input) $(bib) Makefile
 72 | 	pandoc -s $(input) --verbose \
 73 | 	-V title="$(title)" \
 74 | 	-V paper_title="$(title)" \
 75 | 	-V citation_author="$(authors)" \
 76 | 	-V formatted_doi="10.21105/joss.$(pages)" \
 77 | 	-V review_issue_url="https://github.com/openjournals/joss-reviews/issues/$(issue)" \
 78 | 	-V repository="$(repo)" \
 79 | 	-V archive_doi="$(archive_doi)" \
 80 | 	-V submitted="$(submitted)" \
 81 | 	-V published="$(accepted)" \
 82 | 	-V journal_name="Journal of Open Source Software" \
 83 | 	-V issue="$(issue)" \
 84 | 	-V volume="$(volume)" \
 85 | 	-V year="$(year)" \
 86 | 	-V page="$(pages)" \
 87 | 	-V graphics="true" \
 88 | 	-V logo_path="whedon/resources/joss/logo.png" \
 89 | 	-V geometry:margin=1in \
 90 | 	-o $(output) \
 91 | 	--pdf-engine=xelatex \
 92 | 	--citeproc \
 93 | 	--csl "whedon/resources/apa.csl" \
 94 | 	--template "whedon/resources/latex.template"
 95 | 
 96 | autorefresh auto: check-have-entr
 97 | 	((ls $(input) $(bib) Makefile | entr make $(output)) &)
 98 | 
 99 | # MISCELLANEOUS DIRECTIVES ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
100 | 
# Stop with an error if the pandoc program cannot be found.
check-have-pandoc:; @which pandoc > /dev/null
102 | 
103 | check-have-entr:; @which entr > /dev/null
104 | 
105 | check-whedon-exists:; @if [ ! -d whedon ]; then echo \
106 | 	   "\n\033[31m*** Could not find whedon -- run 'make setup' ***\033[0m\n";\
107 | 	    exit 1; fi;
108 | 
109 | .PHONY: check-have-pandoc check-have-entr check-whedon-exists
110 | 


--------------------------------------------------------------------------------
/pubs/joss/README.md:
--------------------------------------------------------------------------------
1 | JOSS paper
2 | ===========
3 | 
4 | This directory contains the source files for a short paper about Handprint in the Journal of Open Source Software.
5 | 


--------------------------------------------------------------------------------
/pubs/joss/notes.txt:
--------------------------------------------------------------------------------
1 | Read this for more good points:
2 | 
3 | https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1008390
4 | 


--------------------------------------------------------------------------------
/pubs/joss/paper.bib:
--------------------------------------------------------------------------------
 1 | @misc{AmazonInc.2022amazon,
 2 |   title = {Amazon {{Rekognition}} [{{Computer}} software]},
 3 |   author = {{Amazon, Inc.}},
 4 |   year = {2022},
 5 |   url = {https://aws.amazon.com/rekognition/}
 6 | }
 7 | 
 8 | @misc{AmazonInc.2022amazona,
 9 |   title = {Amazon {{Textract}} [{{Computer}} software]},
10 |   author = {{Amazon, Inc.}},
11 |   year = {2022},
12 |   url = {https://aws.amazon.com/textract/}
13 | }
14 | 
15 | @misc{Wikipediacontributors2012first,
16 |   title = {First {{Computer Bug}}},
17 |   author = {{Wikipedia contributors}},
18 |   year = {2012},
19 |   url = {https://commons.wikimedia.org/wiki/File:First_Computer_Bug,_1945.jpg},
20 |   urldate = {2022-03-28},
21 |   file = {/Users/mhucka/databases/zotero-bibliography/storage/FTVRRABT/Wikipedia contributors 2012 — Web Page — First Computer Bug.jpg}
22 | }
23 | 
24 | @misc{GoogleInc.2022googlea,
25 |   title = {Google {{Cloud Vision API}} [{{Computer}} software]},
26 |   author = {{Google, Inc.}},
27 |   year = {2022},
28 |   url = {https://cloud.google.com/vision/docs/handwriting}
29 | }
30 | 
31 | @misc{MicrosoftInc.2022microsoft,
32 |   title = {Microsoft {{Azure Computer Vision}} [{{Computer}} software]},
33 |   author = {{Microsoft, Inc.}},
34 |   year = {2022},
35 |   url = {https://azure.microsoft.com/en-us/services/cognitive-services/computer-vision/}
36 | }
37 | 
38 | @misc{Hucka2022handprint,
39 |   title = {Handprint user manual},
40 |   author = {Hucka, Michael},
41 |   year = {2022},
42 |   url = {https://caltechlibrary.github.io/handprint/}
43 | }
44 | 
45 | 
46 | 


--------------------------------------------------------------------------------
/pubs/joss/paper.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: 'Handprint: a program to explore and compare major cloud-based services for handwritten text recognition'
 3 | tags:
 4 | - handwritten text recognition
 5 | - optical character recognition
 6 | - machine learning
 7 | authors:
 8 | - name: Michael Hucka
 9 |   orcid: 0000-0001-9105-5960
10 |   affiliation: 1
11 | affiliations:
12 | - name: Caltech Library, California Institute of Technology, Pasadena, CA 91125, USA
13 |   index: 1
14 | date: 5 April 2022
15 | bibliography: paper.bib
16 | ---
17 | 
18 | # Summary
19 | 
20 | Handprint (_**Hand**written **p**age **r**ecognit**i**o**n** **t**est_) is a command-line application that can invoke cloud-based services to perform handwritten text recognition (HTR) on images of documents. It accepts images in various popular formats, sends them to the service providers, gathers the results, and then annotates copies of the images to show the results to the user. It currently supports HTR services from Amazon [@AmazonInc.2022amazon; @AmazonInc.2022amazona], Google [@GoogleInc.2022googlea], and Microsoft [@MicrosoftInc.2022microsoft], but its architecture is modular and could be extended to other services. Handprint is a command-line program written in Python and can run on macOS, Windows, and Linux computers.
21 | 
22 | 
23 | # Statement of need
24 | 
 25 | Several cloud computing companies have developed machine learning-based methods for handwritten text recognition (HTR) and offer them as on-demand services. These network-based services can be applied to images of document pages without the need for training on samples of handwriting. The results are remarkably good overall, but there are differences in quality and features between the different offerings. Comparing the results produced by the competing services is complicated by the fact that they each have unique application programming interfaces (APIs). The purpose of Handprint is to make comparisons simple and easy, without the need for users to learn how to program with the different APIs. With Handprint, users can easily process individual images, directories of images, and URLs pointing to images on remote servers without writing a line of code. If desired, users can also use Handprint in scripts as part of automated workflows.
26 | 
27 | 
28 | # Summary of Handprint usage
29 | 
30 | This section summarizes the user-accessible capabilities provided by Handprint.
31 | 
32 | ## Configuration
33 | 
34 | The only configuration necessary after installation is to run Handprint with a certain command-line option to store the user's account credentials for each cloud-based HTR service provider. The command needs to be run once for each desired provider, and thereafter, Handprint will use the account information automatically. The Handprint documentation at <https://caltechlibrary.github.io/handprint/> explains the simple file format in which the credentials need to be written.
35 | 
36 | 
37 | ## Basic features
38 | 
39 | Handprint can read a number of common image formats: JP2, JPEG, PDF, PNG, GIF, BMP, and TIFF. Image paths or URLs can be supplied to Handprint in any of the following ways: (a) one or more directory paths or one or more image file paths on the local disk, which will be interpreted as images---either individually or in directories---to be processed; (b) one or more URLs, which will be interpreted as network locations of image files to be processed; or (c) if given the `-f` command-line option (`/f` on Windows), a file containing either image paths or image URLs to be processed. When using URLs, Handprint first downloads the image found at the given URL(s) to a directory of the user's choosing on the local disk. No matter whether files or URLs, each item should be a single image of a document page containing text.
40 | 
41 | Handprint's basic features include the ability to display different kinds of bounding boxes, save the full raw results from HTR services as JSON or text files, and use multiple processor threads to speed up processing. For example, using one of the sample images found in Handprint's source directory, the following command,
42 | ```
43 | handprint --text-size 19 --display text,bb-line H96566k.jpg
44 | ```
 45 | will send, in parallel, the image file named `H96566k.jpg` to the four services currently supported (Amazon Rekognition, Amazon Textract, Google Cloud Vision, and Microsoft Azure Computer Vision). The output will be a file named `H96566k.handprint-all.png` with the contents shown in the figure below.
46 | 
 47 | ![**Figure 1**: Example of output from Handprint using default settings. (Source image obtained from Wikipedia [@Wikipediacontributors2012first].)](H96566k.handprint-all.png)
48 | 
49 | Users can also select a subset of services to use, and can opt to skip the creation of the overview grid image if they only need the other types of outputs that Handprint can produce.
50 | 
51 | 
52 | ## Advanced features
53 | 
54 | Handprint also includes additional, more advanced features. One is the ability to filter the displayed results by confidence scores, allowing users to see which words or other components have confidence values that meet or exceed a chosen threshold. Another is a facility to compare text results to expected (ground truth) text. The comparison algorithm has some novel capabilities, notably in how it can treat missing, extra, or transposed lines of text from the HTR results (a common difference between the outputs of different services).
55 | 
56 | 
57 | # Documentation
58 | 
59 | A detailed user manual is available as a GitHub Pages website at <https://caltechlibrary.github.io/handprint/>. Handprint also prints usage information to the terminal when given the command-line option `--help`.
60 | 
61 | 
62 | # Acknowledgments
63 | 
64 | The development of Handprint was supported by the Caltech Library. Handprint benefitted from feedback from several people, notably Tommy Keswick, Mariella Soprano, Peter Collopy and Stephen Davison of the Caltech Library. The [vector artwork](https://thenounproject.com/search/?q=hand&i=733265) of a hand used as a logo for Handprint was created by [Kevin](https://thenounproject.com/kevn/) for the [Noun Project](https://thenounproject.com).
65 | 
66 | 
67 | # References
68 | 


--------------------------------------------------------------------------------
/requirements-dev.txt:
--------------------------------------------------------------------------------
 1 | # =============================================================================
 2 | # @file    requirements-dev.txt
 3 | # @brief   Python dependencies for Handprint for development
 4 | # @created 2022-03-14
 5 | # @license Please see the file named LICENSE in the project directory
 6 | # @website https://github.com/caltechlibrary/handprint
 7 | # =============================================================================
 8 | 
 9 | -r requirements.txt
10 | 
11 | pytest-mock == 3.7.0
12 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | # =============================================================================
 2 | # @file    requirements.txt
 3 | # @brief   Python dependencies for Handprint for normal use
 4 | # @created 2018-10-23
 5 | # @license Please see the file named LICENSE in the project directory
 6 | # @website https://github.com/caltechlibrary/handprint
 7 | # =============================================================================
 8 | 
 9 | aenum                    == 3.1.0
10 | appdirs                  == 1.4.4
11 | boltons                  == 21.0.0
12 | boto3                    == 1.17.91
13 | bun                      == 0.0.8
14 | commonpy                 >= 1.9.1
15 | fastnumbers              == 3.1.0
16 | google-api-core          == 1.30.0
17 | google-api-python-client == 2.8.0
18 | google-auth              == 1.30.2
19 | google-auth-httplib2     == 0.1.0
20 | google-cloud             == 0.34.0
21 | google-cloud-vision      == 2.3.1
22 | googleapis-common-protos == 1.53.0
23 | grpcio                   == 1.44.0
24 | humanize                 >= 3.7.1
25 | imagesize                == 1.2.0
26 | matplotlib               == 3.4.2
27 | numpy                    == 1.22.2
28 | Pillow                   == 9.0.1
29 | plac                     == 1.3.4
30 | psutil                   == 5.8.0
31 | PyMuPDF                  == 1.19.6
32 | requests                 == 2.25.0
33 | rich                     == 12.0.1
34 | setuptools               >= 62.1.0
35 | sidetrack                == 2.0.0
36 | StringDist               == 1.0.9
37 | textdistance             == 4.2.2
38 | urllib3                  == 1.26.5
39 | validator-collection     == 1.5.0
40 | wheel                    == 0.36.2
41 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
 1 | # =============================================================================
 2 | # @file    setup.cfg
 3 | # @brief   Package metadata and PyPI configuration
 4 | # @author  Michael Hucka <mhucka@caltech.edu>
 5 | # @license Please see the file named LICENSE in the project directory
 6 | # @website https://github.com/caltechlibrary/handprint
 7 | # =============================================================================
 8 | 
 9 | [metadata]
10 | name          = handprint
11 | version       = 1.6.0
12 | description   = Run handwritten text recognition services on images of documents
13 | author        = Michael Hucka
14 | author_email  = mhucka@caltech.edu
15 | license       = BSD 3-clause
16 | license_files = LICENSE
17 | url           = https://github.com/caltechlibrary/handprint
18 | # The remaining items below are used by PyPI.
19 | project_urls  =
20 |   Source Code = https://github.com/caltechlibrary/handprint
21 |   Bug Tracker = https://github.com/caltechlibrary/handprint/issues
22 | keywords      = HTR, OCR, handwritten text recognition, machine learning
23 | classifiers   =
24 |   Development Status :: 5 - Production/Stable
25 |   Environment :: Console
26 |   License :: OSI Approved :: BSD License
27 |   Intended Audience :: Science/Research
28 |   Operating System :: MacOS :: MacOS X
29 |   Operating System :: Microsoft :: Windows
30 |   Operating System :: POSIX
31 |   Operating System :: POSIX :: Linux
32 |   Operating System :: Unix
33 |   Programming Language :: Python
34 |   Programming Language :: Python :: 3.8
35 |   Topic :: Scientific/Engineering :: Artificial Intelligence
36 |   Topic :: Scientific/Engineering :: Image Recognition
37 |   Topic :: Text Processing :: Linguistic
38 | long_description = file:README.md
39 | long_description_content_type = text/markdown
40 | 
41 | [options]
42 | packages = find:
43 | zip_safe = False
44 | python_requires = >= 3.8
45 | 
46 | [options.entry_points]
47 | console_scripts = 
48 |   handprint = handprint.__main__:console_scripts_main
49 | 
50 | [check-manifest]
51 | ignore =
52 |     .graphics/sample-annotated-image.png
53 |     .graphics/handprint-screencast.gif
54 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # =============================================================================
 3 | # @file    setup.py
 4 | # @brief   Installation setup file
 5 | # @author  Michael Hucka <mhucka@caltech.edu>
 6 | # @license Please see the file named LICENSE in the project directory
 7 | # @website https://github.com/caltechlibrary/handprint
 8 | #
 9 | # Note: configuration metadata is maintained in setup.cfg.  This file exists
10 | # primarily to hook in setup.cfg and requirements.txt.
11 | #
12 | # =============================================================================
13 | 
14 | import os
15 | from   os.path import exists, join, abspath, dirname
16 | from   setuptools import setup
17 | 
# Absolute path of the directory holding this setup.py, so requirements.txt
# is located correctly regardless of the current working directory.
here = abspath(dirname(__file__))

# Read the install requirements from requirements.txt when it is present.
# Blank lines and comment lines (such as the file's header block) are dropped
# so that only actual requirement specifiers are handed to setuptools.
requirements = []
requirements_file = join(here, 'requirements.txt')
if exists(requirements_file):
    with open(requirements_file, encoding = 'utf-8') as f:
        requirements = [line.strip() for line in f
                        if line.strip() and not line.lstrip().startswith('#')]

# The remaining package metadata is maintained in setup.cfg.
setup(
    setup_requires = ['wheel'],
    install_requires = requirements,
)
29 | 


--------------------------------------------------------------------------------
/tests/data/README.md:
--------------------------------------------------------------------------------
1 | Test images
2 | ===========
3 | 
4 | This directory contains a small number of test images from various sources.  Note that this is _not_ intended as a complete set of test cases; it is only present for demonstration and testing of Handprint.
5 | 
6 | For a larger collection of test cases, please see the [htr-test-cases](https://github.com/caltechlibrary/htr-test-cases) repository that we also make available on GitHub.
7 | 


--------------------------------------------------------------------------------
/tests/data/bad-images/corrupted-image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/tests/data/bad-images/corrupted-image.png


--------------------------------------------------------------------------------
/tests/data/caltech-archives/caltech-archives-urls.txt:
--------------------------------------------------------------------------------
1 | https://digital.archives.caltech.edu/adore-djatoka//resolver?rft_id=https%3A%2F%2Fdigital.archives.caltech.edu%2Fislandora%2Fobject%2Fhale%253A41656%2Fdatastream%2FJP2%2Fview%3Ftoken%3D1c8de85be701a28063d1fbe7b6f4c3849d95c0cabbb79cbe8eb077daf219a43d&url_ver=Z39.88-2004&svc_id=info%3Alanl-repo%2Fsvc%2FgetRegion&svc_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajpeg2000&svc.format=image%2Fjpeg&svc.level=4&svc.rotate=0
2 | https://digital.archives.caltech.edu/adore-djatoka//resolver?rft_id=https%3A%2F%2Fdigital.archives.caltech.edu%2Fislandora%2Fobject%2Fhale%253A44634%2Fdatastream%2FJP2%2Fview%3Ftoken%3D1b9e848d4f1ddae21fde96820b6f6592521cb7588b3e54bb468762a61be1ded2&url_ver=Z39.88-2004&svc_id=info%3Alanl-repo%2Fsvc%2FgetRegion&svc_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajpeg2000&svc.format=image%2Fjpeg&svc.level=4&svc.rotate=0
3 | 
4 | 


--------------------------------------------------------------------------------
/tests/data/caltech-archives/glaser/DAG_5_1_6 1952-1957 Notebook VI p2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/tests/data/caltech-archives/glaser/DAG_5_1_6 1952-1957 Notebook VI p2.jpg


--------------------------------------------------------------------------------
/tests/data/caltech-archives/glaser/DAG_5_2_1 1950-1953 notebook VIII p7.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/tests/data/caltech-archives/glaser/DAG_5_2_1 1950-1953 notebook VIII p7.jpg


--------------------------------------------------------------------------------
/tests/data/caltech-archives/glaser/README.md:
--------------------------------------------------------------------------------
 1 | Origin and conditions governing use
 2 | ===================================
 3 | 
 4 | The images in this sample came from the Caltech Archives' [Donald A. Glaser Digital Collection](http://glaser.library.caltech.edu).
 5 | 
 6 | Here are some notes about specific documents:
 7 | 
 8 | * <a href="DAG_5_2_1 1950-1953 notebook VIII p7.jpg">DAG_5_2_1 1950-1953 notebook VIII p7.jpg</a>: Notebook VIII, dated 1950 May - 1953 November. Part of: Donald A Glaser Papers. Series 5: Bubble Chamber (Ann Arbor); Subseries 1: Notebooks; Box 2, Folder 1.
 9 | 
10 | * <a href="DAG_5_1_6 1952-1957 Notebook VI p2.jpg">DAG_5_1_6 1952-1957 Notebook VI p2.jpg</a>: Notebook VI - High Energy Physics Colloquium (21 items), ca. 1952-1957, 1991. Part of: Donald A Glaser Papers. Series 5: Bubble Chamber (Ann Arbor); Subseries 1: Notebooks; Box 1, Folder 6.
11 | 


--------------------------------------------------------------------------------
/tests/data/caltech-archives/glaser/dag-285-DAG_1_1_8_0003.jp2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/tests/data/caltech-archives/glaser/dag-285-DAG_1_1_8_0003.jp2


--------------------------------------------------------------------------------
/tests/data/caltech-archives/glaser/dag-304-DAG_1_1_8_0029.jp2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/tests/data/caltech-archives/glaser/dag-304-DAG_1_1_8_0029.jp2


--------------------------------------------------------------------------------
/tests/data/fragments/f1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/tests/data/fragments/f1.png


--------------------------------------------------------------------------------
/tests/data/fragments/f2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/tests/data/fragments/f2.png


--------------------------------------------------------------------------------
/tests/data/fragments/f6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/tests/data/fragments/f6.png


--------------------------------------------------------------------------------
/tests/data/fragments/f7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/tests/data/fragments/f7.png


--------------------------------------------------------------------------------
/tests/data/public-domain/LOC-urls.txt:
--------------------------------------------------------------------------------
1 | https://tile.loc.gov/storage-services/master/mss/mss41335/006/0400/0441.tif
2 | https://tile.loc.gov/storage-services/master/mss/mss41335/006/0400/0442.tif
3 | 


--------------------------------------------------------------------------------
/tests/data/public-domain/images/AGBell_Notebook.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/tests/data/public-domain/images/AGBell_Notebook.jpg


--------------------------------------------------------------------------------
/tests/data/public-domain/images/H96566k.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/tests/data/public-domain/images/H96566k.jpg


--------------------------------------------------------------------------------
/tests/data/public-domain/images/README.md:
--------------------------------------------------------------------------------
 1 | Origin and conditions governing use
 2 | ===================================
 3 | 
 4 | The following are the sources of the images in this directory.
 5 | 
 6 | * [H96566k.jpg](https://upload.wikimedia.org/wikipedia/commons/8/8a/H96566k.jpg) &ndash; Courtesy of the Naval Surface Warfare Center, Dahlgren, VA., 1988. [Public domain], via Wikimedia Commons.  Picture of a notebook page showing the first "Computer Bug" Moth found trapped between points in a relay of the Mark II Aiken Relay Calculator while it was being tested at Harvard University, 9 September 1947. The operators affixed the moth to the computer log, with the entry: "First actual case of bug being found".
 7 | 
 8 | * [AGBell_Notebook.jpg](https://upload.wikimedia.org/wikipedia/commons/0/0c/AGBell_Notebook.jpg) &ndash; pages 40-1 of Alexander Graham Bell's unpublished laboratory notebook (1875-76), describing first successful experiment with the telephone.
 9 | 
10 | * [mabel-h-bell-to-eliza-s-bell-sept-28-1879-1-1600.jpg](mabel-h-bell-to-eliza-s-bell-sept-28-1879-1-1600.jpg) &ndash; downloaded from [Picryl](https://picryl.com/media/letter-from-mabel-hubbard-bell-to-eliza-symonds-bell-september-28-1879-1) on 2019-08-11. Original source is given as Library of Congress.  No copyright restrictions.
11 | 
12 | * [clara-barton-life-of-my-childhood-p90.jpg](clara-barton-life-of-my-childhood-p90.jpg) &ndash; downloaded from [Picryl](https://picryl.com/media/clara-barton-papers-speeches-and-writings-file-1849-1947-books-the-life-of-71) on 2019-08-11.  Original source is given as Library of Congress.  No copyright restrictions.
13 | 


--------------------------------------------------------------------------------
/tests/data/public-domain/images/clara-barton-life-of-my-childhood-p90.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/tests/data/public-domain/images/clara-barton-life-of-my-childhood-p90.jpg


--------------------------------------------------------------------------------
/tests/data/public-domain/images/mabel-h-bell-to-eliza-s-bell-sept-28-1879-1-1600.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caltechlibrary/handprint/1b84db7eeb40dfaa5e4ced9b2cfdb021c1a1ba68/tests/data/public-domain/images/mabel-h-bell-to-eliza-s-bell-sept-28-1879-1-1600.jpg


--------------------------------------------------------------------------------
/tests/test_comparison.py:
--------------------------------------------------------------------------------
 1 | from   commonpy.file_utils import delete_existing
 2 | import os
 3 | import os.path
 4 | import pytest
 5 | import sys
 6 | import tempfile
 7 | from   time import time
 8 | 
 9 | try:
10 |     thisdir = os.path.dirname(os.path.abspath(__file__))
11 |     sys.path.append(os.path.join(thisdir, '..'))
12 | except:
13 |     sys.path.append('..')
14 | 
15 | from handprint.comparison import *
16 | 
17 | 
def test_text_comparison():
    """Check the tab-separated report from text_comparison() on tiny inputs."""
    # Two identical one-character strings: zero errors and a CER of 0.00.
    same_report = text_comparison('a', 'a')
    same_expected = 'Errors\tCER (%)\tExpected text\tReceived text\n0\t0.00\ta\ta\nTotal errors\t\t\t\n0\t\t\t'
    assert same_report == same_expected

    # Two different one-character strings: one error and a CER of 100.00.
    diff_report = text_comparison('a', 'b')
    diff_expected = 'Errors\tCER (%)\tExpected text\tReceived text\n1\t100.00\tb\t\nTotal errors\t\t\t\n1\t\t\t'
    assert diff_report == diff_expected
23 | 


--------------------------------------------------------------------------------
/tests/test_exceptions.py:
--------------------------------------------------------------------------------
 1 | from   contextlib import redirect_stdout
 2 | import io
 3 | import os
 4 | import plac
 5 | import pytest
 6 | import sys
 7 | from   time import time
 8 | 
 9 | try:
10 |     thisdir = os.path.dirname(os.path.abspath(__file__))
11 |     sys.path.append(os.path.join(thisdir, '..'))
12 | except:
13 |     sys.path.append('..')
14 | 
15 | from handprint.__main__ import main
16 | from handprint.exceptions import *
17 | from handprint.exit_codes import ExitCode
18 | 
def test_exceptions():
    """Check that InternalError is a HandprintException that keeps its message."""
    with pytest.raises(Exception) as ex_info:
        raise InternalError('foo')

    raised = ex_info.value
    assert isinstance(raised, HandprintException)
    assert str(raised) == 'foo'
25 | 
26 | 
def test_bad_cli_arg():
    """Check that running main() with `-s bogus` exits with the bad_arg code."""
    with pytest.raises(SystemExit) as ex_info:
        # The call is expected to raise SystemExit, so there is no return
        # value to assert on; pytest.raises fails the test if nothing raises.
        plac.call(main, ['-s', 'bogus'])

    assert ex_info.type == SystemExit
    assert ex_info.value.code == int(ExitCode.bad_arg)
33 | 
34 | 
def test_cli_arg_version():
    """Check that `-V` prints a version line and exits with the success code."""
    captured = io.StringIO()
    # Capture stdout while the command runs; the exit is expected to arrive
    # as SystemExit.
    with pytest.raises(SystemExit) as ex_info, redirect_stdout(captured):
        plac.call(main, ['-V'])

    printed = captured.getvalue()
    assert printed.startswith('handprint version')
    assert ex_info.type == SystemExit
    assert ex_info.value.code == int(ExitCode.success)
44 | 
45 | 
def test_cli_arg_services():
    """Check that `-l` prints the known-services list and exits successfully."""
    captured = io.StringIO()
    # Capture stdout while the command runs; the exit is expected to arrive
    # as SystemExit.
    with pytest.raises(SystemExit) as ex_info, redirect_stdout(captured):
        plac.call(main, ['-l'])

    printed = captured.getvalue()
    assert printed.startswith('Known services')
    assert ex_info.type == SystemExit
    assert ex_info.value.code == int(ExitCode.success)
55 | 


--------------------------------------------------------------------------------
/tests/test_exit_codes.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import pytest
 3 | import sys
 4 | from   time import time
 5 | 
 6 | try:
 7 |     thisdir = os.path.dirname(os.path.abspath(__file__))
 8 |     sys.path.append(os.path.join(thisdir, '..'))
 9 | except:
10 |     sys.path.append('..')
11 | 
12 | from handprint.exit_codes import *
13 | 
def test_exceptions():
    """Check the integer value and the meaning text of ExitCode.success."""
    ok = ExitCode.success
    assert int(ok) == 0
    assert ok.meaning == "success -- program completed normally"
17 | 


--------------------------------------------------------------------------------
/tests/test_images.py:
--------------------------------------------------------------------------------
 1 | from   commonpy.file_utils import delete_existing
 2 | import os
 3 | import os.path
 4 | import pytest
 5 | import sys
 6 | import tempfile
 7 | from   time import time
 8 | 
 9 | try:
10 |     thisdir = os.path.dirname(os.path.abspath(__file__))
11 |     sys.path.append(os.path.join(thisdir, '..'))
12 | except:
13 |     sys.path.append('..')
14 | 
15 | from handprint.images import *
16 | 
17 | 
def test_format_name():
    """Check the names returned by canonical_format_name() for three inputs."""
    # Each pair is (name passed in, name expected back).
    cases = [
        ('jpeg', 'jpeg'),
        ('jpg',  'jpeg'),
        ('TIF',  'tiff'),
    ]
    for given, expected in cases:
        assert canonical_format_name(given) == expected
22 | 
23 | 
def test_image_size():
    """Check image_size() against the expected values for two sample fragments."""
    # Use os.path explicitly instead of the bare name `path`, which this
    # module never imports itself and which is only in scope if the wildcard
    # import of handprint.images happens to expose it.
    fragments = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                             'data', 'fragments')
    assert image_size(os.path.join(fragments, 'f1.png')) == 15553
    assert image_size(os.path.join(fragments, 'f2.png')) == 8613
28 | 
29 | 
def test_image_dimensions():
    """Check image_dimensions() against the expected pairs for two fragments."""
    # Use os.path explicitly instead of the bare name `path`, which this
    # module never imports itself and which is only in scope if the wildcard
    # import of handprint.images happens to expose it.
    fragments = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                             'data', 'fragments')
    assert image_dimensions(os.path.join(fragments, 'f1.png')) == (340, 106)
    assert image_dimensions(os.path.join(fragments, 'f2.png')) == (228, 60)
34 | 
35 | 
def test_reduced_image_size():
    """Check reduced_image_size() on fragment f1 with a limit of 1000.

    The call is expected to return a pair whose first element is a string and
    whose second element is None, and image_dimensions() of the temporary
    destination file is expected to be (22, 7) afterwards.
    """
    # mkstemp() returns an already-open OS-level descriptor along with the
    # path.  Close it immediately so it is not leaked and so the file can be
    # rewritten and deleted on platforms that lock open files.  Leaving out
    # `dir` uses the platform's default temporary directory rather than a
    # hard-coded '/tmp'.
    fd, tmpfile = tempfile.mkstemp(suffix = '.png')
    os.close(fd)
    try:
        thisdir = os.path.dirname(os.path.abspath(__file__))
        f1_file = os.path.join(thisdir, 'data', 'fragments', 'f1.png')
        (a, b) = reduced_image_size(f1_file, tmpfile, 1000)
        assert isinstance(a, str)
        assert b is None
        assert image_dimensions(tmpfile) == (22, 7)
    finally:
        # Remove the temporary file even if an assertion above fails.
        delete_existing(tmpfile)
45 | 
46 | 
def test_reduced_image_dimensions():
    """Check reduced_image_dimensions() on fragment f1 with limits 100 and 100.

    The call is expected to return a pair whose first element is a string and
    whose second element is None, and image_dimensions() of the temporary
    destination file is expected to be (100, 31) afterwards.
    """
    # mkstemp() returns an already-open OS-level descriptor along with the
    # path.  Close it immediately so it is not leaked and so the file can be
    # rewritten and deleted on platforms that lock open files.  Leaving out
    # `dir` uses the platform's default temporary directory rather than a
    # hard-coded '/tmp'.
    fd, tmpfile = tempfile.mkstemp(suffix = '.png')
    os.close(fd)
    try:
        thisdir = os.path.dirname(os.path.abspath(__file__))
        f1_file = os.path.join(thisdir, 'data', 'fragments', 'f1.png')
        (a, b) = reduced_image_dimensions(f1_file, tmpfile, 100, 100)
        assert isinstance(a, str)
        assert b is None
        assert image_dimensions(tmpfile) == (100, 31)
    finally:
        # Remove the temporary file even if an assertion above fails.
        delete_existing(tmpfile)
56 | 
57 | 
def test_converted_image():
    """Check converted_image() when converting fragment f1 to 'tif'.

    The call is expected to return a pair whose first element is a string and
    whose second element is None.
    """
    # mkstemp() returns an already-open OS-level descriptor along with the
    # path.  Close it immediately so it is not leaked and so the file can be
    # rewritten and deleted on platforms that lock open files.  Leaving out
    # `dir` uses the platform's default temporary directory rather than a
    # hard-coded '/tmp'.
    fd, tmpfile = tempfile.mkstemp(suffix = '.tiff')
    os.close(fd)
    try:
        thisdir = os.path.dirname(os.path.abspath(__file__))
        f1_file = os.path.join(thisdir, 'data', 'fragments', 'f1.png')
        (a, b) = converted_image(f1_file, 'tif', tmpfile)
        assert isinstance(a, str)
        assert b is None
    finally:
        # Remove the temporary file even if an assertion above fails.
        delete_existing(tmpfile)
66 | 


--------------------------------------------------------------------------------