├── .circleci ├── config.for-lab.yml └── config.yml ├── .dockerignore ├── .gitattributes ├── .gitignore ├── .pylintrc ├── Dockerfile ├── README.md ├── admin ├── endpoint_tester │ ├── app.py │ ├── endpoints.txt │ ├── images.txt │ ├── inv_images.txt │ ├── make_paths_txt.py │ ├── remote_images.txt │ └── run_siege.sh ├── handwriting_data_gathering │ ├── output_markdown.py │ ├── print.pdf │ ├── run.sh │ └── template.md ├── outstanding_tasks.md ├── readme.md ├── tasks │ ├── lab_specific_files.yml │ ├── print_repo_structure.sh │ ├── subset_repo_for_labs.py │ └── subset_repo_for_labs.sh └── wandb_hub │ ├── Dockerfile │ └── readme.md ├── api ├── Dockerfile ├── __init__.py ├── app.py ├── requirements.txt └── tests │ └── test_app.py ├── data └── raw │ ├── emnist │ ├── metadata.toml │ └── readme.md │ ├── fsdl_handwriting │ ├── fsdl_handwriting.json │ ├── metadata.toml │ └── readme.md │ └── iam │ ├── metadata.toml │ └── readme.md ├── environment.yml ├── evaluation ├── evaluate_character_predictor.py └── evaluate_line_predictor.py ├── instructions ├── editor.md ├── lab1.md ├── lab2.md ├── lab3.md ├── lab4.md ├── lab5.md ├── lab6.md ├── lab7.md ├── lab8.md ├── lab8_notes.md ├── lab9.md ├── lab9_aws_and_monitoring.md ├── project_structure.md ├── readme.md ├── setup.md └── setup_extra.md ├── notebooks ├── 01-look-at-emnist.ipynb ├── 01b-cnn-for-emnist.ipynb ├── 02-look-at-emnist-lines.ipynb ├── 02b-cnn-for-simple-emnist-lines.ipynb ├── 03-look-at-iam-lines.ipynb ├── 04-look-at-iam-paragraphs.ipynb ├── 04b-look-at-line-detector-predictions.ipynb ├── 05-look-at-fsdl-handwriting.ipynb ├── archive │ ├── 00-download-emnist.ipynb │ ├── 02-train-emnist-mlp.ipynb │ ├── 04-line-cnn.ipynb │ ├── 05-line-lstm.ipynb │ ├── 06-line-lstm-with-ctc.ipynb │ ├── xx-all-in-one.ipynb │ ├── xx-ctc-loss.ipynb │ ├── xx-fc-vs-1x1.ipynb │ └── xx-re-pad-iam-lines.ipynb ├── future_work │ └── 03b-generate-iam-lines.ipynb └── line detection experiments │ ├── 0-iam-pages.ipynb │ ├── 4-experimenting-with-line-prediction-on-synthetic-data.ipynb │ ├── 5-predicting-iam-line-locations.ipynb │ ├── 5b-prediction-mse.ipynb │ ├── 5c-prediction-just-y-coordinates.ipynb │ ├── 5d-predicting-iam-line-locations-dual-channel-unet.ipynb │ └── paragraph_text_recognizer_debug.ipynb ├── pyproject.toml ├── requirements-dev.in ├── requirements-dev.txt ├── requirements.in ├── requirements.txt ├── setup.cfg ├── tasks ├── build_api_docker.sh ├── clean.sh ├── format.sh ├── lint.sh ├── prepare_sample_experiments.sh ├── run_api_docker.sh ├── sync_requirements.sh ├── test_api.sh ├── test_functionality.sh ├── test_validation.sh ├── train_character_predictor.sh ├── train_line_detector.sh ├── train_lstm_line_predictor.sh ├── train_lstm_line_predictor_on_iam.sh ├── update_fsdl_paragraphs_metadata.sh └── update_requirements.sh ├── text_recognizer ├── __init__.py ├── character_predictor.py ├── datasets │ ├── __init__.py │ ├── dataset.py │ ├── dataset_sequence.py │ ├── emnist_dataset.py │ ├── emnist_essentials.json │ ├── emnist_lines_dataset.py │ ├── fsdl_handwriting_dataset.py │ ├── iam_dataset.py │ ├── iam_lines_dataset.py │ ├── iam_paragraphs_dataset.py │ └── sentence_generator.py ├── line_predictor.py ├── models │ ├── __init__.py │ ├── base.py │ ├── character_model.py │ ├── line_detector_model.py │ ├── line_model.py │ └── line_model_ctc.py ├── networks │ ├── __init__.py │ ├── ctc.py │ ├── fcn.py │ ├── lenet.py │ ├── line_cnn_all_conv.py │ ├── line_lstm_ctc.py │ ├── misc.py │ └── mlp.py ├── paragraph_text_recognizer.py ├── tests │ ├── __init__.py │ ├── support │ │ ├── 
create_emnist_lines_support_files.py │ │ ├── create_emnist_support_files.py │ │ ├── create_iam_lines_support_files.py │ │ ├── emnist │ │ │ ├── 8.png │ │ │ ├── U.png │ │ │ └── e.png │ │ ├── emnist_lines │ │ │ ├── Corsi left for.png │ │ │ ├── do that In.png │ │ │ └── or if used the results.png │ │ ├── iam_lines │ │ │ ├── He rose from his breakfast-nook bench.png │ │ │ ├── and came into the livingroom, where.png │ │ │ └── his entrance. He came, almost falling.png │ │ └── iam_paragraphs │ │ │ └── a01-000u-cropped.jpg │ ├── test_character_predictor.py │ ├── test_line_predictor.py │ └── test_paragraph_text_recognizer.py ├── util.py └── weights │ ├── CharacterModel_EmnistDataset_mlp_weights.h5 │ ├── LineDetectorModel_IamParagraphsDataset_fcn_weights.h5 │ ├── LineModelCtc_EmnistLinesDataset_line_lstm_ctc_weights.h5 │ └── LineModelCtc_IamLinesDataset_line_lstm_ctc_weights.h5 ├── training ├── __init__.py ├── experiments │ ├── cnn.json │ ├── lstm_ctc.json │ └── sample.json ├── gpu_manager.py ├── prepare_experiments.py ├── run_experiment.py ├── run_sweep.py ├── sweep_emnist.yaml ├── sweep_iam.yaml ├── update_metadata.py └── util.py └── wandb └── settings /.circleci/config.for-lab.yml: -------------------------------------------------------------------------------- 1 | # Python CircleCI 2.0 configuration file 2 | # 3 | # Check https://circleci.com/docs/2.0/language-python/ for more details 4 | # 5 | version: 2 6 | jobs: 7 | build: 8 | docker: 9 | - image: circleci/python:3.7 10 | 11 | steps: 12 | - checkout 13 | 14 | - restore_cache: 15 | keys: 16 | - cache-{{ checksum "requirements.txt" }}-{{ checksum "requirements-dev.txt" }} 17 | 18 | - run: 19 | name: Install Git LFS 20 | command: | 21 | curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash 22 | sudo apt-get install git-lfs 23 | git lfs install 24 | 25 | - run: 26 | name: Pull LFS Files 27 | command: git lfs pull 28 | 29 | - run: 30 | name: Install Shellcheck 31 | command: | 32 | curl -OL https://storage.googleapis.com/shellcheck/shellcheck-stable.linux.x86_64.tar.xz 33 | tar xf shellcheck-stable.linux.x86_64.tar.xz 34 | sudo mv shellcheck-stable/shellcheck /usr/local/bin 35 | working_directory: /tmp/shellcheck 36 | 37 | - run: 38 | name: install dependencies 39 | command: | 40 | sed -i 's/tensorflow==/tensorflow-cpu==/' requirements.txt 41 | pip install -r requirements.txt 42 | pip install -r requirements-dev.txt 43 | 44 | - save_cache: 45 | key: cache-{{ checksum "requirements.txt" }}-{{ checksum "requirements-dev.txt" }} 46 | paths: 47 | - ~/.local 48 | 49 | - run: 50 | name: run linting 51 | when: always 52 | command: | 53 | cd lab8 && PYTHONPATH=. ./tasks/lint.sh 54 | 55 | - run: 56 | name: run prediction tests 57 | when: always 58 | command: | 59 | cd lab8 && PYTHONPATH=. pytest -s text_recognizer/tests/* 60 | 61 | - run: 62 | name: run evaluation tests 63 | command: | 64 | cd lab8 && PYTHONPATH=. 
pytest -s evaluation/* 65 | 66 | - store_artifacts: 67 | path: test-reports 68 | destination: test-reports 69 | -------------------------------------------------------------------------------- /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | # Python CircleCI 2.0 configuration file 2 | # 3 | # Check https://circleci.com/docs/2.0/language-python/ for more details 4 | # 5 | version: 2 6 | jobs: 7 | build: 8 | docker: 9 | - image: circleci/python:3.7 10 | 11 | steps: 12 | - checkout 13 | 14 | - restore_cache: 15 | keys: 16 | - cache-{{ checksum "requirements.txt" }}-{{ checksum "requirements-dev.txt" }} 17 | 18 | - run: 19 | name: Install Git LFS 20 | command: | 21 | curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash 22 | sudo apt-get install git-lfs 23 | git lfs install 24 | 25 | - run: 26 | name: Pull LFS Files 27 | command: git lfs pull 28 | 29 | - run: 30 | name: Install Shellcheck 31 | command: | 32 | curl -OL https://storage.googleapis.com/shellcheck/shellcheck-stable.linux.x86_64.tar.xz 33 | tar xf shellcheck-stable.linux.x86_64.tar.xz 34 | sudo mv shellcheck-stable/shellcheck /usr/local/bin 35 | working_directory: /tmp/shellcheck 36 | 37 | - run: 38 | name: install dependencies 39 | command: | 40 | sed -i 's/tensorflow==/tensorflow-cpu==/' requirements.txt 41 | pip install -r requirements.txt 42 | pip install -r requirements-dev.txt 43 | 44 | - save_cache: 45 | key: cache-{{ checksum "requirements.txt" }}-{{ checksum "requirements-dev.txt" }} 46 | paths: 47 | - ~/.local 48 | 49 | - run: 50 | name: run linting 51 | when: always 52 | command: | 53 | PYTHONPATH=. ./tasks/lint.sh 54 | 55 | - run: 56 | name: run prediction tests 57 | when: always 58 | command: | 59 | PYTHONPATH=. pytest -s text_recognizer/tests/* 60 | 61 | - run: 62 | name: run evaluation tests 63 | command: | 64 | PYTHONPATH=. 
pytest -s evaluation/* 65 | 66 | - store_artifacts: 67 | path: test-reports 68 | destination: test-reports 69 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | * 2 | !api 3 | !text_recognizer 4 | !requirements* 5 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.png filter=lfs diff=lfs merge=lfs -text 2 | *.jpg filter=lfs diff=lfs merge=lfs -text 3 | *.h5 filter=lfs diff=lfs merge=lfs -text 4 | data/**/*.json filter=lfs diff=lfs merge=lfs -text 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Data 2 | data/processed 3 | data/interim 4 | data/raw/emnist/matlab* 5 | data/raw/fsdl_handwriting/pages 6 | data/raw/iam/iamdb 7 | data/raw/iam/iamdb.zip 8 | data/raw/nltk 9 | 10 | # Editors 11 | .vscode 12 | 13 | # Node 14 | node_modules 15 | 16 | # Python 17 | __pycache__ 18 | .pytest_cache 19 | .ipynb_checkpoints 20 | 21 | # Distribution / packaging 22 | .Python 23 | env/ 24 | build/ 25 | develop-eggs/ 26 | dist/ 27 | downloads/ 28 | eggs/ 29 | .eggs/ 30 | lib/ 31 | lib64/ 32 | parts/ 33 | sdist/ 34 | var/ 35 | *.egg-info/ 36 | .installed.cfg 37 | *.egg 38 | 39 | # W&B 40 | wandb-debug.log 41 | wandb/* 42 | !wandb/settings 43 | 44 | # Misc 45 | .DS_Store 46 | _labs 47 | logs 48 | .mypy_cache 49 | -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [MESSAGES CONTROL] 2 | 3 | # Disable the message(s) with the given id(s). 4 | # E1130 - invalid-unary-operand-type false positive https://github.com/PyCQA/pylint/issues/1472 5 | # E1136 - unsubscriptable (unsubscriptable-object) - Pylint is failing to infer correct type from astroid https://github.com/PyCQA/pylint/issues/2849 6 | # R0801 - similar lines across files 7 | # W0511 - TODO comments 8 | # W1202 - logging-format-interpolation - Behavior barring fstrings in logging https://github.com/PyCQA/pylint/issues/2395 9 | # missing-function-dosctring: docstyle handles 10 | # bad-continuation: disagrees with black formatter 11 | disable=E1130,E1136,R0801,W0511,W1202,missing-function-docstring,bad-continuation 12 | # LAST AUDITED: 2019-01-09 13 | 14 | [MASTER] 15 | 16 | # A comma-separated list of package or module names from where C extensions may 17 | # be loaded. Extensions are loading into the active Python interpreter and may 18 | # run arbitrary code 19 | extension-pkg-whitelist=numpy 20 | 21 | [TYPECHECK] 22 | 23 | # List of module names for which member attributes should not be checked 24 | # (useful for modules/projects where namespaces are manipulated during runtime 25 | # and thus existing member attributes cannot be deduced by static analysis. It 26 | # supports qualified module names, as well as Unix pattern matching. 27 | ignored-modules=cv2,numpy,tensorflow 28 | 29 | # List of classes names for which member attributes should not be checked 30 | # (useful for classes with attributes dynamically set). This supports can work 31 | # with qualified names. 
32 | ignored-classes=cv2,numpy,tensorflow 33 | 34 | [BASIC] 35 | 36 | # Good variable names which should always be accepted, separated by a comma 37 | good-names = _, e, f, fn, i, j, k, n, N, m, M, D, p, t, v, x, X, y, Y, w, h, W, H, x1, x2, y1, y2, ax, df 38 | 39 | # Regular expression which should only match correct function names 40 | function-rgx=[a-z_][a-z0-9_]{2,70}$ 41 | 42 | # Regular expression which should only match correct method names 43 | method-rgx=[a-z_][a-z0-9_]{2,70}$ 44 | 45 | [FORMAT] 46 | 47 | # Maximum number of characters on a single line. 48 | max-line-length = 120 49 | 50 | [DESIGN] 51 | # Minimum number of public methods for a class (see R0903). 52 | min-public-methods = 0 53 | 54 | # Maximum number of attributes for a class (see R0902). 55 | max-attributes = 15 56 | 57 | max-locals = 18 58 | 59 | max-args = 8 60 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:18.04 2 | 3 | RUN apt-get update 4 | RUN apt-get install -y python3-pip 5 | RUN pip3 install --upgrade pip 6 | 7 | # Copy the requirements files into the image so pip can install them 8 | COPY requirements.txt requirements-dev.txt ./ 9 | RUN pip install -r requirements.txt 10 | RUN pip install -r requirements-dev.txt 11 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # FSDL Text Recognizer Project 2 | 3 | This is the admin version of the Full Stack Deep Learning project. 4 | 5 | The instructions that students will see start in [Lab 1 Instructions](instructions/lab1.md). 6 | 7 | To create the `fsdl-text-recognizer-project` directory, with files subsetted into labs and lab solutions, run `admin/tasks/subset_repo_for_labs.sh`.
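For orientation, a usage sketch; the expected layout is inferred from `admin/tasks/subset_repo_for_labs.py` and `lab_specific_files.yml` later in this repo, so treat the listing as approximate:

```sh
# Run from the repo root; the wrapper script writes the subsetted project one directory up.
admin/tasks/subset_repo_for_labs.sh
ls ../fsdl-text-recognizer-project
# .circleci/  data/  lab1/ ... lab8/  readme.md  setup.md  requirements.txt  requirements-dev.txt  ...
```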
8 | -------------------------------------------------------------------------------- /admin/endpoint_tester/app.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import base64 4 | import glob 5 | 6 | import grequests 7 | 8 | NUM_CALLS = 500 # per each HTTP method 9 | TIMEOUT = 2.0 10 | LOCAL_IMAGE_GLOB = "../text_recognizer/tests/support/emnist/*.png" 11 | ENDPOINTS_FILE = "./endpoints.txt" 12 | IMAGE_URLS_FILE = "./remote_images.txt" 13 | 14 | 15 | def url_for_get(api_url, img_url): 16 | """Returns a url suitable for testing GET.""" 17 | return "%s?image_url=%s" % (api_url.strip("/"), img_url) 18 | 19 | 20 | def data_for_post(api_url, img_path): 21 | """Returns data param for testing POST.""" 22 | with open(img_path, "rb") as f: 23 | text = base64.b64encode(f.read()).decode("ascii") 24 | return {"image": "data:image/png;base64,'%s'" % text} 25 | 26 | 27 | def build_get_calls(api_url, img_urls): 28 | """Returns frozen GET calls.""" 29 | return [grequests.get(url_for_get(api_url, img_url), timeout=TIMEOUT) for img_url in img_urls] 30 | 31 | 32 | def build_post_calls(api_url, local_images): 33 | """Returns frozen POST calls.""" 34 | return [ 35 | grequests.post(api_url, data=data_for_post(api_url, img_path), timeout=TIMEOUT) for img_path in local_images 36 | ] 37 | 38 | 39 | def main(): 40 | """Reads the files and runs everything.""" 41 | with open(ENDPOINTS_FILE) as endpoints_file: 42 | endpoints = [x.strip() for x in endpoints_file.readlines()] 43 | with open(IMAGE_URLS_FILE) as image_urls_file: 44 | remote_image_urls = [x.strip() for x in image_urls_file.readlines()] 45 | local_images = glob.glob(LOCAL_IMAGE_GLOB) 46 | 47 | # build set of roughly 200 calls 48 | stuff = [] 49 | for url in endpoints: 50 | stuff.extend(build_get_calls(url, remote_image_urls)) 51 | stuff.extend(build_post_calls(url, local_images)) 52 | stuff *= int(200 / len(stuff)) 53 | 54 | good = 0 55 | total = 0 56 | while True: 57 | responses = grequests.map(stuff) 58 | total += len(stuff) 59 | good += len(stuff) - responses.count(None) 60 | b = "%s of %s completed." 
% (good, total) 61 | print(b, end="\r") 62 | 63 | 64 | if __name__ == "__main__": 65 | main() 66 | -------------------------------------------------------------------------------- /admin/endpoint_tester/images.txt: -------------------------------------------------------------------------------- 1 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/or%2Bif%2Bused%2Bthe%2Bresults.png 2 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/Cousin%2BElecs.png 3 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/Vic%2Btheater%2BSaturday%2Bafternoon.png 4 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/Yesterday%2Bit%2Boffered.png 5 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/and%2Bthe%2Bpiston%2Bis%2Bin%2Btop.png 6 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/distributions.png 7 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/do%2Bthat%2BIn.png 8 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/far%2Bas%2Bto%2Bsay.png 9 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/force%2Brequirements%2Band%2Bas%2Bit.png 10 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/he%2Bhad%2Blittle%2Btolerance%2Bfor%2BWhigs.png 11 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/of%2B120degrees%2B160degreesF%2B490.png 12 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/or%2Bif%2Bused%2Bthe%2Bresults.png 13 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/right%2BYour.png 14 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/secretaries.png 15 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/the%2Babsence%2Bof%2Bthe%2Bhymen%2Bis%2Bby%2Bno.png 16 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/the.png 17 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/to%2Bthe%2Bmarket.png 18 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/velocity%2Bis%2Bknown%2BCook%2Band.png 19 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/what.png 20 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/who.png 21 | -------------------------------------------------------------------------------- /admin/endpoint_tester/inv_images.txt: -------------------------------------------------------------------------------- 1 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/Corsi%2Bleft%2Bfor_inv.png 2 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/Cousin%2BElecs_inv.png 3 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/Vic%2Btheater%2BSaturday%2Bafternoon_inv.png 4 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/Yesterday%2Bit%2Boffered_inv.png 5 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/and%2Bthe%2Bpiston%2Bis%2Bin%2Btop_inv.png 6 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/distributions_inv.png 7 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/do%2Bthat%2BIn_inv.png 8 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/far%2Bas%2Bto%2Bsay_inv.png 9 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/force%2Brequirements%2Band%2Bas%2Bit_inv.png 10 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/he%2Bhad%2Blittle%2Btolerance%2Bfor%2BWhigs_inv.png 11 | 
http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/of%2B120degrees%2B160degreesF%2B490_inv.png 12 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/or%2Bif%2Bused%2Bthe%2Bresults_inv.png 13 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/right%2BYour_inv.png 14 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/secretaries_inv.png 15 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/the%2Babsence%2Bof%2Bthe%2Bhymen%2Bis%2Bby%2Bno_inv.png 16 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/the_inv.png 17 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/to%2Bthe%2Bmarket_inv.png 18 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/velocity%2Bis%2Bknown%2BCook%2Band_inv.png 19 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/what_inv.png 20 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/who_inv.png 21 | -------------------------------------------------------------------------------- /admin/endpoint_tester/make_paths_txt.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | with open("endpoints.txt") as endpoints_file: 4 | endpoints = [x.strip() for x in endpoints_file.readlines()] 5 | with open(sys.argv[1]) as image_urls_file: 6 | remote_image_urls = [x.strip() for x in image_urls_file.readlines()] 7 | 8 | paths = [] 9 | for endpoint in endpoints: 10 | for rem in remote_image_urls: 11 | s = "{0}/v1/predict?image_url={1}".format(endpoint, rem) 12 | paths.append(s) 13 | print(s) 14 | -------------------------------------------------------------------------------- /admin/endpoint_tester/remote_images.txt: -------------------------------------------------------------------------------- 1 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/or%2Bif%2Bused%2Bthe%2Bresults.png 2 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/Corsi%20left%20for_inv.png 3 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/Cousin%20Elecs.png 4 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/Cousin%20Elecs_inv.png 5 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/Vic%20theater%20Saturday%20afternoon.png 6 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/Vic%20theater%20Saturday%20afternoon_inv.png 7 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/Yesterday%20it%20offered.png 8 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/Yesterday%20it%20offered_inv.png 9 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/and%20the%20piston%20is%20in%20top.png 10 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/and%20the%20piston%20is%20in%20top_inv.png 11 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/distributions.png 12 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/distributions_inv.png 13 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/do%20that%20In.png 14 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/do%20that%20In_inv.png 15 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/far%20as%20to%20say.png 16 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/far%20as%20to%20say_inv.png 17 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/force%20requirements%20and%20as%20it.png 18 
| http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/force%20requirements%20and%20as%20it_inv.png 19 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/he%20had%20little%20tolerance%20for%20Whigs.png 20 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/he%20had%20little%20tolerance%20for%20Whigs_inv.png 21 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/of%20120degrees%20160degreesF%20490.png 22 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/of%20120degrees%20160degreesF%20490_inv.png 23 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/or%20if%20used%20the%20results.png 24 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/or%20if%20used%20the%20results_inv.png 25 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/right%20Your.png 26 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/right%20Your_inv.png 27 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/secretaries.png 28 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/secretaries_inv.png 29 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/the%20absence%20of%20the%20hymen%20is%20by%20no.png 30 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/the%20absence%20of%20the%20hymen%20is%20by%20no_inv.png 31 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/the.png 32 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/the_inv.png 33 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/to%20the%20market.png 34 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/to%20the%20market_inv.png 35 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/velocity%20is%20known%20Cook%20and.png 36 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/velocity%20is%20known%20Cook%20and_inv.png 37 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/what.png 38 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/what_inv.png 39 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/who.png 40 | http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/who_inv.png 41 | -------------------------------------------------------------------------------- /admin/endpoint_tester/run_siege.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | siege -c150 -d1 -i -f paths.txt 3 | -------------------------------------------------------------------------------- /admin/handwriting_data_gathering/print.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/the-full-stack/fsdl-text-recognizer/a99a3d3f0594dfceb249a56e8362337f9e12897e/admin/handwriting_data_gathering/print.pdf -------------------------------------------------------------------------------- /admin/handwriting_data_gathering/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "outputting markdown..." 4 | mkdir -p mds 5 | python output_markdown.py 6 | 7 | echo "converting to pdfs..." 8 | mkdir -p pdfs 9 | for i in {0..13}; do pandoc "mds/$i.md" -o "pdfs/$i.pdf"; done 10 | 11 | pdfunite pdfs/*.pdf print.pdf 12 | rm -r mds 13 | rm -r pdfs 14 | echo "print.pdf is ready to be printed!" 
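The `output_markdown.py` that this script calls is not reproduced in this section. As a rough, hypothetical sketch of what it has to do (fill the `{{ text }}` and `{{ source }}` placeholders of the template shown next and write one markdown file per page into `mds/`):

```python
"""Hypothetical sketch only; the repo's actual output_markdown.py may differ."""
from pathlib import Path

# Assumed data source: one (paragraph, source attribution) pair per printed page.
PARAGRAPHS = [
    ("Example paragraph to be copied out by hand.", "an example source"),
    # ...
]

TEMPLATE = Path("template.md").read_text()

for i, (text, source) in enumerate(PARAGRAPHS):
    page = TEMPLATE.replace("{{ text }}", text).replace("{{ source }}", source)
    (Path("mds") / f"{i}.md").write_text(page)  # run.sh creates mds/ before calling this script
```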
15 | -------------------------------------------------------------------------------- /admin/handwriting_data_gathering/template.md: -------------------------------------------------------------------------------- 1 | --- 2 | geometry: margin=1in 3 | output: pdf_document 4 | header-includes: | 5 | \definecolor{light-gray}{gray}{0.8} 6 | --- 7 | 8 | # Full Stack Deep Learning - November 2019 9 | 10 | ## Handwriting Data Collection 11 | 12 | Please write the following paragraph by hand in the space below. Do not sign your name. 13 | 14 | > {{ text }} 15 | 16 | --- 17 | 18 | 23 | 24 | \vfill 25 | 26 | --- 27 | 28 | By submitting this page, you consent to your handwriting becoming part of a publicly available dataset. 29 | 30 | Paragraph is from {{ source }} 31 | -------------------------------------------------------------------------------- /admin/outstanding_tasks.md: -------------------------------------------------------------------------------- 1 | ## Next 2 | 3 | - [ ] 1 go through all labs in Jupyterhub and take screenshots, putting them into the readme 4 | - [ ] 1 make the app.py use the joint model 5 | 6 | - [ ] 2 add more information to slides as preview of the important things we'll be doing 7 | - [ ] ability to run end-to-end from raw data, with caching along the way to speed up future runs 8 | - [ ] dataset streaming and augmentations (fast.ai, TFRecord) 9 | - [ ] specifying and recording experiments via config file 10 | - [ ] ability to run experiments and automatically pick best model 11 | - [ ] ability to create a deployment package in CI 12 | - [ ] 2 introduce code that picks best run from weights and biases (2 hours) 13 | - [ ] 2 add multi-gpu data parallelism option in run_experiment.py 14 | - [ ] 2 look into switching from flask to that async one in fast.ai course 15 | - [ ] 2 kick off another IAM training with ImageDataGenerator 16 | - [ ] 2 add tests for training (but don't run them in circleci) 17 | - [ ] 2 add to lab 4: output sample predictions every epoch so that they can be reviewed in weights and biases 18 | - [ ] 2 save experiment json along with weights, and just call it canonical_character_predictor_weights.h5 and canonical_character_predictor_config.json 19 | - easiest way to implement would probably be to pass in experiment_config from run_experiment to Model#save_weights 20 | 21 | - [ ] 3 add a notebook that uses our trained line detector on the fsdl handwriting data 22 | - [ ] 3 share pre-processing logic in predict() and fit()/evaluate() 23 | - [ ] 3 compute validation accuracy in ctc training (run decoding) 24 | 25 | - [ ] 4 make a flag for overfitting on one batch 26 | 27 | ## Done 28 | 29 | - [x] 20191029 look into writing lab readme's as slides using Marp, but decided against it for now, because wasn't able to find a solution that looked good in both github readme format (and typora) and marp 30 | - [x] 20191030 1 update Pipfile 31 | - tensorflow 1.15 seems to depend on functools32 which can't be installed for python3 32 | - tensorflow 1.14 has the dual -gpu and not-gpu nature, which is a little annoying, but fine 33 | - tensorflow 2.0 also has dual gpu 34 | - python3.7 has trouble installing a dependency of wandb (forgot the name) 35 | - settled on python3.6 and tensorflow 1.14 36 | -------------------------------------------------------------------------------- /admin/readme.md: -------------------------------------------------------------------------------- 1 | # Text Recognizer Project - Admin Readme 2 | 3 | ## Tasks 4 | 5 | ```sh 6 | 
admin/tasks/subset_repo_for_labs.py # Creates -in _labs by default 7 | 8 | admin/tasks/subset_repo_for_labs.sh # Creates in ../fsdl-text-recognition-project, which should be the public git repo 9 | ``` 10 | 11 | Uploading data to S3 is done with `aws s3 cp data/raw/iam/iamdb.zip s3://fsdl-public-assets/iam/iamdb.zip --profile fsdl` 12 | -------------------------------------------------------------------------------- /admin/tasks/lab_specific_files.yml: -------------------------------------------------------------------------------- 1 | 1: 2 | - notebooks/01-look-at-emnist.ipynb 3 | - tasks/train_character_predictor.sh 4 | - tasks/test_functionality.sh 5 | - text_recognizer/__init__.py 6 | - text_recognizer/util.py 7 | - text_recognizer/character_predictor.py 8 | - text_recognizer/datasets/__init__.py 9 | - text_recognizer/datasets/dataset.py 10 | - text_recognizer/datasets/emnist_dataset.py 11 | - text_recognizer/datasets/emnist_essentials.json 12 | - text_recognizer/datasets/dataset_sequence.py 13 | - text_recognizer/models/__init__.py 14 | - text_recognizer/models/base.py 15 | - text_recognizer/models/character_model.py 16 | - text_recognizer/networks/__init__.py 17 | - text_recognizer/networks/mlp.py 18 | - text_recognizer/tests/support/create_emnist_support_files.py 19 | - text_recognizer/tests/support/emnist/8.png 20 | - text_recognizer/tests/support/emnist/U.png 21 | - text_recognizer/tests/support/emnist/e.png 22 | - text_recognizer/weights/CharacterModel_EmnistDataset_mlp_weights.h5 23 | - text_recognizer/tests/test_character_predictor.py 24 | - training/util.py 25 | - training/run_experiment.py 26 | - text_recognizer/networks/lenet.py 27 | - text_recognizer/networks/misc.py 28 | - text_recognizer/tests/support/create_emnist_lines_support_files.py 29 | - "text_recognizer/tests/support/emnist_lines/Corsi left for.png" 30 | - "text_recognizer/tests/support/emnist_lines/do that In.png" 31 | - "text_recognizer/tests/support/emnist_lines/or if used the results.png" 32 | 33 | 2: 34 | - notebooks/02-look-at-emnist-lines.ipynb 35 | - notebooks/01b-cnn-for-emnist.ipynb 36 | - notebooks/02b-cnn-for-simple-emnist-lines.ipynb 37 | - text_recognizer/datasets/emnist_lines_dataset.py 38 | - text_recognizer/datasets/sentence_generator.py 39 | - text_recognizer/line_predictor.py 40 | - text_recognizer/models/line_model.py 41 | - text_recognizer/networks/line_cnn_all_conv.py 42 | 43 | 3: 44 | - tasks/train_lstm_line_predictor.sh 45 | - text_recognizer/models/line_model_ctc.py 46 | - text_recognizer/networks/ctc.py 47 | - text_recognizer/networks/line_lstm_ctc.py 48 | - text_recognizer/weights/LineModelCtc_EmnistLinesDataset_line_lstm_ctc_weights.h5 49 | - text_recognizer/tests/test_line_predictor.py 50 | 51 | 4: 52 | - "text_recognizer/tests/support/iam_lines/and came into the livingroom, where.png" 53 | - "text_recognizer/tests/support/iam_lines/He rose from his breakfast-nook bench.png" 54 | - "text_recognizer/tests/support/iam_lines/his entrance. 
He came, almost falling.png" 55 | - notebooks/03-look-at-iam-lines.ipynb 56 | - tasks/prepare_sample_experiments.sh 57 | - tasks/train_lstm_line_predictor_on_iam.sh 58 | - text_recognizer/datasets/iam_lines_dataset.py 59 | - text_recognizer/tests/support/create_iam_lines_support_files.py 60 | - text_recognizer/weights/LineModelCtc_IamLinesDataset_line_lstm_ctc_weights.h5 61 | - training/experiments/sample.json 62 | - training/gpu_manager.py 63 | - training/prepare_experiments.py 64 | - training/run_sweep.py 65 | - training/sweep_emnist.yaml 66 | - training/sweep_iam.yaml 67 | - wandb/settings 68 | 69 | 5: 70 | - notebooks/04-look-at-iam-paragraphs.ipynb 71 | - notebooks/04b-look-at-line-detector-predictions.ipynb 72 | - tasks/train_line_detector.sh 73 | - text_recognizer/datasets/iam_dataset.py 74 | - text_recognizer/datasets/iam_paragraphs_dataset.py 75 | - text_recognizer/models/line_detector_model.py 76 | - text_recognizer/networks/fcn.py 77 | - text_recognizer/paragraph_text_recognizer.py 78 | - text_recognizer/tests/support/iam_paragraphs/a01-000u-cropped.jpg 79 | - text_recognizer/tests/test_paragraph_text_recognizer.py 80 | - text_recognizer/weights/LineDetectorModel_IamParagraphsDataset_fcn_weights.h5 81 | 82 | 6: 83 | - notebooks/05-look-at-fsdl-handwriting.ipynb 84 | - tasks/update_fsdl_paragraphs_metadata.sh 85 | - text_recognizer/datasets/fsdl_handwriting_dataset.py 86 | - training/update_metadata.py 87 | 88 | 7: 89 | - api/__init__.py 90 | - evaluation/evaluate_character_predictor.py 91 | - evaluation/evaluate_line_predictor.py 92 | - tasks/lint.sh 93 | - tasks/test_validation.sh 94 | - .pylintrc 95 | - pyproject.toml 96 | - setup.cfg 97 | 98 | 8: 99 | - api/app.py 100 | - api/tests/test_app.py 101 | - api/Dockerfile 102 | - tasks/build_api_docker.sh 103 | - tasks/run_api_docker.sh 104 | - tasks/test_api.sh 105 | -------------------------------------------------------------------------------- /admin/tasks/print_repo_structure.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | tree -L 3 -I "run-*|node_modules|admin" 4 | -------------------------------------------------------------------------------- /admin/tasks/subset_repo_for_labs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Script to generate directories (or git branches) corresponding to subsets of the repo appropriate for different labs. 4 | 5 | The script creates a subset of files corresponding to labs with index less than or equal than the one given, 6 | as specified in lab_specific_files.yml 7 | 8 | Furthermore, it also strips out text between blocks like 9 | # Your code below (Lab1) 10 | # 11 | # Your code above (Lab1) 12 | for labs with index greater than or equal to the one given. 13 | 14 | It also strips text between blocks like 15 | # Hide lines below until Lab 2 16 | # 17 | # Hide lines above until Lab 2 18 | for labs with index greater than the one given. 19 | 20 | NOTE that the stripping is only performed on .py files. 21 | """ 22 | from pathlib import Path 23 | import argparse 24 | import os 25 | import glob 26 | import re 27 | import shutil 28 | 29 | import yaml 30 | 31 | MAX_LAB_NUMBER = 9 # NOTE: Setting this to 10 will break the regexp! 
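# For illustration (not part of the original script): with MAX_LAB_NUMBER = 9 and, say, lab_number = 3,
# _filter_your_code_blocks() below builds the pattern "# Your code below \(Lab [3|4|5|6|7|8]\)",
# i.e. a single-character class matching the digits 3 through 8 (the "|" characters inside the
# brackets are redundant but harmless). Every lab number must therefore be a single character,
# which is what the NOTE above about 10 breaking the regexp refers to.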
32 | REPO_DIRNAME = Path(__file__).resolve().parents[2] 33 | INFO_FILENAME = REPO_DIRNAME / "admin" / "tasks" / "lab_specific_files.yml" 34 | SOLUTION_VERSION_LABS = True 35 | 36 | 37 | def _filter_your_code_blocks(lines, lab_number): 38 | """ 39 | Strip out stuff between "Your code here" blocks. 40 | """ 41 | if lab_number == MAX_LAB_NUMBER: 42 | lab_numbers_to_strip = str(lab_number) 43 | else: 44 | lab_numbers_to_strip = f"[{'|'.join(str(num) for num in range(lab_number, MAX_LAB_NUMBER))}]" 45 | beginning_comment = f"# Your code below \(Lab {lab_numbers_to_strip}\)" 46 | ending_comment = f"# Your code above \(Lab {lab_numbers_to_strip}\)" 47 | filtered_lines = [] 48 | filtering = False 49 | for line in lines: 50 | if not filtering: 51 | filtered_lines.append(line) 52 | if re.search(beginning_comment, line): 53 | filtering = True 54 | filtered_lines.append("") 55 | if re.search(ending_comment, line): 56 | filtered_lines.append(line) 57 | filtering = False 58 | return filtered_lines 59 | 60 | 61 | def _filter_hidden_blocks(lines, lab_number): 62 | if lab_number == MAX_LAB_NUMBER: 63 | return lines 64 | if lab_number + 1 == MAX_LAB_NUMBER: 65 | lab_numbers_to_hide = str(MAX_LAB_NUMBER) 66 | else: 67 | lab_numbers_to_hide = f"[{'|'.join(str(num) for num in range(lab_number + 1, MAX_LAB_NUMBER))}]" 68 | beginning_comment = f"# Hide lines below until Lab {lab_numbers_to_hide}" 69 | ending_comment = f"# Hide lines above until Lab {lab_numbers_to_hide}" 70 | filtered_lines = [] 71 | filtering = False 72 | for line in lines: 73 | if re.search(beginning_comment, line): 74 | filtering = True 75 | if re.search(ending_comment, line): 76 | filtering = False 77 | continue 78 | if not filtering: 79 | filtered_lines.append(line) 80 | return filtered_lines 81 | 82 | 83 | def _replace_data_dirname(lines): 84 | filtered_lines = [] 85 | for line in lines: 86 | if line == ' return Path(__file__).resolve().parents[2] / "data"': 87 | line = ' return Path(__file__).resolve().parents[3] / "data"' 88 | filtered_lines.append(line) 89 | return filtered_lines 90 | 91 | 92 | def _copy_files_for_lab(info, lab_number, lab_output_dir): 93 | selected_paths = sum([info.get(number, []) for number in range(lab_number + 1)], []) 94 | new_paths = [] 95 | for path in selected_paths: 96 | new_path = lab_output_dir / path 97 | new_path.parents[0].mkdir(parents=True, exist_ok=True) 98 | shutil.copy(path, new_path) 99 | new_paths.append(new_path) 100 | return new_paths 101 | 102 | 103 | def _process_new_files(new_paths, lab_number, filter_your_code=True, filter_hidden=True, replace_data_dirname=True): 104 | for path in new_paths: 105 | if path.suffix != ".py": 106 | continue 107 | 108 | with open(path) as f: 109 | lines = f.read().split("\n") 110 | 111 | if filter_your_code: 112 | lines = _filter_your_code_blocks(lines, lab_number) 113 | if filter_hidden: 114 | lines = _filter_hidden_blocks(lines, lab_number) 115 | if replace_data_dirname: 116 | lines = _replace_data_dirname(lines) 117 | 118 | with open(path, "w") as f: 119 | f.write("\n".join(lines)) 120 | 121 | 122 | def subset_repo(info, output_dirname): 123 | """See module docstring.""" 124 | output_dir = Path(output_dirname) 125 | if output_dir.exists(): 126 | for directory in glob.glob(f"{str(output_dir)}/lab*"): 127 | shutil.rmtree(directory) 128 | if os.path.exists(output_dir / "data"): 129 | shutil.rmtree(output_dir / "data") 130 | 131 | output_dir.mkdir(parents=True, exist_ok=True) 132 | shutil.copytree(REPO_DIRNAME / "data", output_dir / "data") 133 | 134 | 
shutil.copy(".gitignore", output_dir) 135 | shutil.copy("environment.yml", output_dir) 136 | shutil.copy("requirements.in", output_dir) 137 | shutil.copy("requirements-dev.in", output_dir) 138 | shutil.copy("requirements.txt", output_dir) 139 | shutil.copy("requirements-dev.txt", output_dir) 140 | shutil.copy("instructions/readme.md", output_dir) 141 | shutil.copy("instructions/setup.md", output_dir) 142 | 143 | # Labs 144 | for lab_number in info.keys(): 145 | lab_output_dir = output_dir / f"lab{lab_number}" 146 | lab_output_dir.mkdir(parents=True) 147 | new_paths = _copy_files_for_lab(info, lab_number, lab_output_dir) 148 | _process_new_files(new_paths, lab_number, filter_your_code=(not SOLUTION_VERSION_LABS)) 149 | shutil.copy(f"instructions/lab{lab_number}.md", output_dir / f"lab{lab_number}" / "readme.md") 150 | 151 | (output_dir / ".circleci").mkdir(exist_ok=True) 152 | shutil.copy(".circleci/config.for-lab.yml", output_dir / ".circleci" / "config.yml") 153 | 154 | if not SOLUTION_VERSION_LABS: 155 | os.remove(output_dir / "lab1/text_recognizer/weights/CharacterModel_EmnistDataset_mlp_weights.h5") 156 | os.remove(output_dir / "lab2/text_recognizer/weights/LineModelCtc_EmnistLinesDataset_line_lstm_ctc_weights.h5") 157 | os.remove(output_dir / "lab4/text_recognizer/weights/LineModelCtc_IamLinesDataset_line_lstm_ctc_weights.h5") 158 | os.remove(output_dir / "lab5/text_recognizer/weights/LineDetectorModel_IamParagraphsDataset_fcn_weights.h5") 159 | 160 | 161 | def main(): 162 | parser = argparse.ArgumentParser() 163 | parser.add_argument("--output_dirname", default="_labs", help="Where to output the lab subset directories.") 164 | args = parser.parse_args() 165 | 166 | with open(INFO_FILENAME) as f: 167 | info = yaml.full_load(f.read()) 168 | 169 | subset_repo(info, args.output_dirname) 170 | 171 | 172 | if __name__ == "__main__": 173 | main() 174 | -------------------------------------------------------------------------------- /admin/tasks/subset_repo_for_labs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python admin/tasks/subset_repo_for_labs.py --output_dir ../fsdl-text-recognizer-project 4 | -------------------------------------------------------------------------------- /admin/wandb_hub/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:18.04 2 | 3 | # Install basics 4 | RUN apt-get update 5 | RUN apt-get install -y build-essential # For being able to compile bottleneck and some other Python package 6 | RUN apt-get install -y curl # For downloading conda 7 | RUN apt-get install -y git # We need git! 
8 | RUN apt-get install -y shellcheck # For linting 9 | 10 | # Install git-lfs 11 | RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash && apt-get install git-lfs && git-lfs install 12 | 13 | # Install miniconda 14 | RUN cd /tmp && curl -LO http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && bash Miniconda3-latest-Linux-x86_64.sh -p ~/miniconda -b && rm Miniconda3-latest-Linux-x86_64.sh && cd - 15 | ENV PATH=$HOME/miniconda/bin:$PATH 16 | 17 | # Install base conda packages to speed things up 18 | RUN conda update -y conda 19 | RUN conda install cudatoolkit=10.1 cudnn=7.6 pip python=3.7 20 | 21 | # Install some heavy packages in the default conda environment simply to cache these packages and save time later 22 | RUN pip install tensorflow==2.2.0rc2 torch 23 | -------------------------------------------------------------------------------- /admin/wandb_hub/readme.md: -------------------------------------------------------------------------------- 1 | ## July 31 Sync with Chris re: W&B Jupyter Hub 2 | 3 | - How many CPUs and how much RAM per container? 4 | - shows 8 cores, 32 GB 5 | - limit pod to 8GB, 2CPUs, 2GPUs 6 | - Is it possible to do 2 GPUs per container? 7 | - It is possible. They're running 2 GPUs per docker right now. 8 | - Persistent space 9 | - Chris will turn it on 10 | - 10GB 11 | - What should be mounted 12 | - next docker build will start in home directory 13 | - can set env variable to clone a repo other than ml-class 14 | - [ ] send chris repo to clone 15 | - Github access 16 | - should store username and password 17 | - Admin (see other sessions, etc)? 18 | - done 19 | - Troubleshooting (how to handle frozen sessions, for example)? 20 | - right-click in terminal, click Refresh Terminal 21 | - if people get "invalid code" messages, have them sign up for wandb again 22 | - [ ] send chris the schedule to pre-launch the cluster 23 | - Is it possible to run Docker inside of container? (can the container run privileged?) 24 | 25 | https://hub.wandb.us/hub/login 26 | https://hub.wandb.us/hub/login?gpu=true 27 | 28 | ## Things that should be set in environment 29 | 30 | ```sh 31 | export CUDA_DEVICE_ORDER=PCI_BUS_ID 32 | export PYTHONPATH=. 33 | alias ll="ls -lh" 34 | ``` 35 | -------------------------------------------------------------------------------- /api/Dockerfile: -------------------------------------------------------------------------------- 1 | # The "buster" flavor of the official docker Python image is based on Debian and includes common packages. 
2 | FROM python:3.7-buster 3 | 4 | # Create the working directory 5 | RUN set -ex && mkdir /repo 6 | WORKDIR /repo 7 | 8 | # Copy only the relevant directories to the working diretory 9 | COPY text_recognizer/ ./text_recognizer 10 | COPY api/ ./api 11 | 12 | # Install Python dependencies 13 | RUN set -ex && pip3 install -r api/requirements.txt 14 | 15 | # Run the web server 16 | EXPOSE 8000 17 | ENV PYTHONPATH /repo 18 | CMD python3 /repo/api/app.py 19 | -------------------------------------------------------------------------------- /api/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/the-full-stack/fsdl-text-recognizer/a99a3d3f0594dfceb249a56e8362337f9e12897e/api/__init__.py -------------------------------------------------------------------------------- /api/app.py: -------------------------------------------------------------------------------- 1 | """Flask web server serving text_recognizer predictions.""" 2 | import os 3 | 4 | from flask import Flask, request, jsonify 5 | import tensorflow.keras.backend as K 6 | 7 | from text_recognizer.line_predictor import LinePredictor 8 | import text_recognizer.util as util 9 | 10 | os.environ["CUDA_VISIBLE_DEVICES"] = "" # Do not use GPU 11 | 12 | app = Flask(__name__) # pylint: disable=invalid-name 13 | 14 | 15 | @app.route("/") 16 | def index(): 17 | """Provide simple health check route.""" 18 | return "Hello, world!" 19 | 20 | 21 | @app.route("/v1/predict", methods=["GET", "POST"]) 22 | def predict(): 23 | """Provide main prediction API route. Responds to both GET and POST requests.""" 24 | K.clear_session() 25 | predictor = LinePredictor() 26 | image = _load_image() 27 | pred, conf = predictor.predict(image) 28 | print("METRIC confidence {}".format(conf)) 29 | print("METRIC mean_intensity {}".format(image.mean())) 30 | print("INFO pred {}".format(pred)) 31 | return jsonify({"pred": str(pred), "conf": float(conf)}) 32 | 33 | 34 | def _load_image(): 35 | if request.method == "POST": 36 | data = request.get_json() 37 | if data is None: 38 | return "no json received" 39 | return util.read_b64_image(data["image"], grayscale=True) 40 | if request.method == "GET": 41 | image_url = request.args.get("image_url") 42 | if image_url is None: 43 | return "no image_url defined in query string" 44 | print("INFO url {}".format(image_url)) 45 | return util.read_image(image_url, grayscale=True) 46 | raise ValueError("Unsupported HTTP method") 47 | 48 | 49 | def main(): 50 | """Run the app.""" 51 | app.run(host="0.0.0.0", port=8000, debug=False) # nosec 52 | 53 | 54 | if __name__ == "__main__": 55 | main() 56 | -------------------------------------------------------------------------------- /api/requirements.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile 3 | # To update, run: 4 | # 5 | # pip-compile requirements.in 6 | # 7 | absl-py==0.9.0 # via tensorboard, tensorflow 8 | astunparse==1.6.3 # via tensorflow 9 | boltons==20.0.0 # via -r requirements.in 10 | cachetools==4.0.0 # via google-auth 11 | certifi==2019.11.28 # via requests 12 | chardet==3.0.4 # via requests 13 | click==7.1.1 # via flask 14 | editdistance==0.5.3 # via -r requirements.in 15 | flask==1.1.1 # via -r requirements.in 16 | gast==0.3.3 # via tensorflow 17 | google-auth-oauthlib==0.4.1 # via tensorboard 18 | google-auth==1.12.0 # via google-auth-oauthlib, tensorboard 19 | google-pasta==0.2.0 # via tensorflow 20 | 
grpcio==1.27.2 # via tensorboard, tensorflow 21 | h5py==2.10.0 # via -r requirements.in, tensorflow 22 | idna==2.9 # via requests 23 | itsdangerous==1.1.0 # via flask 24 | jinja2==2.11.1 # via flask 25 | keras-preprocessing==1.1.0 # via tensorflow 26 | markdown==3.2.1 # via tensorboard 27 | markupsafe==1.1.1 # via jinja2 28 | numpy==1.18.2 # via -r requirements.in, h5py, keras-preprocessing, opencv-python-headless, opt-einsum, scipy, tensorboard, tensorflow 29 | oauthlib==3.1.0 # via requests-oauthlib 30 | opencv-python-headless==4.2.0.32 # via -r requirements.in 31 | opt-einsum==3.2.0 # via tensorflow 32 | protobuf==3.11.3 # via tensorboard, tensorflow 33 | pyasn1-modules==0.2.8 # via google-auth 34 | pyasn1==0.4.8 # via pyasn1-modules, rsa 35 | requests-oauthlib==1.3.0 # via google-auth-oauthlib 36 | requests==2.23.0 # via -r requirements.in, requests-oauthlib, tensorboard 37 | rsa==4.0 # via google-auth 38 | scipy==1.4.1 # via tensorflow 39 | six==1.14.0 # via absl-py, astunparse, google-auth, google-pasta, grpcio, h5py, keras-preprocessing, protobuf, tensorboard, tensorflow 40 | tensorboard-plugin-wit==1.6.0.post2 # via tensorboard 41 | tensorboard==2.2.0 # via tensorflow 42 | tensorflow-estimator==2.2.0rc0 # via tensorflow 43 | tensorflow-cpu==2.2.0rc2 # via -r requirements.in 44 | termcolor==1.1.0 # via tensorflow 45 | toml==0.10.0 # via -r requirements.in 46 | tqdm==4.44.1 # via -r requirements.in 47 | urllib3==1.25.8 # via requests 48 | werkzeug==1.0.0 # via flask, tensorboard 49 | wheel==0.34.2 # via astunparse, tensorboard, tensorflow 50 | wrapt==1.11.2 # via -r requirements.in, tensorflow 51 | 52 | # The following packages are considered to be unsafe in a requirements file: 53 | # setuptools 54 | -------------------------------------------------------------------------------- /api/tests/test_app.py: -------------------------------------------------------------------------------- 1 | """Tests for web app.""" 2 | import os 3 | from pathlib import Path 4 | from unittest import TestCase 5 | import base64 6 | 7 | from api.app import app 8 | 9 | os.environ["CUDA_VISIBLE_DEVICES"] = "" 10 | 11 | REPO_DIRNAME = Path(__file__).parents[2].resolve() 12 | # SUPPORT_DIRNAME = REPO_DIRNAME / 'text_recognizer' / 'tests' / 'support' / 'iam_lines' 13 | SUPPORT_DIRNAME = REPO_DIRNAME / "text_recognizer" / "tests" / "support" / "emnist_lines" 14 | 15 | 16 | class TestIntegrations(TestCase): 17 | def setUp(self): 18 | self.app = app.test_client() 19 | 20 | def test_index(self): 21 | response = self.app.get("/") 22 | assert response.get_data().decode() == "Hello, world!" 
23 | 24 | def test_predict(self): 25 | with open(SUPPORT_DIRNAME / "or if used the results.png", "rb") as f: 26 | b64_image = base64.b64encode(f.read()) 27 | response = self.app.post("/v1/predict", json={"image": f"data:image/jpeg;base64,{b64_image.decode()}"}) 28 | json_data = response.get_json() 29 | self.assertEqual(json_data["pred"], "or if used the resuits") 30 | -------------------------------------------------------------------------------- /data/raw/emnist/metadata.toml: -------------------------------------------------------------------------------- 1 | filename = 'matlab.zip' 2 | sha256 = 'e1fa805cdeae699a52da0b77c2db17f6feb77eed125f9b45c022e7990444df95' 3 | url = 'https://s3-us-west-2.amazonaws.com/fsdl-public-assets/matlab.zip' 4 | -------------------------------------------------------------------------------- /data/raw/emnist/readme.md: -------------------------------------------------------------------------------- 1 | # EMNIST dataset 2 | 3 | The EMNIST dataset is a set of handwritten character digits derived from the NIST Special Database 19 4 | and converted to a 28x28 pixel image format and dataset structure that directly matches the MNIST dataset." 5 | From https://www.nist.gov/itl/iad/image-group/emnist-dataset 6 | 7 | Original url is http://www.itl.nist.gov/iaui/vip/cs_links/EMNIST/matlab.zip 8 | 9 | We uploaded the same file to our S3 bucket for faster download. 10 | -------------------------------------------------------------------------------- /data/raw/fsdl_handwriting/fsdl_handwriting.json: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:720d6c72b4317a9a5492630a1c9f6d83a20d36101a29311a5cf7825c1d60c180 3 | size 170325 4 | -------------------------------------------------------------------------------- /data/raw/fsdl_handwriting/metadata.toml: -------------------------------------------------------------------------------- 1 | url = "https://dataturks.com/projects/sergeykarayev/fsdl_handwriting/export" 2 | filename = "fsdl_handwriting.json" 3 | sha256 = "720d6c72b4317a9a5492630a1c9f6d83a20d36101a29311a5cf7825c1d60c180" 4 | -------------------------------------------------------------------------------- /data/raw/fsdl_handwriting/readme.md: -------------------------------------------------------------------------------- 1 | # FSDL Handwriting Dataset 2 | 3 | Handwritten paragraphs generated in the FSDL March 2019 class and annotated using the DataTurks UX. 4 | 5 | Export via manual download on https://dataturks.com/projects/sergeykarayev/fsdl_handwriting/export 6 | -------------------------------------------------------------------------------- /data/raw/iam/metadata.toml: -------------------------------------------------------------------------------- 1 | url = 'https://s3-us-west-2.amazonaws.com/fsdl-public-assets/iam/iamdb.zip' 2 | filename = 'iamdb.zip' 3 | sha256 = 'f3c9e87a88a313e557c6d3548ed8a2a1af2dc3c4a678c5f3fc6f972ba4a50c55' 4 | -------------------------------------------------------------------------------- /data/raw/iam/readme.md: -------------------------------------------------------------------------------- 1 | # IAM Dataset 2 | 3 | The IAM Handwriting Database contains forms of handwritten English text which can be used to train and test handwritten text recognizers and to perform writer identification and verification experiments. 
4 | 5 | - 657 writers contributed samples of their handwriting 6 | - 1,539 pages of scanned text 7 | - 13,353 isolated and labeled text lines 8 | 9 | - http://www.fki.inf.unibe.ch/databases/iam-handwriting-database 10 | 11 | ## Pre-processing 12 | 13 | First, all forms were placed into one directory called `forms`, from original directories like `formsA-D`. 14 | 15 | To save space, I converted the original PNG files to JPG, and resized them to half-size 16 | ``` 17 | mkdir forms-resized 18 | cd forms 19 | ls -1 *.png | parallel --eta -j 6 convert '{}' -adaptive-resize 50% '../forms-resized/{.}.jpg' 20 | ``` 21 | 22 | ## Split 23 | 24 | The data split we will use is 25 | IAM lines Large Writer Independent Text Line Recognition Task (lwitlrt): 9,862 text lines. 26 | 27 | - The validation set has been merged into the train set. 28 | - The train set has 7,101 lines from 326 writers. 29 | - The test set has 1,861 lines from 128 writers. 30 | - The text lines of all data sets are mutually exclusive, thus each writer has contributed to one set only. 31 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: fsdl-text-recognizer 2 | channels: 3 | - defaults 4 | dependencies: 5 | - python=3.7 6 | - cudatoolkit=10.1 7 | - cudnn=7.6 8 | - pip 9 | - pip: 10 | - pip-tools 11 | -------------------------------------------------------------------------------- /evaluation/evaluate_character_predictor.py: -------------------------------------------------------------------------------- 1 | """Run validation test for CharacterPredictor.""" 2 | import os 3 | from pathlib import Path 4 | from time import time 5 | import unittest 6 | 7 | from text_recognizer.datasets import EmnistDataset 8 | from text_recognizer.character_predictor import CharacterPredictor 9 | 10 | os.environ["CUDA_VISIBLE_DEVICES"] = "" 11 | 12 | SUPPORT_DIRNAME = Path(__file__).parents[0].resolve() / "support" / "emnist" 13 | 14 | 15 | class TestEvaluateCharacterPredictor(unittest.TestCase): 16 | def test_evaluate(self): 17 | predictor = CharacterPredictor() 18 | dataset = EmnistDataset() 19 | dataset.load_or_generate_data() 20 | t = time() 21 | metric = predictor.evaluate(dataset) 22 | time_taken = time() - t 23 | print(f"acc: {metric}, time_taken: {time_taken}") 24 | self.assertGreater(metric, 0.6) 25 | self.assertLess(time_taken, 10) 26 | -------------------------------------------------------------------------------- /evaluation/evaluate_line_predictor.py: -------------------------------------------------------------------------------- 1 | """Run validation test for LinePredictor.""" 2 | import os 3 | from pathlib import Path 4 | from time import time 5 | import unittest 6 | 7 | from text_recognizer.datasets import EmnistLinesDataset 8 | from text_recognizer.datasets import IamLinesDataset 9 | from text_recognizer.line_predictor import LinePredictor 10 | 11 | os.environ["CUDA_VISIBLE_DEVICES"] = "" 12 | 13 | EMNIST_SUPPORT_DIRNAME = Path(__file__).parents[0].resolve() / "support" / "emnist_lines" 14 | IAM_SUPPORT_DIRNAME = Path(__file__).parents[0].resolve() / "support" / "iam_lines" 15 | 16 | 17 | class TestEvaluateLinePredictorEmnist(unittest.TestCase): 18 | def test_evaluate(self): 19 | predictor = LinePredictor(EmnistLinesDataset) 20 | dataset = EmnistLinesDataset() 21 | 22 | dataset.load_or_generate_data() 23 | 24 | t = time() 25 | metric = predictor.evaluate(dataset) 26 | time_taken = time() - t 27 
| 28 | print(f"acc: {metric}, time_taken: {time_taken}") 29 | self.assertGreater(metric, 0.6) 30 | self.assertLess(time_taken, 120) 31 | 32 | 33 | class TestEvaluateLinePredictorIam(unittest.TestCase): 34 | def test_evaluate(self): 35 | predictor = LinePredictor(IamLinesDataset) 36 | dataset = IamLinesDataset() 37 | 38 | dataset.load_or_generate_data() 39 | 40 | t = time() 41 | metric = predictor.evaluate(dataset) 42 | time_taken = time() - t 43 | 44 | print(f"acc: {metric}, time_taken: {time_taken}") 45 | self.assertGreater(metric, 0.6) 46 | self.assertLess(time_taken, 180) 47 | -------------------------------------------------------------------------------- /instructions/editor.md: -------------------------------------------------------------------------------- 1 | # Setting up editor 2 | 3 | ## VSCode 4 | 5 | There are two things you want to make sure of when using VSCode: 1) that it uses the right environment, and 2) that it lints your files as you work. 6 | 7 | Here is my setup for linting: 8 | 9 | ``` 10 | { 11 | "editor.rulers": [120], 12 | "files.exclude": { 13 | "**/.git": true, 14 | "**/.DS_Store": true, 15 | "**/__pycache__": true, 16 | "**/.pytest_cache": true, 17 | "**/.mypy_cache": true 18 | }, 19 | "python.linting.pep8Enabled": true, 20 | "python.linting.pep8Path": "pycodestyle", 21 | "python.linting.pylintEnabled": true, 22 | "python.linting.mypyEnabled": true, 23 | "python.linting.banditEnabled": true, 24 | "python.linting.banditArgs": ["-ll"], 25 | "python.linting.enabled": true, 26 | "[python]": { 27 | "editor.tabSize": 4 28 | }, 29 | } 30 | 31 | ``` 32 | -------------------------------------------------------------------------------- /instructions/lab1.md: -------------------------------------------------------------------------------- 1 | # Lab 1: Single-character prediction 2 | 3 | ## Before you begin, make sure to set up! 4 | 5 | Please complete [Lab Setup](/setup.md) before proceeding! 6 | 7 | ## Goal of the lab 8 | 9 | Train a model to solve a simplified version of the line text recognition problem. 10 | 11 | ## Outline 12 | 13 | - Intro to EMNIST, a character prediction dataset. 14 | - Explore the `networks` and `training` code. 15 | - Train simple MLP/CNN baselines to solve EMNIST. 16 | - Test your model. 17 | 18 | ## Follow along 19 | 20 | ``` 21 | git pull 22 | cd lab1/ 23 | ``` 24 | 25 | ## Intro to EMNIST 26 | 27 | - EMNIST = Extended Mini-NIST :) 28 | - All English letters and digits presented in the MNIST format. 29 | - Look at: `notebooks/01-look-at-emnist.ipynb` 30 | 31 | ## Networks and training code 32 | 33 | ``` 34 | - text_recognizer/networks/mlp.py 35 | - text_recognizer/networks/lenet.py 36 | - text_recognizer/models/base.py 37 | - text_recognizer/models/character_model.py 38 | - training/util.py 39 | ``` 40 | 41 | ## Train MLP and CNN 42 | 43 | You can run the shortcut command `tasks/train_character_predictor.sh`, which runs the following: 44 | 45 | ```sh 46 | training/run_experiment.py --save \ 47 | '{"dataset": "EmnistDataset", "model": "CharacterModel", "network": "mlp", "train_args": {"batch_size": 256}}' 48 | ``` 49 | 50 | It will take a couple of minutes to train your model. 
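The JSON argument is the experiment config: the `dataset`, `model`, and `network` entries name things defined in the `text_recognizer` package, and the `*_args` dictionaries are passed along to them. As a rough sketch of how `training/run_experiment.py` might interpret such a config (class lookups mirror the config keys, but constructor and `fit()` signatures below are illustrative, not the repo's exact API):

```python
# Rough sketch of how an experiment config could be interpreted.
# Signatures are illustrative; the real training/run_experiment.py differs in details.
import importlib
import json


def run_experiment_sketch(config_json: str):
    config = json.loads(config_json)

    datasets_module = importlib.import_module("text_recognizer.datasets")
    models_module = importlib.import_module("text_recognizer.models")
    networks_module = importlib.import_module("text_recognizer.networks")

    # Look up the classes/functions named in the config
    dataset_cls = getattr(datasets_module, config["dataset"])
    dataset = dataset_cls(**config.get("dataset_args", {}))
    dataset.load_or_generate_data()

    model_cls = getattr(models_module, config["model"])
    network_fn = getattr(networks_module, config["network"])
    model = model_cls(dataset_cls=dataset_cls, network_fn=network_fn,
                      network_args=config.get("network_args", {}))

    # train_args (e.g. batch_size, epochs) are forwarded to training
    model.fit(dataset=dataset, **config.get("train_args", {}))
    return model
```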
51 | 52 | Just for fun, you could also try a larger MLP, with a smaller batch size: 53 | 54 | ```sh 55 | training/run_experiment.py \ 56 | '{"dataset": "EmnistDataset", "model": "CharacterModel", "network": "mlp", "network_args": {"num_layers": 8}, "train_args": {"batch_size": 128}}' 57 | ``` 58 | 59 | ## Testing 60 | 61 | First, let's take a look at how the test works at 62 | 63 | ``` 64 | text_recognizer/tests/test_character_predictor.py 65 | ``` 66 | 67 | Now let's see if it works by running: 68 | 69 | ```sh 70 | pytest -s text_recognizer/tests/test_character_predictor.py 71 | ``` 72 | 73 | Or, use the shorthand `tasks/test_functionality.sh`. 74 | 75 | Testing should finish quickly. 76 | -------------------------------------------------------------------------------- /instructions/lab2.md: -------------------------------------------------------------------------------- 1 | # Lab 2: Convolutional Nets 2 | 3 | ## Goal of the lab 4 | 5 | - Use a simple convolutional network to recognize EMNIST characters. 6 | - Construct a synthetic dataset of EMNIST lines. 7 | - Move from reading single characters to reading lines. 8 | 9 | ## Follow along 10 | 11 | ``` 12 | git pull 13 | cd lab2 14 | ``` 15 | 16 | ## Using a convolutional network for recognizing EMNIST characters 17 | 18 | We left off in Lab 1 having trained an MLP model on the EMNIST characters dataset. 19 | 20 | Let's also train a CNN on the same task. 21 | We can start in the notebook `notebooks/01b-cnn-for-emnist.ipynb`. 22 | 23 | We can also run the same experiment with 24 | 25 | ```sh 26 | training/run_experiment.py '{"dataset": "EmnistDataset", "model": "CharacterModel", "network": "lenet", "train_args": {"epochs": 1}}' 27 | ``` 28 | 29 | Training the single epoch will take about 2 minutes (that's why we only do one epoch in this lab :)). 30 | Leave it running while we go on to the next part. 31 | 32 | ### Subsampling data 33 | 34 | It is very useful to be able to subsample the dataset for quick experiments. 35 | This is possible by passing `subsample_fraction=0.1` (or some other fraction) at dataset initialization, or in `dataset_args` in the `run_experiment.py` dictionary, for example: 36 | 37 | ```sh 38 | training/run_experiment.py '{"dataset": "EmnistDataset", "dataset_args": {"subsample_fraction": 0.25}, "model": "CharacterModel", "network": "lenet"}' 39 | ``` 40 | 41 | ## Making a synthetic dataset of EMNIST Lines 42 | 43 | - A synthetic dataset we built for this project 44 | - Sample sentences from the Brown corpus 45 | - For each character, sample a random EMNIST character and place it on a line (with some random overlap) 46 | - Look at: `notebooks/02-look-at-emnist-lines.ipynb` 47 | 48 | ## Reading multiple characters at once 49 | 50 | Now that we have a dataset of lines and not just single characters, we can apply our convolutional net to it. 51 | 52 | Let's look at `notebooks/02b-cnn-for-simple-emnist-lines.ipynb`, where we generate a dataset with at most 8 characters and no overlap. 53 | 54 | The first network we try is simply the same LeNet network we used for single characters, applied to each character in sequence, using the `TimeDistributed` layer. 55 | 56 | We can also express the same network using all convolutional layers, which we do next.
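Before the all-convolutional version, here is a minimal sketch of the `TimeDistributed` idea just described: slice the line image into fixed-width character windows and run the same small convnet on each slice. It is a self-contained illustration, not the repo's actual `lenet.py` or `line_cnn_all_conv.py` code, and it assumes 28x28 characters laid side by side with no overlap.

```python
# Minimal sketch: apply a per-character CNN across a line with TimeDistributed.
# Assumes 28x28 characters laid side by side with no overlap; the repo's real
# networks differ in details.
from tensorflow.keras import layers, models


def line_cnn_sketch(image_height=28, image_width=28 * 8, num_classes=64, max_length=8):
    char_width = image_width // max_length

    # Small per-character convnet, roughly LeNet-shaped
    char_cnn = models.Sequential([
        layers.Conv2D(32, 3, activation="relu", input_shape=(image_height, char_width, 1)),
        layers.MaxPooling2D(2),
        layers.Flatten(),
        layers.Dense(128, activation="relu"),
        layers.Dense(num_classes, activation="softmax"),
    ])

    line_input = layers.Input(shape=(image_height, image_width))
    # (height, width) -> (height, max_length, char_width, 1): split width into character windows
    x = layers.Reshape((image_height, max_length, char_width, 1))(line_input)
    # -> (max_length, height, char_width, 1): one slice per character
    x = layers.Permute((2, 1, 3, 4))(x)
    # Run the same character CNN on every slice, giving one softmax per character
    output = layers.TimeDistributed(char_cnn)(x)
    return models.Model(inputs=line_input, outputs=output)
```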
57 | 58 | We can train this model with a command, too: 59 | 60 | ```sh 61 | python training/run_experiment.py --save '{"train_args": {"epochs": 5}, "dataset": "EmnistLinesDataset", "dataset_args": {"max_length": 8, "max_overlap": 0}, "model": "LineModel", "network": "line_cnn_all_conv"}' 62 | ``` 63 | -------------------------------------------------------------------------------- /instructions/lab3.md: -------------------------------------------------------------------------------- 1 | # Lab 3: Using a sequence model for line text recognition 2 | 3 | ## Goal of the lab 4 | 5 | Use sequence modeling to be able to handle overlapping characters (input sequence no longer maps neatly onto output sequence). 6 | 7 | ## Outline 8 | 9 | - Overview of the model, network, and loss 10 | - Train an LSTM on EMNIST 11 | 12 | ## Follow along 13 | 14 | ``` 15 | git pull 16 | cd lab3 17 | ``` 18 | 19 | ## Overview of model and loss 20 | 21 | - Look at slides for CTC loss 22 | - Look at `networks/line_lstm_ctc.py` 23 | - Look at `models/line_model_ctc.py` 24 | 25 | ## Train LSTM model with CTC loss 26 | 27 | Let's train an LSTM model with CTC loss. 28 | 29 | ```sh 30 | python training/run_experiment.py --save '{"train_args": {"epochs": 16}, "dataset": "EmnistLinesDataset", "model": "LineModelCtc", "network": "line_lstm_ctc"}' 31 | ``` 32 | 33 | or the shortcut `tasks/train_lstm_line_predictor.sh` 34 | 35 | ## Things to try 36 | 37 | If you have time left over, or want to play around with this later on, you can try writing your own non-CTC `line_lstm` network (define it in `text_recognizer/networks/line_lstm.py`). 38 | For example, you could code up an encoder-decoder architecture with attention. 39 | -------------------------------------------------------------------------------- /instructions/lab5.md: -------------------------------------------------------------------------------- 1 | # Lab 5: Line Detection 2 | 3 | At this point, we have trained a model that can recognize text in a line, given an image of a single line. 4 | 5 | ## Goal of the lab 6 | 7 | Our next task is to automatically detect line regions in an image of a whole paragraph of text. 8 | 9 | Our approach will be to train a model that, when given an image containing lines of text, returns a pixelwise labeling of that image, with each pixel belonging to either background, odd line of handwriting, or even line of handwriting. 10 | Given the output of the model, we can find line regions with an easy image processing operation. 11 | 12 | ## Setup 13 | 14 | - As always, `git pull` in the `~/fsdl-text-recognizer-project` repo to get the latest code. 15 | - Then `cd lab5`. 16 | 17 | ## Data 18 | 19 | We are starting from the IAM dataset, which includes not only lines but the original writing sample forms, with each line and word region annotated. 20 | 21 | Let's load the IAM dataset and then look at the data files. 22 | Run `python text_recognizer/datasets/iam_dataset.py` 23 | Let's look at the raw data files, which are in `~/fsdl-text-recognizer-project/data/raw/iam/iamdb/forms`. 24 | 25 | We want to crop out the region of each page corresponding to the handwritten paragraph as our model input, and generate corresponding ground truth. 26 | 27 | Code to do this is in `text_recognizer/datasets/iam_paragraphs_dataset.py` 28 | 29 | We can look at the results in `notebooks/04-look-at-iam-paragraphs.ipynb` and by looking at some debug images we output in `data/interim/iam_paragraphs`. 
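The gist of that processing, as a simplified sketch (the real `iam_paragraphs_dataset.py` also handles scaling, padding, and file I/O, and the `x1`/`y1`/`x2`/`y2` region keys below are an assumed format): take the union of the annotated line regions as the paragraph crop, and paint a ground-truth image in which background is 0, odd lines are 1, and even lines are 2.

```python
# Simplified sketch of paragraph cropping and ground-truth generation.
# `line_regions` is assumed to be a list of dicts with x1/y1/x2/y2 keys;
# the real dataset code differs in details.
import numpy as np


def crop_paragraph_and_ground_truth(page: np.ndarray, line_regions: list):
    # Union bounding box of all line regions = the paragraph crop
    x1 = min(r["x1"] for r in line_regions)
    y1 = min(r["y1"] for r in line_regions)
    x2 = max(r["x2"] for r in line_regions)
    y2 = max(r["y2"] for r in line_regions)
    crop = page[y1:y2, x1:x2]

    # 0 = background, 1 = odd line, 2 = even line
    ground_truth = np.zeros(crop.shape[:2], dtype=np.uint8)
    for i, region in enumerate(line_regions):
        label = 1 if i % 2 == 0 else 2
        ground_truth[region["y1"] - y1:region["y2"] - y1,
                     region["x1"] - x1:region["x2"] - x1] = label
    return crop, ground_truth
```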
30 | 31 | ## Training data augmentation 32 | 33 | The model code for our new `LineDetector` is in `text_recognizer/models/line_detector_model.py`. 34 | 35 | Because we only have about a thousand images to learn this task on, data augmentation will be crucial. 36 | Image augmentations such as stretching, slight rotations, offsets, contrast and brightness changes, and potentially even mirror-flipping are tedious to code, and most frameworks provide optimized utility code for the task. 37 | 38 | We use Keras's `ImageDataGenerator`, and you can see the parameters for it in `text_recognizer/models/line_detector_model.py`. 39 | We can take a look at what the data transformations look like in the same notebook. 40 | 41 | ## Network description 42 | 43 | The network used in this model is `text_recognizer/networks/fcn.py`. 44 | 45 | The basic idea is a deep convolutional network with resnet-style blocks (the input to each block is concatenated to the block's output). 46 | We call it FCN, as in "Fully Convolutional Network," after the seminal paper that first used convnets for segmentation. 47 | 48 | Unlike the original FCN, however, we do not maxpool or upsample, but instead rely on dilated convolutions to rapidly increase the effective receptive field. 49 | [Here](https://fomoro.com/research/articles/receptive-field-calculator) is a handy calculator of the effective receptive field size of a convnet. 50 | 51 | The crucial thing to understand is that because we are labeling odd and even lines differently, each predicted pixel needs the context of the entire image to be labeled correctly -- otherwise, there is no way to know whether the pixel is on an odd or even line. 52 | 53 | ## Review results 54 | 55 | The model converges to quite good line segmentations. 56 | 57 | Check out `notebooks/04b-look-at-line-detector-predictions.ipynb` to see sample predictions on the test set. 58 | 59 | We also plot some sample training data augmentation in that notebook. 60 | 61 | ## Combining the two models 62 | 63 | Now we are ready to combine the new `LineDetector` model and the `LinePredictor` model that we trained yesterday. 64 | 65 | This is done in `text_recognizer/paragraph_text_recognizer.py`, which loads both models, finds line regions with one, and runs each crop through the other. 66 | 67 | We can see that it works as expected (albeit not too accurately yet) by running `pytest -s text_recognizer/tests/test_paragraph_text_recognizer.py`. 68 | 69 | ## Things to try 70 | 71 | - Try adding more data augmentations, or mess with the parameters of the existing ones. 72 | - Try the U-Net architecture, which MaxPools down and then UpSamples back up, with increased conv layer channel dimensions in the middle (https://lmb.informatik.uni-freiburg.de/people/ronneber/u-net/). 73 | -------------------------------------------------------------------------------- /instructions/lab6.md: -------------------------------------------------------------------------------- 1 | # Lab 6: Data Labeling and Versioning 2 | 3 | In this lab we will annotate the handwriting samples we collected, export and version the resulting data, write an interface to the new data format, and download the pages in parallel. 4 | 5 | ## Data labeling 6 | 7 | We will be using a simple online data annotation web service called Dataturks. 8 | 9 | Please head to the [project page](https://dataturks.com/projects/sergeykarayev/fsdl_handwriting) and log in using our shared credential: `annotator@fullstackdeeplearning.com` (the password will be shared during lab).
10 | 11 | You should be able to start tagging now. 12 | Let's do it together for a little bit, and then you'll have time to do a full page by yourself. 13 | 14 | We'll sync up and review results in a few minutes. 15 | 16 | (Review results and discuss any differences in annotation and how they could be prevented.) 17 | 18 | ## Export data and update metadata file 19 | 20 | Let's now export the data from Dataturks and add it to our version control. 21 | 22 | You have noticed the `metadata.toml` files in all of our `data/raw` directories. 23 | They contain the remote source of the data, the filename it should have when downloaded, and a SHA-256 hash of the downloaded file. 24 | 25 | The idea is that the data file has all the information needed for our dataset. 26 | In our case, it has image URLs and all the annotations we made. 27 | From this, we can download the images, and transform the annotation data into something usable by our training scripts. 28 | The hash, combined with the state of the codebase (tracked by git), then uniquely identifies the data we're going to use to train. 29 | 30 | We replace the current `fsdl_handwriting.json` with the one we just exported, and now need to update the metadata file, since the hash is different. 31 | SHA256 hash of any file can be computed by running `shasum -a 256 `. 32 | We can also update `metadata.toml` with a convenient script that replace the SHA-256 of the current file with the SHA-256 of the new file. 33 | There is a convenience task script defined: `tasks/update_fsdl_paragraphs_metadata.sh`. 34 | 35 | The data file itself is checked into version control, but tracked with git-lfs, as it can get heavyweight and can change frequently as we keep adding and annotating more data. 36 | Note that `git-lfs` actually does something very similar to what we more manually do with `metadata.toml`. 37 | The reason we also use the latter is for standardization across other types of datasets, which may not have a file we want to check into even `git-lfs` -- for example, EMNIST and IAM, which are too large as they include the images. 38 | 39 | ## Download images 40 | 41 | The class `IamHandwritingDataset` in `text_recognizer/datasets/iam_handwriting.py` must be able to load the data in the exported format and present it to consumers in a format they expect (e.g. `dataset.line_regions_by_id`). 42 | 43 | Since this data export does not come with images, but only pointers to remote locations of the images, the class must also be responsible for downloading the images. 44 | 45 | In downloading many images, it is very useful to do so in parallel. 46 | We use the `concurrent.futures.ThreadPoolExecutor` method, and use the `tqdm` package to provide a nice progress bar. 47 | 48 | ## Looking at the data 49 | 50 | We can confirm that we loaded the data correctly by looking at line crops and their corresponding strings. 51 | 52 | Make sure you are in `lab6` directory, and take a look at `notebooks/05-look-at-fsdl-handwriting.ipynb`. 53 | 54 | ## Training on the new dataset 55 | 56 | We're not going to have time to train on the new dataset, but that is something that is now possible. 57 | As an exercise, you could write `FsdlHandwritingLinesDataset` and `FsdlHandwritingParagraphsDataset`, and be able to train a model on a combination of IAM and FSDL Handwriting data on both the line detection and line text prediction tasks. 
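For reference, here is a minimal sketch of the parallel-download pattern described in the "Download images" section above, using `concurrent.futures.ThreadPoolExecutor` with a `tqdm` progress bar. The destination directory and the URL-to-filename mapping are placeholders; the real dataset class wraps this logic differently.

```python
# Minimal sketch of downloading many images in parallel with a progress bar.
# The output directory and url -> filename mapping are placeholders.
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path

import requests
from tqdm import tqdm


def download_images(urls, out_dirname="data/raw/fsdl_handwriting/pages"):
    out_dir = Path(out_dirname)
    out_dir.mkdir(parents=True, exist_ok=True)

    def download_one(url):
        filename = out_dir / Path(url).name
        if not filename.exists():
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            filename.write_bytes(response.content)
        return filename

    # Threads work well here because the work is network-bound
    with ThreadPoolExecutor(max_workers=16) as executor:
        return list(tqdm(executor.map(download_one, urls), total=len(urls)))
```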
58 | -------------------------------------------------------------------------------- /instructions/lab7.md: -------------------------------------------------------------------------------- 1 | # Lab 7: Testing and Continuous Integration 2 | 3 | ## Goal of the lab 4 | 5 | - Add evaluation tests 6 | - Add linting to our codebase 7 | - Set up continuous integration via CircleCI, and see our commits pass/fail 8 | 9 | ## Follow along 10 | 11 | ``` 12 | git pull 13 | cd lab7/ 14 | ``` 15 | 16 | ## Linting script 17 | 18 | Running `tasks/lint.sh` fully lints our codebase with a few different checkers: 19 | 20 | - `safety` scans our Python package dependencies for known security vulnerabilities 21 | - `pylint` does static analysis of Python files and reports both style and bug problems 22 | - `pycodestyle` checks for simple code style guideline violations (somewhat overlapping with `pylint`) 23 | - `mypy` performs static type checking of Python files 24 | - `bandit` performs static analysis to find common security vulnerabilities in Python code 25 | - `shellcheck` finds bugs and potential bugs in shell scripts 26 | 27 | A note: in writing Bash scripts, I often refer to [this excellent guide](http://redsymbol.net/articles/unofficial-bash-strict-mode/). 28 | 29 | Note that the linters are configured using the `.pylintrc` and `setup.cfg` files, as well as flags specified in `lint.sh`. 30 | 31 | Getting linting right will pay off in no time, and is a must for any multi-developer codebase. 32 | 33 | ## Setting up CircleCI 34 | 35 | The relevant new files for setting up continuous integration are 36 | 37 | - `evaluation/evaluate_character_predictor.py` 38 | - `evaluation/evaluate_line_predictor.py` 39 | - `tasks/test_validation.sh` 40 | 41 | There is one additional file that is outside of the lab7 directory (in the top-level directory): `.circleci/config.yml` 42 | 43 | Let's set up CircleCI first and then look at the new evaluation files. 44 | 45 | Go to https://circleci.com and log in with your GitHub account. 46 | Click on Add Project. Select your fork of the `fsdl-text-recognizer-project` repo. 47 | It will ask you to place the `config.yml` file in the repo. 48 | Good news -- it's already there, so you can just hit the "Start building" button. 49 | 50 | While CircleCI starts the build, let's look at the `config.yml` file. 51 | 52 | Let's also check out the new validation test files: they simply evaluate the trained predictors on their respective test sets, and make sure they are above a threshold accuracy. 53 | 54 | Now that CircleCI is done building, let's push a commit so that we can see it build again, and check out the nice green checkmark in our commit history (https://github.com/sergeyktest/fsdl-text-recognizer-project/commits/master). 55 | -------------------------------------------------------------------------------- /instructions/lab8.md: -------------------------------------------------------------------------------- 1 | # Lab 8: Web Deployment 2 | 3 | ## Goal of the lab 4 | 5 | - Run our LinePredictor as a web app, and send it some requests 6 | - Dockerize our web app 7 | - Deploy our web app to production 8 | 9 | ## Follow along 10 | 11 | ``` 12 | git pull 13 | cd lab8/ 14 | ``` 15 | 16 | This lab has quite a few new files, mostly in the new `api/` directory. 17 | 18 | ## Serving predictions from a web server 19 | 20 | First, we will get a Flask web server up and running and serving predictions.
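In outline, `api/app.py` wraps a predictor in a small Flask app with a `/v1/predict` endpoint that accepts either a base64-encoded image in a POST body or an `image_url` query parameter. Here is a stripped-down sketch of that shape -- it assumes the predictor exposes a `predict(image)` method returning a prediction and a confidence, and the response field names are illustrative; the real file differs in details and adds the metric logging used in Lab 9.

```python
# Stripped-down sketch of a Flask prediction server in the spirit of api/app.py.
# Predictor interface and response field names are illustrative assumptions.
import base64

import cv2
import numpy as np
import requests
from flask import Flask, jsonify, request

from text_recognizer.datasets import EmnistLinesDataset
from text_recognizer.line_predictor import LinePredictor

app = Flask(__name__)
predictor = LinePredictor(EmnistLinesDataset)


def _image_from_bytes(image_bytes: bytes) -> np.ndarray:
    """Decode raw image bytes into a grayscale numpy image."""
    buffer = np.frombuffer(image_bytes, dtype=np.uint8)
    return cv2.imdecode(buffer, cv2.IMREAD_GRAYSCALE)


@app.route("/v1/predict", methods=["GET", "POST"])
def predict():
    if request.method == "POST":
        # Body looks like {"image": "data:image/png;base64,<payload>"}
        b64_payload = request.get_json()["image"].split(",", 1)[1]
        image = _image_from_bytes(base64.b64decode(b64_payload))
    else:
        # GET passes a URL to the image instead of the image itself
        response = requests.get(request.args["image_url"], timeout=10)
        image = _image_from_bytes(response.content)
    pred, conf = predictor.predict(image)
    return jsonify({"pred": str(pred), "conf": float(conf)})


if __name__ == "__main__":
    app.run(host="0.0.0.0", port=8000)
```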
21 | 22 | ``` 23 | python api/app.py 24 | ``` 25 | 26 | Open up another terminal tab (click on the '+' button under 'File' to open the 27 | launcher). In this terminal, we'll send some test image to the web server 28 | we're running in the first terminal. 29 | 30 | **Make sure to `cd` into the `lab8` directory in this new terminal.** 31 | 32 | ``` 33 | export API_URL=http://0.0.0.0:8000 34 | curl -X POST "${API_URL}/v1/predict" -H 'Content-Type: application/json' --data '{ "image": "data:image/png;base64,'$(base64 -w0 -i text_recognizer/tests/support/emnist_lines/or\ if\ used\ the\ results.png)'" }' 35 | ``` 36 | 37 | If you want to look at the image you just sent, you can navigate to 38 | `lab8/text_recognizer/tests/support/emnist_lines` in the file browser on the 39 | left, and open the image. 40 | 41 | We can also send a request specifying a URL to an image: 42 | ``` 43 | curl "${API_URL}/v1/predict?image_url=http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/or%2Bif%2Bused%2Bthe%2Bresults.png" 44 | ``` 45 | 46 | You can shut down your flask server now. 47 | 48 | ## Adding web server tests 49 | 50 | The web server code should have a unit test just like the rest of our code. 51 | 52 | Let's check it out: the tests are in `api/tests/test_app.py`. 53 | You can run them with 54 | 55 | ```sh 56 | tasks/test_api.sh 57 | ``` 58 | 59 | ## Running web server in Docker 60 | 61 | Now, we'll build a docker image with our application. 62 | The Dockerfile in `api/Dockerfile` defines how we're building the docker image. 63 | 64 | Still in the `lab8` directory, run: 65 | 66 | ```sh 67 | tasks/build_api_docker.sh 68 | ``` 69 | 70 | This should take a couple of minutes to complete. 71 | 72 | When it's finished, you can run the server with `tasks/run_api_docker.sh` 73 | 74 | 75 | You can run the same curl commands as you did when you ran the flask server earlier, and see that you're getting the same results. 76 | 77 | ``` 78 | curl -X POST "${API_URL}/v1/predict" -H 'Content-Type: application/json' --data '{ "image": "data:image/png;base64,'$(base64 -w0 -i text_recognizer/tests/support/emnist_lines/or\ if\ used\ the\ results.png)'" }' 79 | 80 | curl "${API_URL}/v1/predict?image_url=http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/or%2Bif%2Bused%2Bthe%2Bresults.png" 81 | ``` 82 | 83 | If needed, you can connect to your running docker container by running: 84 | 85 | ```sh 86 | docker exec -it api bash 87 | ``` 88 | 89 | You can shut down your docker container now. 90 | 91 | We could deploy this container to a number of platforms. 92 | In this lab, we will deploy the app as a Docker container using https://render.com 93 | 94 | ## Web deployment 95 | 96 | TODO: render.com 97 | 98 | As before, we can test out our API by running a few curl commands (from the `lab8` directory). We need to change the `API_URL` first though to point it at Lambda: 99 | 100 | ``` 101 | export API_URL="https://REPLACE_THIS.execute-api.us-west-2.amazonaws.com/dev/" 102 | curl -X POST "${API_URL}/v1/predict" -H 'Content-Type: application/json' --data '{ "image": "data:image/png;base64,'$(base64 -w0 -i text_recognizer/tests/support/emnist_lines/or\ if\ used\ the\ results.png)'" }' 103 | curl "${API_URL}/v1/predict?image_url=http://s3-us-west-2.amazonaws.com/fsdl-public-assets/emnist_lines/or%2Bif%2Bused%2Bthe%2Bresults.png" 104 | ``` 105 | 106 | If the POST request fails, it's probably because you are in `api` and not in the top-level `lab8` directory. 
107 | 108 | You'll want to run the curl commands a couple of times -- the first execution may time out, because the function has to "warm up." 109 | After the first request, it will stay warm for 10-60 minutes. 110 | -------------------------------------------------------------------------------- /instructions/lab8_notes.md: -------------------------------------------------------------------------------- 1 | # Lab 8 notes 2 | 3 | - Live-code the Flask web app, explaining what's going on 4 | - At the end, should be able to curl the app running locally with a GET request and a POST request 5 | 6 | - Now, we're going to build it as a Docker container 7 | - go through each line 8 | - cover .dockerignore 9 | 10 | - Now, we're going to deploy to Lambda 11 | -------------------------------------------------------------------------------- /instructions/lab9.md: -------------------------------------------------------------------------------- 1 | # Lab 9: Monitoring a running web service 2 | 3 | ## Goals 4 | 5 | - Look at basic metrics and set up a more advanced one 6 | - Experience something going wrong in our deployed service, and catch it with metrics 7 | 8 | ## Monitoring 9 | 10 | We can look at the requests our function is receiving in the AWS CloudWatch interface. 11 | It shows requests, errors, duration, and some other metrics. 12 | 13 | What it does not show is the stuff we care about specifically for machine learning: data and prediction distributions. 14 | 15 | This is why we added a few extra metrics to `api/app.py`, in `predict()`. 16 | Using these simple print statements, we can set up CloudWatch metrics by using the Log Metrics functionality. 17 | 18 | ### Log Metrics 19 | 20 | Log in to your AWS Console, and make sure you're in the `us-west-2` region. 21 | 22 | Once you're in, click on 'Services' and go to 'CloudWatch' under 'Management Tools.' Click on 'Logs' in the left sidebar. This will have several log groups -- one for each of us. 23 | You can filter for yours by entering `/aws/lambda/text-recognizer-USERNAME-dev-api` (you need to enter the whole thing, not just your username). 24 | Click on yours. You'll see some log streams. If you click on one, you'll see some logs for requests to your API. Each log entry starts with START and ends with REPORT. The REPORT line has some interesting information about the API call, including memory usage and duration. 25 | 26 | We're also logging a couple of metrics for you: the confidences of the predictor and the mean intensities of the input images. 27 | Next, we're going to make it so you can visualize these metrics. Go back to the list of Log Groups by clicking on Logs again in the left sidebar. 28 | Find your log group, but don't click on it. You'll see a column that says 'Metric Filters.' You currently likely have 0 filters. Click on "0 filters." 29 | Click on 'Add Metric Filter.' 30 | 31 | Now, we need to add a pattern for parsing our metric out of the logs. Here's one you can use for the confidence levels. Enter this in the 'Filter Pattern' box. 32 | ``` 33 | [level=METRIC, metric_name=confidence, metric_value] 34 | ``` 35 | Click on 'Assign Metric.' 36 | Now, we need to name the metric and tell it what the data source is. Enter 'USERNAME_confidence' in the 'Metric name' box (replace USERNAME as usual). Click on 'Show advanced metric settings,' and for Metric Value, click on $metric_value to populate the text box. Hit 'Create Filter.'
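That filter pattern works because CloudWatch splits each log line on whitespace: a line like `METRIC confidence 0.92` yields the `level`, `metric_name`, and `metric_value` fields. The print statements in `predict()` are along these lines (a sketch -- exact variable names in `api/app.py` may differ):

```python
# Sketch of the METRIC log lines emitted from predict() in api/app.py.
# Variable names are illustrative; the space-separated format is what the
# CloudWatch filter patterns [level=METRIC, metric_name=..., metric_value] parse.
print(f"METRIC confidence {conf}")
print(f"METRIC mean_intensity {image.mean()}")
```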
37 | Since we're already here, let's go ahead and make another metric filter for the mean intensity. You can use this Filter Pattern: 38 | ``` 39 | [level=METRIC, metric_name=mean_intensity, metric_value] 40 | ``` 41 | You should name your metric "USERNAME_mean_intensity." 42 | 43 | Now we have a couple of metric filters set up. 44 | Unfortunately, Metric Filters only apply to new log entries, so go back to your terminal and send a few more requests to your endpoint. 45 | 46 | Now we can make a dashboard that shows our metrics. Click on 'Dashboards' in the left sidebar. Click 'Create Dashboard.' Name your dashboard your USERNAME. 47 | 48 | We're going to add a few widgets to your dashboard. For the first widget, select 'Line'. In the search box, search for your username. 49 | Click on 'Lambda > By Function Name' in the search results, and select the checkbox for 'Invocations.' This'll make a plot showing you much your API is being called. 50 | 51 | Let's add another widget -- select Line again. Go back to the Lambda metrics and select 'Duration' this time. 52 | 53 | Lastly, let's plot our custom metrics. Add one more 'Line' widget, search for your username again, and click on 'LogMetrics' and then 'Metrics with no dimensions'. 54 | Check two checkboxes: `USERNAME_confidence` and `USERNAME_mean_intensity.` Before hitting Create, click on the 'Graphed Metrics' tab above, and under the 'Y Axis' column, 55 | select the right arrow for one of the metrics (it doesn't matter which one). Now hit create. 56 | 57 | Feel free to resize and reorder your widgets. 58 | 59 | Make sure to save your dashboard -- else it won't persist across sessions. 60 | -------------------------------------------------------------------------------- /instructions/lab9_aws_and_monitoring.md: -------------------------------------------------------------------------------- 1 | Note that emailing credentials is a bad idea. You usually want to handle credentials in a more secure fashion. 2 | We're only doing it in this case because your credentials give you limited access and are for a temporary AWS account. 3 | 4 | You can also go to https://379872101858.signin.aws.amazon.com/console and log in with the email you used to register (and the password we emailed you), and create your own credentials if you prefer. 5 | 6 | ## Lambda monitoring 7 | 8 | We're going to check the logs and set up monitoring for your deployed API. In order to make the monitoring more interesting, we're going to simulate people using your API. 9 | 10 | **In order for us to do that, you need to go to https://goo.gl/forms/YQCXTI2k5R5Stq3u2 and submit your endpoint URL.** 11 | It should look like this (ending in "/dev/"): 12 | ``` 13 | https://REPLACE_THIS.execute-api.us-west-2.amazonaws.com/dev/ 14 | ``` 15 | 16 | If you haven't already sent a few requests to your endpoint, you should do so using the curl commands above. 17 | 18 | Next, log in to the AWS Console at https://379872101858.signin.aws.amazon.com/console (you should've gotten an email with your username and password). 19 | 20 | **Make sure that you switch into the Oregon region (also known as `us-west-2`) using the dropdown menu in the top right corner.** 21 | 22 | Once you're in, click on 'Services' and go to 'CloudWatch' under 'Management Tools.' Click on 'Logs' in the left sidebar. This will have several log groups -- one for each of us. 23 | You can filter for yours by entering `/aws/lambda/text-recognizer-USERNAME-dev-api` (you need to enter the whole thing, not just your username). 
24 | Click on yours. You'll some log streams. If you click on one, you'll see some logs for requests to your API. Each log entry starts with START and ends with REPORT. The REPORT line has some interesting information about the API call, including memory usage and duration. 25 | 26 | We're also logging a couple of metrics for you: the confidences of the predictor and the mean intensities of the input images. 27 | Next, we're going to make it so you can visualize these metrics. Go back to the list of Log Groups by clicking on Logs again in the left sidebar. 28 | Find your log group, but don't click on it. You'll see a column that says 'Metric Filters.' You currently likely have 0 filters. Click on "0 filters." 29 | Click on 'Add Metric Filter.' 30 | 31 | Now, we need to add a pattern for parsing our metric out of the logs. Here's one you can use for the confidence levels. Enter this in the 'Filter Pattern' box. 32 | ``` 33 | [level=METRIC, metric_name=confidence, metric_value] 34 | ``` 35 | Click on 'Assign Metric.' 36 | Now, we need to name the metric and tell it what the data source is. Enter 'USERNAME_confidence' in the 'Metric name' box (replace USERNAME as usual). Click on 'Show advanced metric settings,' and for Metric Value, click on $metric_value to populate the text box. Hit 'Create Filter.' 37 | Since we're already here, let's go ahead and make another metric filter for the mean intensity. You can use this Filter Pattern: 38 | ``` 39 | [level=METRIC, metric_name=mean_intensity, metric_value] 40 | ``` 41 | You should name your metric "USERNAME_mean_intensity." 42 | 43 | Now we have a couple of metric filters set up. 44 | Unfortunately, Metric Filters only apply to new log entries, so go back to your terminal and send a few more requests to your endpoint. 45 | 46 | Now we can make a dashboard that shows our metrics. Click on 'Dashboards' in the left sidebar. Click 'Create Dashboard.' Name your dashboard your USERNAME. 47 | 48 | We're going to add a few widgets to your dashboard. For the first widget, select 'Line'. In the search box, search for your username. 49 | Click on 'Lambda > By Function Name' in the search results, and select the checkbox for 'Invocations.' This'll make a plot showing you much your API is being called. 50 | 51 | Let's add another widget -- select Line again. Go back to the Lambda metrics and select 'Duration' this time. 52 | 53 | Lastly, let's plot our custom metrics. Add one more 'Line' widget, search for your username again, and click on 'LogMetrics' and then 'Metrics with no dimensions'. 54 | Check two checkboxes: `USERNAME_confidence` and `USERNAME_mean_intensity.` Before hitting Create, click on the 'Graphed Metrics' tab above, and under the 'Y Axis' column, 55 | select the right arrow for one of the metrics (it doesn't matter which one). Now hit create. 56 | 57 | Feel free to resize and reorder your widgets. 58 | 59 | Make sure to save your dashboard -- else it won't persist across sessions. 60 | 61 | You can play with your API here a bit while we turn on the traffic for everyone. Double check that you've submitted your endpoint to the Google form above. 62 | 63 | Once the traffic is going, refresh your dashboard a bit and watch it. We're going to change something about the traffic, and it's going to start making your API perform poorly. 64 | Try and figure out what's going on, and how you can fix it. We'll leave the adversarial traffic on for a while. 
65 | 66 | If you're curious, you can add a metric filter to show memory usage with this pattern: 67 | ``` 68 | [report_name="REPORT", request_id_name="RequestId:", request_id_value, duration_name="Duration:", duration_value, duration_unit="ms", billed_duration_name_1="Billed", bill_duration_name_2="Duration:", billed_duration_value, billed_duration_unit="ms", memory_size_name_1="Memory", memory_size_name_2="Size:", memory_size_value, memory_size_unit="MB", max_memory_used_name_1="Max", max_memory_used_name_2="Memory", max_memory_used_name_3="Used:", max_memory_used_value, max_memory_used_unit="MB"] 69 | ``` 70 | 71 | You can name it `USERNAME_memory`. Select `$max_memory_used_value` for the metric value. 72 | 73 | Make sure to save your dashboard! 74 | -------------------------------------------------------------------------------- /instructions/project_structure.md: -------------------------------------------------------------------------------- 1 | # Project Structure 2 | 3 | Before we get going with the labs, let's familiarize ourselves with the high-level design of the codebase. 4 | 5 | ## Follow along 6 | 7 | ``` 8 | cd lab8/ 9 | ``` 10 | 11 | ## Project structure 12 | 13 | Web backend 14 | 15 | ```sh 16 | api/ # Code for serving predictions as a REST API. 17 | tests/test_app.py # Test that predictions are working 18 | Dockerfile # Specifies Docker image that runs the web server. 19 | __init__.py 20 | app.py # Flask web server that serves predictions. 21 | ``` 22 | 23 | Data (not under version control - one level up in the hierarchy) 24 | 25 | ```sh 26 | data/ # Training data lives here 27 | raw/ 28 | emnist/metadata.toml # Specifications for downloading data 29 | ``` 30 | 31 | Experimentation 32 | 33 | ```sh 34 | evaluation/ # Scripts for evaluating model on eval set. 35 | evaluate_character_predictor.py 36 | 37 | notebooks/ # For snapshots of initial exploration, before solidifying code as proper Python files. 38 | 01-look-at-emnist.ipynb 39 | ``` 40 | 41 | Convenience scripts 42 | 43 | ```sh 44 | tasks/ 45 | # Deployment 46 | build_api_docker.sh 47 | 48 | # Code quality 49 | lint.sh 50 | 51 | # Tests 52 | test_api.sh 53 | test_functionality.sh 54 | test_validation.sh 55 | 56 | # Training 57 | train_character_predictor.sh 58 | ``` 59 | 60 | Main model and training code 61 | 62 | ```sh 63 | text_recognizer/ # Package that can be deployed as a self-contained prediction system 64 | __init__.py 65 | 66 | character_predictor.py # Takes a raw image and obtains a prediction 67 | line_predictor.py 68 | 69 | datasets/ # Code for loading datasets 70 | __init__.py 71 | dataset.py # Base class for datasets - logic for downloading data 72 | emnist_dataset.py 73 | emnist_essentials.json 74 | dataset_sequence.py 75 | 76 | models/ # Code for instantiating models, including data preprocessing and loss functions 77 | __init__.py 78 | base.py # Base class for models 79 | character_model.py 80 | 81 | networks/ # Code for building neural networks (i.e., 'dumb' input->output mappings) used by models 82 | __init__.py 83 | mlp.py 84 | 85 | tests/ 86 | support/ # Raw data used by tests 87 | test_character_predictor.py # Test model on a few key examples 88 | 89 | weights/ # Weights for production model 90 | CharacterModel_EmnistDataset_mlp_weights.h5 91 | 92 | util.py 93 | 94 | training/ # Code for running training experiments and selecting the best model. 95 | run_experiment.py # Parse experiment config and launch training.
96 | util.py # Logic for training a model with a given config 97 | ``` 98 | -------------------------------------------------------------------------------- /instructions/readme.md: -------------------------------------------------------------------------------- 1 | # Full Stack Deep Learning Labs 2 | 3 | Welcome! 4 | 5 | Project developed during lab sessions of the [Full Stack Deep Learning Bootcamp](https://fullstackdeeplearning.com). 6 | 7 | - We will build a handwriting recognition system from scratch, and deploy it as a web service. 8 | - Uses Keras, but designed to be modular, hackable, and scalable 9 | - Provides code for training models in parallel and store evaluation in Weights & Biases 10 | - We will set up continuous integration system for our codebase, which will check functionality of code and evaluate the model about to be deployed. 11 | - We will package up the prediction system as a REST API, deployable as a Docker container. 12 | - We will deploy the prediction system as a serverless function to Amazon Lambda. 13 | - Lastly, we will set up monitoring that alerts us when the incoming data distribution changes. 14 | 15 | ## Schedule for the November 2019 Bootcamp 16 | 17 | - First session (90 min) 18 | - [Setup](setup.md) (10 min): Get set up with jupyterhub. 19 | - Introduction to problem and [project structure](project_structure.md) (20 min). 20 | - Gather handwriting data (10 min). 21 | - [Lab 1](lab1.md) (20 min): Introduce EMNIST. Training code details. Train & evaluate character prediction baselines. 22 | - [Lab 2](lab2.md) (30 min): Introduce EMNIST Lines. Overview of CTC loss and model architecture. Train our model on EMNIST Lines. 23 | - Second session (60 min) 24 | - [Lab 3](lab3.md) (40 min): Weights & Biases + parallel experiments 25 | - [Lab 4](lab4.md) (20 min): IAM Lines and experimentation time (hyperparameter sweeps, leave running overnight). 26 | - Third session (90 min) 27 | - Review results from the class on W&B 28 | - [Lab 5](lab5.md) (45 min) Train & evaluate line detection model. 29 | - [Lab 6](lab6.md) (45 min) Label handwriting data generated by the class, download and version results. 30 | - Fourth session (75 min) 31 | - [Lab 7](lab7.md) (15 min) Add continuous integration that runs linting and tests on our codebase. 32 | - [Lab 8](lab8.md) (60 min) Deploy the trained model to the web using AWS Lambda. 33 | -------------------------------------------------------------------------------- /instructions/setup.md: -------------------------------------------------------------------------------- 1 | # Setup 2 | 3 | ## 1. Check out the repo 4 | 5 | You should already have the repo in your home directory. Go into it and make sure you have the latest. 6 | 7 | ```sh 8 | cd fsdl-text-recognizer-project 9 | git pull origin master 10 | ``` 11 | 12 | If not, open a shell in your JupyterLab instance and run 13 | 14 | ```sh 15 | git clone https://github.com/full-stack-deep-learning/fsdl-text-recognizer-project.git 16 | cd fsdl-text-recognizer-project 17 | ``` 18 | 19 | ## 2. Set up the Python environment 20 | 21 | ### If on GCP AI Platform Notebooks instance 22 | 23 | Simply run ```pip install -r requirements.txt -r requirements-dev.txt```. 24 | 25 | Also, run ```export PYTHONPATH=.``` before executing any commands later on, or you will get errors like `ModuleNotFoundError: No module named 'text_recognizer'`. 
26 | 27 | In order to not have to set `PYTHONPATH` in every terminal you open, just add that line as the last line of the `~/.bashrc` file using a text editor of your choice (e.g. `nano ~/.bashrc`) 28 | 29 | ### If on own machine 30 | 31 | Run `conda env create` to create an environment called `fsdl-text-recognizer`, as defined in `environment.yml`. 32 | This environment will provide us with the right Python version as well as the CUDA and CUDNN libraries. 33 | We will install Python libraries using `pip-sync`, however, which will let us do three nice things: 34 | 35 | 1. Separate out dev from production dependencies (`requirements-dev.in` vs `requirements.in`). 36 | 2. Have a lockfile of exact versions for all dependencies (the auto-generated `requirements-dev.txt` and `requirements.txt`). 37 | 3. Allow us to easily deploy to targets that may not support the `conda` environment. 38 | 39 | So, after running `conda env create`, activate the new environment and install the requirements: 40 | 41 | ```sh 42 | conda activate fsdl-text-recognizer 43 | pip-sync requirements.txt requirements-dev.txt 44 | ``` 45 | 46 | If you add, remove, or need to update versions of some requirements, edit the `.in` files, then run 47 | 48 | ``` 49 | pip-compile requirements.in && pip-compile requirements-dev.in 50 | ``` 51 | 52 | Now, every time you work in this directory, make sure to start your session with `conda activate fsdl-text-recognizer`. 53 | 54 | ## 3. Kick off a command 55 | 56 | Before we get started, please run a command that will take a little bit of time to execute. 57 | 58 | ```sh 59 | cd lab1/ 60 | python text_recognizer/datasets/emnist_dataset.py 61 | cd .. 62 | ``` 63 | 64 | # Ready 65 | 66 | Now you should be setup for the labs. The instructions for each lab are in readme files in their folders. 67 | 68 | You will notice that there are solutions for all the labs right here in the repo, too. 69 | If you get stuck, you are welcome to take a look! 70 | -------------------------------------------------------------------------------- /instructions/setup_extra.md: -------------------------------------------------------------------------------- 1 | # Setup 2 | 3 | ## Development on AWS (in progress) 4 | 5 | We will use the [Deep Learning Base AMI](https://aws.amazon.com/marketplace/pp/B07Y3VDBNS) which has NVIDA CUDA and GPU drivers, but no pre-installed deep learning framework Python packages (we will install those ourselves). 
6 | 7 | ```sh 8 | AMI="ami-0f4d5f31e6310624e" 9 | TYPE="p2.4xlarge" 10 | aws ec2 run-instances --image-id "$AMI" --instance-type "$TYPE" --key-name id_rsa --security-group-ids=sg-331f3543 11 | ``` 12 | 13 | We'll tag it for later ease of reference 14 | 15 | ```sh 16 | aws ec2 create-tags --resources --tags Key=Name,Value=fsdl 17 | ``` 18 | 19 | We also need to install aws CLI tools, and add two functions to our `.bashrc` or equivalent file 20 | 21 | ```sh 22 | function ec2ip() { 23 | echo $(aws ec2 describe-instances --filters "{\"Name\":\"tag:Name\", \"Values\":[\"$1\"]}" --query='Reservations[0].Instances[0].PublicIpAddress' | tr -d '"') 24 | } 25 | 26 | function ec2id() { 27 | echo $(aws ec2 describe-instances --filters "{\"Name\":\"tag:Name\", \"Values\":[\"$1\"]}" --query='Reservations[0].Instances[0].InstanceId' | tr -d '"') 28 | } 29 | ``` 30 | -------------------------------------------------------------------------------- /notebooks/archive/02-train-emnist-mlp.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 10, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "/Users/sergeyk/work/gradescope/full-stack-deep-learning/projects\n", 13 | "The autoreload extension is already loaded. To reload it, use:\n", 14 | " %reload_ext autoreload\n" 15 | ] 16 | } 17 | ], 18 | "source": [ 19 | "from pathlib import Path\n", 20 | "repo_dirname = Path.cwd().parents[1].resolve()\n", 21 | "print(repo_dirname)\n", 22 | "\n", 23 | "%load_ext autoreload\n", 24 | "%autoreload 2\n", 25 | "\n", 26 | "GPU_IND = 0\n", 27 | "\n", 28 | "import numpy as np\n", 29 | "import matplotlib.pyplot as plt\n", 30 | "%matplotlib inline\n", 31 | "\n", 32 | "import sys\n", 33 | "sys.path.append('..')\n", 34 | "\n", 35 | "from text_recognizer.datasets.emnist import EMNIST\n", 36 | "from text_recognizer.models.emnist_mlp import create_mlp_model\n", 37 | "from training.util import train_model" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 2, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "data = EmnistDataset()\n", 47 | "num_classes = data.y_train.shape[1]\n", 48 | "input_shape = data.x_train.shape[1]" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 4, 54 | "metadata": {}, 55 | "outputs": [ 56 | { 57 | "name": "stdout", 58 | "output_type": "stream", 59 | "text": [ 60 | "_________________________________________________________________\n", 61 | "Layer (type) Output Shape Param # \n", 62 | "=================================================================\n", 63 | "dense (Dense) (None, 128) 100480 \n", 64 | "_________________________________________________________________\n", 65 | "dropout (Dropout) (None, 128) 0 \n", 66 | "_________________________________________________________________\n", 67 | "dense_1 (Dense) (None, 128) 16512 \n", 68 | "_________________________________________________________________\n", 69 | "dropout_1 (Dropout) (None, 128) 0 \n", 70 | "_________________________________________________________________\n", 71 | "dense_2 (Dense) (None, 65) 8385 \n", 72 | "=================================================================\n", 73 | "Total params: 125,377\n", 74 | "Trainable params: 125,377\n", 75 | "Non-trainable params: 0\n", 76 | "_________________________________________________________________\n" 77 | ] 78 | } 79 | ], 80 | "source": [ 81 | "model = 
create_mlp_model(num_classes=num_classes, input_shape=input_shape)" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 6, 87 | "metadata": {}, 88 | "outputs": [ 89 | { 90 | "name": "stdout", 91 | "output_type": "stream", 92 | "text": [ 93 | "Train on 523449 samples, validate on 174483 samples\n", 94 | "Epoch 1/1\n", 95 | "523449/523449 [==============================] - 85s 162us/step - loss: 1.0147 - acc: 0.7138 - val_loss: 0.7489 - val_acc: 0.7779\n", 96 | "Training took 85.003911 s\n" 97 | ] 98 | } 99 | ], 100 | "source": [ 101 | "history = train_model(model=model, x_train=data.x_train, y_train=data.y_train, epochs=1, batch_size=32, loss='categorical_crossentropy')" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 7, 107 | "metadata": {}, 108 | "outputs": [ 109 | { 110 | "name": "stdout", 111 | "output_type": "stream", 112 | "text": [ 113 | "116323/116323 [==============================] - 3s 26us/step\n", 114 | "Test loss/accuracy: 0.7460721686058285 0.7792096146081137\n" 115 | ] 116 | } 117 | ], 118 | "source": [ 119 | "score = model.evaluate(data.x_test, data.y_test, verbose=1)\n", 120 | "print('Test loss/accuracy:', score[0], score[1])" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 9, 126 | "metadata": {}, 127 | "outputs": [ 128 | { 129 | "ename": "NameError", 130 | "evalue": "name '__file__' is not defined", 131 | "output_type": "error", 132 | "traceback": [ 133 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 134 | "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", 135 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mpathlib\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mMODELS_DIRNAME\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpathlib\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mPath\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m__file__\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mparents\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mresolve\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m/\u001b[0m \u001b[0;34m'models'\u001b[0m \u001b[0;34m/\u001b[0m \u001b[0;34m'emnist_mlp'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mfilename\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34mf'model.h5'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msave\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mMODELS_DIRNAME\u001b[0m \u001b[0;34m/\u001b[0m \u001b[0mfilename\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 136 | "\u001b[0;31mNameError\u001b[0m: name '__file__' is not defined" 137 | ] 138 | } 139 | ], 140 | "source": [ 141 | "import pathlib\n", 142 | "MODELS_DIRNAME = pathlib.Path(__file__).parents[1].resolve() / 'models' / 'emnist_mlp'\n", 143 | "filename = f'model.h5'\n", 144 | "model.save(MODELS_DIRNAME / filename)" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [] 153 | } 154 | ], 155 | "metadata": { 156 | "kernelspec": { 157 | "display_name": "Python 3", 158 | "language": "python", 159 | "name": "python3" 160 | }, 161 | "language_info": { 162 | "codemirror_mode": { 163 | "name": "ipython", 164 | "version": 3 165 | }, 166 | "file_extension": ".py", 
167 | "mimetype": "text/x-python", 168 | "name": "python", 169 | "nbconvert_exporter": "python", 170 | "pygments_lexer": "ipython3", 171 | "version": "3.6.6" 172 | } 173 | }, 174 | "nbformat": 4, 175 | "nbformat_minor": 2 176 | } 177 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 120 3 | target-version = ['py37'] 4 | -------------------------------------------------------------------------------- /requirements-dev.in: -------------------------------------------------------------------------------- 1 | -c requirements.txt 2 | bandit 3 | black 4 | gpustat 5 | gradescope-utils 6 | grequests # admin 7 | itermplot 8 | jupyterlab 9 | matplotlib 10 | mypy 11 | nltk 12 | pycodestyle 13 | pydocstyle 14 | pylint 15 | pytest 16 | pyyaml 17 | redlock-py 18 | tornado 19 | safety 20 | scipy 21 | pillow 22 | wandb 23 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile 3 | # To update, run: 4 | # 5 | # pip-compile requirements-dev.in 6 | # 7 | appdirs==1.4.3 # via black, virtualenv 8 | astroid==2.3.3 # via pylint 9 | attrs==19.3.0 # via black, jsonschema, pytest 10 | backcall==0.1.0 # via ipython 11 | bandit==1.6.2 # via -r requirements-dev.in 12 | black==19.10b0 # via -r requirements-dev.in 13 | bleach==3.1.4 # via nbconvert 14 | blessings==1.7 # via gpustat 15 | certifi==2019.11.28 # via -c requirements.txt, pipenv, requests, sentry-sdk 16 | chardet==3.0.4 # via -c requirements.txt, requests 17 | click==7.1.1 # via -c requirements.txt, black, safety, wandb 18 | configparser==5.0.0 # via wandb 19 | cycler==0.10.0 # via matplotlib 20 | decorator==4.4.2 # via ipython, traitlets 21 | defusedxml==0.6.0 # via nbconvert 22 | distlib==0.3.0 # via virtualenv 23 | docker-pycreds==0.4.0 # via wandb 24 | dparse==0.5.0 # via safety 25 | entrypoints==0.3 # via nbconvert 26 | filelock==3.0.12 # via virtualenv 27 | gevent==1.4.0 # via grequests 28 | gitdb==4.0.2 # via gitpython 29 | gitpython==3.1.0 # via bandit, wandb 30 | gpustat==0.6.0 # via -r requirements-dev.in 31 | gql==0.2.0 # via wandb 32 | gradescope-utils==0.3.1 # via -r requirements-dev.in 33 | graphql-core==1.1 # via gql 34 | greenlet==0.4.15 # via gevent 35 | grequests==0.4.0 # via -r requirements-dev.in 36 | idna==2.9 # via -c requirements.txt, requests 37 | importlib-metadata==1.6.0 # via jsonschema, pluggy, pytest, virtualenv 38 | ipykernel==5.2.0 # via notebook 39 | ipython-genutils==0.2.0 # via nbformat, notebook, traitlets 40 | ipython==7.13.0 # via ipykernel 41 | isort==4.3.21 # via pylint 42 | itermplot==0.331 # via -r requirements-dev.in 43 | jedi==0.16.0 # via ipython 44 | jinja2==2.11.1 # via -c requirements.txt, jupyterlab, jupyterlab-server, nbconvert, notebook 45 | json5==0.9.4 # via jupyterlab-server 46 | jsonschema==3.2.0 # via jupyterlab-server, nbformat 47 | jupyter-client==6.1.2 # via ipykernel, notebook 48 | jupyter-core==4.6.3 # via jupyter-client, nbconvert, nbformat, notebook 49 | jupyterlab-server==1.0.7 # via jupyterlab 50 | jupyterlab==2.0.1 # via -r requirements-dev.in 51 | kiwisolver==1.1.0 # via matplotlib 52 | lazy-object-proxy==1.4.3 # via astroid 53 | markupsafe==1.1.1 # via -c requirements.txt, jinja2 54 | matplotlib==3.2.1 # via -r requirements-dev.in, itermplot 55 | 
mccabe==0.6.1 # via pylint 56 | mistune==0.8.4 # via nbconvert 57 | more-itertools==8.2.0 # via pytest 58 | mypy-extensions==0.4.3 # via mypy 59 | mypy==0.770 # via -r requirements-dev.in 60 | nbconvert==5.6.1 # via notebook 61 | nbformat==5.0.4 # via nbconvert, notebook 62 | nltk==3.4.5 # via -r requirements-dev.in 63 | notebook==6.0.3 # via jupyterlab, jupyterlab-server 64 | numpy==1.18.2 # via -c requirements.txt, itermplot, matplotlib, scipy 65 | nvidia-ml-py3==7.352.0 # via gpustat, wandb 66 | packaging==20.3 # via dparse, pytest, safety 67 | pandocfilters==1.4.2 # via nbconvert 68 | parso==0.6.2 # via jedi 69 | pathspec==0.7.0 # via black 70 | pathtools==0.1.2 # via watchdog 71 | pbr==5.4.4 # via stevedore 72 | pexpect==4.8.0 # via ipython 73 | pickleshare==0.7.5 # via ipython 74 | pillow==7.0.0 # via -r requirements-dev.in 75 | pipenv==2018.11.26 # via dparse 76 | pluggy==0.13.1 # via pytest 77 | prometheus-client==0.7.1 # via notebook 78 | promise==2.3 # via gql, graphql-core 79 | prompt-toolkit==3.0.5 # via ipython 80 | psutil==5.7.0 # via gpustat, wandb 81 | ptyprocess==0.6.0 # via pexpect, terminado 82 | py==1.8.1 # via pytest 83 | pycodestyle==2.5.0 # via -r requirements-dev.in 84 | pydocstyle==5.0.2 # via -r requirements-dev.in 85 | pygments==2.6.1 # via ipython, nbconvert 86 | pylint==2.4.4 # via -r requirements-dev.in 87 | pyparsing==2.4.6 # via matplotlib, packaging 88 | pyrsistent==0.16.0 # via jsonschema 89 | pytest==5.4.1 # via -r requirements-dev.in 90 | python-dateutil==2.8.1 # via jupyter-client, matplotlib, wandb 91 | pyyaml==5.3.1 # via -r requirements-dev.in, bandit, dparse, wandb 92 | pyzmq==19.0.0 # via jupyter-client, notebook 93 | redis==3.4.1 # via redlock-py 94 | redlock-py==1.0.8 # via -r requirements-dev.in 95 | regex==2020.2.20 # via black 96 | requests==2.23.0 # via -c requirements.txt, gql, grequests, safety, wandb 97 | safety==1.8.7 # via -r requirements-dev.in 98 | scipy==1.4.1 # via -c requirements.txt, -r requirements-dev.in 99 | send2trash==1.5.0 # via notebook 100 | sentry-sdk==0.14.3 # via wandb 101 | shortuuid==1.0.1 # via wandb 102 | six==1.14.0 # via -c requirements.txt, astroid, bandit, bleach, blessings, cycler, docker-pycreds, gpustat, gql, graphql-core, itermplot, jsonschema, nltk, packaging, promise, pyrsistent, python-dateutil, stevedore, traitlets, virtualenv, wandb 103 | smmap==3.0.1 # via gitdb 104 | snowballstemmer==2.0.0 # via pydocstyle 105 | stevedore==1.32.0 # via bandit 106 | subprocess32==3.5.4 # via wandb 107 | terminado==0.8.3 # via notebook 108 | testpath==0.4.4 # via nbconvert 109 | toml==0.10.0 # via -c requirements.txt, black, dparse 110 | tornado==6.0.4 # via -r requirements-dev.in, ipykernel, jupyter-client, jupyterlab, notebook, terminado 111 | traitlets==4.3.3 # via ipykernel, ipython, jupyter-client, jupyter-core, nbconvert, nbformat, notebook 112 | typed-ast==1.4.1 # via astroid, black, mypy 113 | typing-extensions==3.7.4.1 # via mypy 114 | urllib3==1.25.8 # via -c requirements.txt, requests, sentry-sdk 115 | virtualenv-clone==0.5.4 # via pipenv 116 | virtualenv==20.0.15 # via pipenv 117 | wandb==0.8.31 # via -r requirements-dev.in 118 | watchdog==0.10.2 # via wandb 119 | wcwidth==0.1.9 # via prompt-toolkit, pytest 120 | webencodings==0.5.1 # via bleach 121 | wrapt==1.11.2 # via -c requirements.txt, astroid 122 | zipp==3.1.0 # via importlib-metadata 123 | 124 | # The following packages are considered to be unsafe in a requirements file: 125 | # pip 126 | # setuptools 127 | 
-------------------------------------------------------------------------------- /requirements.in: -------------------------------------------------------------------------------- 1 | boltons 2 | editdistance 3 | flask 4 | h5py 5 | numpy 6 | opencv-python-headless 7 | requests 8 | tensorflow==2.2.0rc2 9 | toml 10 | tqdm 11 | wrapt==1.11.* # due to pylint 12 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile 3 | # To update, run: 4 | # 5 | # pip-compile requirements.in 6 | # 7 | absl-py==0.9.0 # via tensorboard, tensorflow 8 | astunparse==1.6.3 # via tensorflow 9 | boltons==20.0.0 # via -r requirements.in 10 | cachetools==4.0.0 # via google-auth 11 | certifi==2019.11.28 # via requests 12 | chardet==3.0.4 # via requests 13 | click==7.1.1 # via flask 14 | editdistance==0.5.3 # via -r requirements.in 15 | flask==1.1.1 # via -r requirements.in 16 | gast==0.3.3 # via tensorflow 17 | google-auth-oauthlib==0.4.1 # via tensorboard 18 | google-auth==1.12.0 # via google-auth-oauthlib, tensorboard 19 | google-pasta==0.2.0 # via tensorflow 20 | grpcio==1.27.2 # via tensorboard, tensorflow 21 | h5py==2.10.0 # via -r requirements.in, tensorflow 22 | idna==2.9 # via requests 23 | itsdangerous==1.1.0 # via flask 24 | jinja2==2.11.1 # via flask 25 | keras-preprocessing==1.1.0 # via tensorflow 26 | markdown==3.2.1 # via tensorboard 27 | markupsafe==1.1.1 # via jinja2 28 | numpy==1.18.2 # via -r requirements.in, h5py, keras-preprocessing, opencv-python-headless, opt-einsum, scipy, tensorboard, tensorflow 29 | oauthlib==3.1.0 # via requests-oauthlib 30 | opencv-python-headless==4.2.0.32 # via -r requirements.in 31 | opt-einsum==3.2.0 # via tensorflow 32 | protobuf==3.11.3 # via tensorboard, tensorflow 33 | pyasn1-modules==0.2.8 # via google-auth 34 | pyasn1==0.4.8 # via pyasn1-modules, rsa 35 | requests-oauthlib==1.3.0 # via google-auth-oauthlib 36 | requests==2.23.0 # via -r requirements.in, requests-oauthlib, tensorboard 37 | rsa==4.0 # via google-auth 38 | scipy==1.4.1 # via tensorflow 39 | six==1.14.0 # via absl-py, astunparse, google-auth, google-pasta, grpcio, h5py, keras-preprocessing, protobuf, tensorboard, tensorflow 40 | tensorboard-plugin-wit==1.6.0.post2 # via tensorboard 41 | tensorboard==2.2.0 # via tensorflow 42 | tensorflow-estimator==2.2.0rc0 # via tensorflow 43 | tensorflow==2.2.0rc2 # via -r requirements.in 44 | termcolor==1.1.0 # via tensorflow 45 | toml==0.10.0 # via -r requirements.in 46 | tqdm==4.44.1 # via -r requirements.in 47 | urllib3==1.25.8 # via requests 48 | werkzeug==1.0.0 # via flask, tensorboard 49 | wheel==0.34.2 # via astunparse, tensorboard, tensorflow 50 | wrapt==1.11.2 # via -r requirements.in, tensorflow 51 | 52 | # The following packages are considered to be unsafe in a requirements file: 53 | # setuptools 54 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [pycodestyle] 2 | max-line-length = 120 3 | ignore = E203,W503 4 | 5 | [pydocstyle] 6 | convention = numpy 7 | add-ignore = D102,D103,D104,D105,D200,D205,D400 8 | 9 | [mypy] 10 | ignore_missing_imports = True 11 | 12 | [tool:pytest] 13 | addopts = --doctest-modules 14 | -------------------------------------------------------------------------------- /tasks/build_api_docker.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sed 's/tensorflow==/tensorflow-cpu==/' requirements.txt > api/requirements.txt 4 | 5 | docker build -t text_recognizer_api -f api/Dockerfile . 6 | -------------------------------------------------------------------------------- /tasks/clean.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | find . -name "__pycache__" -exec rm -r {} \; 4 | -------------------------------------------------------------------------------- /tasks/format.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | black . 4 | -------------------------------------------------------------------------------- /tasks/lint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -uo pipefail 3 | set +e 4 | 5 | FAILURE=false 6 | 7 | echo "safety" 8 | safety check -r requirements.txt -r requirements-dev.txt || FAILURE=true 9 | 10 | echo "pylint" 11 | pylint api text_recognizer training || FAILURE=true 12 | 13 | echo "pycodestyle" 14 | pycodestyle api text_recognizer training || FAILURE=true 15 | 16 | echo "pydocstyle" 17 | pydocstyle api text_recognizer training || FAILURE=true 18 | 19 | echo "mypy" 20 | mypy api text_recognizer training || FAILURE=true 21 | 22 | echo "bandit" 23 | bandit -ll -r {api,text_recognizer,training} || FAILURE=true 24 | 25 | echo "shellcheck" 26 | shellcheck tasks/*.sh || FAILURE=true 27 | 28 | if [ "$FAILURE" = true ]; then 29 | echo "Linting failed" 30 | exit 1 31 | fi 32 | echo "Linting passed" 33 | exit 0 34 | -------------------------------------------------------------------------------- /tasks/prepare_sample_experiments.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python training/prepare_experiments.py training/experiments/sample.json 3 | -------------------------------------------------------------------------------- /tasks/run_api_docker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | docker run -p 8000:8000 --name api -it --rm text_recognizer_api 3 | -------------------------------------------------------------------------------- /tasks/sync_requirements.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | pip-sync requirements-dev.txt requirements.txt 4 | -------------------------------------------------------------------------------- /tasks/test_api.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | pytest -s api 3 | -------------------------------------------------------------------------------- /tasks/test_functionality.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | pytest -s text_recognizer 3 | -------------------------------------------------------------------------------- /tasks/test_validation.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | pytest -s evaluation/evaluate* 3 | -------------------------------------------------------------------------------- /tasks/train_character_predictor.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python training/run_experiment.py --save '{"dataset": "EmnistDataset", "model": 
"CharacterModel", "network": "mlp", "train_args": {"batch_size": 256}}' 3 | -------------------------------------------------------------------------------- /tasks/train_line_detector.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python training/run_experiment.py --gpu=1 --save '{"dataset": "IamParagraphsDataset", "model": "LineDetectorModel", "network": "fcn", "train_args": {"batch_size": 16, "epochs": 32}}' 3 | -------------------------------------------------------------------------------- /tasks/train_lstm_line_predictor.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python training/run_experiment.py --save '{"dataset": "EmnistLinesDataset", "model": "LineModelCtc", "network": "line_lstm_ctc"}' 3 | -------------------------------------------------------------------------------- /tasks/train_lstm_line_predictor_on_iam.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python training/run_experiment.py --save '{"dataset": "IamLinesDataset", "model": "LineModelCtc", "network": "line_lstm_ctc"}' 3 | -------------------------------------------------------------------------------- /tasks/update_fsdl_paragraphs_metadata.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python training/update_metadata.py data/raw/fsdl_handwriting/metadata.toml 4 | -------------------------------------------------------------------------------- /tasks/update_requirements.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | pip-compile -v requirements.in && pip-compile -v requirements-dev.in 4 | -------------------------------------------------------------------------------- /text_recognizer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/the-full-stack/fsdl-text-recognizer/a99a3d3f0594dfceb249a56e8362337f9e12897e/text_recognizer/__init__.py -------------------------------------------------------------------------------- /text_recognizer/character_predictor.py: -------------------------------------------------------------------------------- 1 | """CharacterPredictor class""" 2 | from typing import Tuple, Union 3 | 4 | import numpy as np 5 | 6 | from text_recognizer.models import CharacterModel 7 | import text_recognizer.util as util 8 | 9 | 10 | class CharacterPredictor: 11 | """Given an image of a single handwritten character, recognizes it.""" 12 | 13 | def __init__(self): 14 | self.model = CharacterModel() 15 | self.model.load_weights() 16 | 17 | def predict(self, image_or_filename: Union[np.ndarray, str]) -> Tuple[str, float]: 18 | """Predict on a single image.""" 19 | if isinstance(image_or_filename, str): 20 | image = util.read_image(image_or_filename, grayscale=True) 21 | else: 22 | image = image_or_filename 23 | return self.model.predict_on_image(image) 24 | 25 | def evaluate(self, dataset): 26 | """Evaluate on a dataset.""" 27 | return self.model.evaluate(dataset.x_test, dataset.y_test) 28 | -------------------------------------------------------------------------------- /text_recognizer/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | """Dataset modules.""" 2 | from .emnist_dataset import EmnistDataset 3 | 4 | # Hide lines below until Lab 2 5 | from .emnist_lines_dataset import 
EmnistLinesDataset 6 | 7 | # Hide lines above until Lab 2 8 | # Hide lines below until Lab 4 9 | from .iam_lines_dataset import IamLinesDataset 10 | 11 | # Hide lines above until Lab 4 12 | # Hide lines below until Lab 5 13 | from .iam_dataset import IamDataset 14 | from .iam_paragraphs_dataset import IamParagraphsDataset 15 | 16 | # Hide lines above until Lab 5 17 | # Hide lines below until Lab 6 18 | from .fsdl_handwriting_dataset import FsdlHandwritingDataset 19 | 20 | # Hide lines above until Lab 6 21 | -------------------------------------------------------------------------------- /text_recognizer/datasets/dataset.py: -------------------------------------------------------------------------------- 1 | """Dataset class to be extended by dataset-specific classes.""" 2 | from pathlib import Path 3 | import argparse 4 | import os 5 | 6 | from text_recognizer import util 7 | 8 | 9 | class Dataset: 10 | """Simple abstract class for datasets.""" 11 | 12 | @classmethod 13 | def data_dirname(cls): 14 | return Path(__file__).resolve().parents[2] / "data" 15 | 16 | def load_or_generate_data(self): 17 | pass 18 | 19 | 20 | def _download_raw_dataset(metadata): 21 | if os.path.exists(metadata["filename"]): 22 | return 23 | print(f"Downloading raw dataset from {metadata['url']}...") 24 | util.download_url(metadata["url"], metadata["filename"]) 25 | print("Computing SHA-256...") 26 | sha256 = util.compute_sha256(metadata["filename"]) 27 | if sha256 != metadata["sha256"]: 28 | raise ValueError("Downloaded data file SHA-256 does not match that listed in metadata document.") 29 | 30 | 31 | def _parse_args(): 32 | parser = argparse.ArgumentParser() 33 | parser.add_argument( 34 | "--subsample_fraction", type=float, default=None, help="If given, is used as the fraction of data to expose.", 35 | ) 36 | return parser.parse_args() 37 | -------------------------------------------------------------------------------- /text_recognizer/datasets/dataset_sequence.py: -------------------------------------------------------------------------------- 1 | """DatasetSequence class.""" 2 | import numpy as np 3 | from tensorflow.keras.utils import Sequence 4 | 5 | 6 | def _shuffle(x, y): 7 | """Shuffle x and y maintaining their association.""" 8 | shuffled_indices = np.random.permutation(x.shape[0]) 9 | return x[shuffled_indices], y[shuffled_indices] 10 | 11 | 12 | class DatasetSequence(Sequence): 13 | """ 14 | Minimal implementation of https://keras.io/utils/#sequence. 
15 | """ 16 | 17 | def __init__(self, x, y, batch_size=32, augment_fn=None, format_fn=None): 18 | self.x = x 19 | self.y = y 20 | self.batch_size = batch_size 21 | self.augment_fn = augment_fn 22 | self.format_fn = format_fn 23 | 24 | def __len__(self): 25 | """Return length of the dataset.""" 26 | return int(np.ceil(len(self.x) / float(self.batch_size))) 27 | 28 | def __getitem__(self, idx): 29 | """Return a single batch.""" 30 | # idx = 0 # If you want to intentionally overfit to just one batch 31 | begin = idx * self.batch_size 32 | end = (idx + 1) * self.batch_size 33 | 34 | # batch_x = np.take(self.x, range(begin, end), axis=0, mode='clip') 35 | # batch_y = np.take(self.y, range(begin, end), axis=0, mode='clip') 36 | 37 | batch_x = self.x[begin:end] 38 | batch_y = self.y[begin:end] 39 | 40 | if batch_x.dtype == np.uint8: 41 | batch_x = (batch_x / 255).astype(np.float32) 42 | 43 | if self.augment_fn: 44 | batch_x, batch_y = self.augment_fn(batch_x, batch_y) 45 | 46 | if self.format_fn: 47 | batch_x, batch_y = self.format_fn(batch_x, batch_y) 48 | 49 | return batch_x, batch_y 50 | 51 | def on_epoch_end(self) -> None: 52 | """Shuffle data.""" 53 | self.x, self.y = _shuffle(self.x, self.y) 54 | -------------------------------------------------------------------------------- /text_recognizer/datasets/emnist_essentials.json: -------------------------------------------------------------------------------- 1 | {"mapping": [[0, "0"], [1, "1"], [2, "2"], [3, "3"], [4, "4"], [5, "5"], [6, "6"], [7, "7"], [8, "8"], [9, "9"], [10, "A"], [11, "B"], [12, "C"], [13, "D"], [14, "E"], [15, "F"], [16, "G"], [17, "H"], [18, "I"], [19, "J"], [20, "K"], [21, "L"], [22, "M"], [23, "N"], [24, "O"], [25, "P"], [26, "Q"], [27, "R"], [28, "S"], [29, "T"], [30, "U"], [31, "V"], [32, "W"], [33, "X"], [34, "Y"], [35, "Z"], [36, "a"], [37, "b"], [38, "c"], [39, "d"], [40, "e"], [41, "f"], [42, "g"], [43, "h"], [44, "i"], [45, "j"], [46, "k"], [47, "l"], [48, "m"], [49, "n"], [50, "o"], [51, "p"], [52, "q"], [53, "r"], [54, "s"], [55, "t"], [56, "u"], [57, "v"], [58, "w"], [59, "x"], [60, "y"], [61, "z"]], "input_shape": [28, 28]} -------------------------------------------------------------------------------- /text_recognizer/datasets/emnist_lines_dataset.py: -------------------------------------------------------------------------------- 1 | """Emnist Lines dataset: synthetic handwriting lines dataset made from EMNIST characters.""" 2 | from collections import defaultdict 3 | from pathlib import Path 4 | 5 | import h5py 6 | import numpy as np 7 | from tensorflow.keras.utils import to_categorical 8 | 9 | from text_recognizer.datasets.dataset import Dataset 10 | from text_recognizer.datasets.emnist_dataset import EmnistDataset 11 | 12 | 13 | DATA_DIRNAME = Dataset.data_dirname() / "processed" / "emnist_lines" 14 | ESSENTIALS_FILENAME = Path(__file__).parents[0].resolve() / "emnist_lines_essentials.json" 15 | 16 | 17 | class EmnistLinesDataset(Dataset): 18 | """ 19 | EmnistLinesDataset class. 20 | 21 | Parameters 22 | ---------- 23 | max_length 24 | Max line length in characters. 25 | max_overlap 26 | Max overlap between characters in a line. 27 | num_train 28 | Number of training examples to generate. 29 | num_test 30 | Number of test examples to generate. 
31 | """ 32 | 33 | def __init__( 34 | self, 35 | max_length: int = 34, 36 | min_overlap: float = 0, 37 | max_overlap: float = 0.33, 38 | num_train: int = 10000, 39 | num_test: int = 1000, 40 | ): 41 | self.emnist = EmnistDataset() 42 | self.mapping = self.emnist.mapping 43 | self.max_length = max_length 44 | self.min_overlap = min_overlap 45 | self.max_overlap = max_overlap 46 | self.num_classes = len(self.mapping) 47 | self.input_shape = ( 48 | self.emnist.input_shape[0], 49 | self.emnist.input_shape[1] * self.max_length, 50 | ) 51 | self.output_shape = (self.max_length, self.num_classes) 52 | self.num_train = num_train 53 | self.num_test = num_test 54 | self.x_train = None 55 | self.y_train = None 56 | self.x_test = None 57 | self.y_test = None 58 | 59 | @property 60 | def data_filename(self): 61 | return ( 62 | DATA_DIRNAME 63 | / f"ml_{self.max_length}_o{self.min_overlap}_{self.max_overlap}_ntr{self.num_train}_nte{self.num_test}.h5" 64 | ) 65 | 66 | def load_or_generate_data(self): 67 | np.random.seed(42) 68 | 69 | if not self.data_filename.exists(): 70 | self._generate_data("train") 71 | self._generate_data("test") 72 | self._load_data() 73 | 74 | def __repr__(self): 75 | return ( 76 | "EMNIST Lines Dataset\n" # pylint: disable=no-member 77 | f"Max length: {self.max_length}\n" 78 | f"Min overlap: {self.min_overlap}\n" 79 | f"Max overlap: {self.max_overlap}\n" 80 | f"Num classes: {self.num_classes}\n" 81 | f"Input shape: {self.input_shape}\n" 82 | f"Train: {self.x_train.shape} {self.y_train.shape}\n" 83 | f"Test: {self.x_test.shape} {self.y_test.shape}\n" 84 | ) 85 | 86 | def _load_data(self): 87 | print("EmnistLinesDataset loading data from HDF5...") 88 | with h5py.File(self.data_filename, "r") as f: 89 | self.x_train = f["x_train"][:] 90 | self.y_train = f["y_train"][:] 91 | self.x_test = f["x_test"][:] 92 | self.y_test = f["y_test"][:] 93 | 94 | def _generate_data(self, split): 95 | print("EmnistLinesDataset generating data...") 96 | 97 | # pylint: disable=import-outside-toplevel 98 | from text_recognizer.datasets.sentence_generator import SentenceGenerator 99 | 100 | sentence_generator = SentenceGenerator(self.max_length) 101 | 102 | emnist = self.emnist 103 | emnist.load_or_generate_data() 104 | if split == "train": 105 | samples_by_char = get_samples_by_char(emnist.x_train, emnist.y_train_int, emnist.mapping) 106 | else: 107 | samples_by_char = get_samples_by_char(emnist.x_test, emnist.y_test_int, emnist.mapping) 108 | 109 | num = self.num_train if split == "train" else self.num_test 110 | 111 | DATA_DIRNAME.mkdir(parents=True, exist_ok=True) 112 | with h5py.File(self.data_filename, "a") as f: 113 | x, y = create_dataset_of_images( 114 | num, samples_by_char, sentence_generator, self.min_overlap, self.max_overlap 115 | ) 116 | y = convert_strings_to_categorical_labels(y, emnist.inverse_mapping) 117 | f.create_dataset(f"x_{split}", data=x, dtype="u1", compression="lzf") 118 | f.create_dataset(f"y_{split}", data=y, dtype="u1", compression="lzf") 119 | 120 | 121 | def get_samples_by_char(samples, labels, mapping): 122 | samples_by_char = defaultdict(list) 123 | for sample, label in zip(samples, labels.flatten()): 124 | samples_by_char[mapping[label]].append(sample) 125 | return samples_by_char 126 | 127 | 128 | def select_letter_samples_for_string(string, samples_by_char): 129 | zero_image = np.zeros((28, 28), np.uint8) 130 | sample_image_by_char = {} 131 | for char in string: 132 | if char in sample_image_by_char: 133 | continue 134 | samples = samples_by_char[char] 135 | sample = 
samples[np.random.choice(len(samples))] if samples else zero_image 136 | sample_image_by_char[char] = sample.reshape(28, 28) 137 | return [sample_image_by_char[char] for char in string] 138 | 139 | 140 | def construct_image_from_string( 141 | string: str, samples_by_char: dict, min_overlap: float, max_overlap: float 142 | ) -> np.ndarray: 143 | overlap = np.random.uniform(min_overlap, max_overlap) 144 | sampled_images = select_letter_samples_for_string(string, samples_by_char) 145 | N = len(sampled_images) 146 | H, W = sampled_images[0].shape 147 | next_overlap_width = W - int(overlap * W) 148 | concatenated_image = np.zeros((H, W * N), np.uint8) 149 | x = 0 150 | for image in sampled_images: 151 | concatenated_image[:, x : (x + W)] += image 152 | x += next_overlap_width 153 | return np.minimum(255, concatenated_image) 154 | 155 | 156 | def create_dataset_of_images(N, samples_by_char, sentence_generator, min_overlap, max_overlap): 157 | sample_label = sentence_generator.generate() 158 | sample_image = construct_image_from_string(sample_label, samples_by_char, 0, 0) # sample_image has 0 overlap 159 | images = np.zeros( 160 | (N, sample_image.shape[0], sample_image.shape[1]), np.uint8, # pylint: disable=unsubscriptable-object 161 | ) 162 | labels = [] 163 | for n in range(N): 164 | label = None 165 | for _ in range(10): # Try several times to generate before actually erroring 166 | try: 167 | label = sentence_generator.generate() 168 | break 169 | except Exception: # pylint: disable=broad-except 170 | pass 171 | if label is None: 172 | raise RuntimeError("Was not able to generate a valid string") 173 | images[n] = construct_image_from_string(label, samples_by_char, min_overlap, max_overlap) 174 | labels.append(label) 175 | return images, labels 176 | 177 | 178 | def convert_strings_to_categorical_labels(labels, mapping): 179 | return np.array([to_categorical([mapping[c] for c in label], num_classes=len(mapping)) for label in labels]) 180 | 181 | 182 | def main(): 183 | dataset = EmnistLinesDataset() 184 | dataset.load_or_generate_data() 185 | print(dataset) 186 | 187 | 188 | if __name__ == "__main__": 189 | main() 190 | -------------------------------------------------------------------------------- /text_recognizer/datasets/fsdl_handwriting_dataset.py: -------------------------------------------------------------------------------- 1 | """Class for loading our own FSDL Handwriting dataset, which encompasses both paragraphs and lines.""" 2 | import json 3 | 4 | import numpy as np 5 | import toml 6 | 7 | from text_recognizer import util 8 | from text_recognizer.datasets.dataset import Dataset 9 | 10 | 11 | RAW_DATA_DIRNAME = Dataset.data_dirname() / "raw" / "fsdl_handwriting" 12 | METADATA_FILENAME = RAW_DATA_DIRNAME / "metadata.toml" 13 | PAGES_DIRNAME = RAW_DATA_DIRNAME / "pages" 14 | 15 | 16 | class FsdlHandwritingDataset(Dataset): 17 | """ 18 | FSDL Handwriting dataset gathered in class. 
19 | """ 20 | 21 | def __init__(self): 22 | self.metadata = toml.load(METADATA_FILENAME) 23 | with open(RAW_DATA_DIRNAME / self.metadata["filename"]) as f: 24 | page_data = [json.loads(line) for line in f.readlines()] 25 | # NOTE: pylint bug https://github.com/PyCQA/pylint/issues/3164 26 | # pylint: disable=unnecessary-comprehension 27 | self.data_by_page_id = { 28 | id_: data for id_, data in (_extract_id_and_data(page_datum) for page_datum in page_data) 29 | } 30 | # pylint: enable=unnecessary-comprehension 31 | 32 | def load_or_generate_data(self): 33 | if len(self.page_filenames) < len(self.data_by_page_id): 34 | self._download_pages() 35 | 36 | @property 37 | def page_filenames(self): 38 | return list(PAGES_DIRNAME.glob("*.jpg")) 39 | 40 | def _download_pages(self): 41 | PAGES_DIRNAME.mkdir(exist_ok=True, parents=True) 42 | ids, urls = zip(*[(id_, data["url"]) for id_, data in self.data_by_page_id.items()]) 43 | filenames = [PAGES_DIRNAME / id_ for id_ in ids] 44 | util.download_urls(urls, filenames) 45 | 46 | @property 47 | def line_regions_by_id(self): 48 | """Return a dict from page id to a list of (x1, x2, y1, y2) coordinates of all lines in it.""" 49 | return {id_: data["regions"] for id_, data in self.data_by_page_id.items()} 50 | 51 | @property 52 | def line_strings_by_id(self): 53 | """Return a dict from name of image to a list of strings.""" 54 | return {id_: data["strings"] for id_, data in self.data_by_page_id.items()} 55 | 56 | def __repr__(self): 57 | return "FSDL Handwriting Dataset\n" f"Num pages: {len(self.data_by_page_id)}\n" 58 | 59 | 60 | def _extract_id_and_data(page_datum): 61 | """ 62 | page_datum is of the form 63 | { 64 | 'label': ['line'], 65 | 'shape': 'rectangle', 66 | 'points': [ 67 | [0.1422924901185771, 0.18948824343015216], 68 | [0.875494071146245, 0.18948824343015216], 69 | [0.875494071146245, 0.25034578146611347], 70 | [0.1422924901185771, 0.25034578146611347] 71 | ], 72 | 'notes': 'A MOVE to stop Mr.
Gaitskiell from', 73 | 'imageWidth': 1240, 74 | 'imageHeight': 1771 75 | } 76 | """ 77 | url = page_datum["content"] 78 | id_ = url.split("/")[-1] 79 | regions = [] 80 | strings = [] 81 | try: 82 | for annotation in page_datum["annotation"]: 83 | points = np.array(annotation["points"]) 84 | x1, y1 = points.min(0) 85 | x2, y2 = points.max(0) 86 | regions.append( 87 | { 88 | "x1": int(x1 * annotation["imageWidth"]), 89 | "y1": int(y1 * annotation["imageHeight"]), 90 | "x2": int(x2 * annotation["imageWidth"]), 91 | "y2": int(y2 * annotation["imageHeight"]), 92 | } 93 | ) 94 | strings.append(annotation["notes"]) 95 | except Exception: # pylint: disable=broad-except 96 | pass 97 | return id_, {"url": url, "regions": regions, "strings": strings} 98 | 99 | 100 | def main(): 101 | dataset = FsdlHandwritingDataset() 102 | dataset.load_or_generate_data() 103 | print(dataset) 104 | 105 | 106 | if __name__ == "__main__": 107 | main() 108 | -------------------------------------------------------------------------------- /text_recognizer/datasets/iam_dataset.py: -------------------------------------------------------------------------------- 1 | """Class for loading the IAM dataset, which encompasses both paragraphs and lines, with associated utilities.""" 2 | import os 3 | from typing import Dict, List 4 | import xml.etree.ElementTree as ElementTree 5 | import zipfile 6 | 7 | from boltons.cacheutils import cachedproperty 8 | import toml 9 | 10 | from text_recognizer.datasets.dataset import Dataset, _download_raw_dataset 11 | 12 | 13 | RAW_DATA_DIRNAME = Dataset.data_dirname() / "raw" / "iam" 14 | METADATA_FILENAME = RAW_DATA_DIRNAME / "metadata.toml" 15 | EXTRACTED_DATASET_DIRNAME = RAW_DATA_DIRNAME / "iamdb" 16 | 17 | DOWNSAMPLE_FACTOR = 2 # If images were downsampled, the regions must also be. 18 | LINE_REGION_PADDING = 0 # add this many pixels around the exact coordinates 19 | 20 | 21 | class IamDataset(Dataset): 22 | """ 23 | "The IAM Lines dataset, first published at the ICDAR 1999, contains forms of unconstrained handwritten text, 24 | which were scanned at a resolution of 300dpi and saved as PNG images with 256 gray levels. 25 | From http://www.fki.inf.unibe.ch/databases/iam-handwriting-database 26 | 27 | The data split we will use is 28 | IAM lines Large Writer Independent Text Line Recognition Task (lwitlrt): 9,862 text lines. 29 | The validation set has been merged into the train set. 30 | The train set has 7,101 lines from 326 writers. 31 | The test set has 1,861 lines from 128 writers. 32 | The text lines of all data sets are mutually exclusive, thus each writer has contributed to one set only. 
33 | """ 34 | 35 | def __init__(self): 36 | self.metadata = toml.load(METADATA_FILENAME) 37 | 38 | def load_or_generate_data(self): 39 | if not self.xml_filenames: 40 | self._download_iam() 41 | 42 | @property 43 | def xml_filenames(self): 44 | return list((EXTRACTED_DATASET_DIRNAME / "xml").glob("*.xml")) 45 | 46 | @property 47 | def form_filenames(self): 48 | return list((EXTRACTED_DATASET_DIRNAME / "forms").glob("*.jpg")) 49 | 50 | def _download_iam(self): 51 | curdir = os.getcwd() 52 | os.chdir(RAW_DATA_DIRNAME) 53 | _download_raw_dataset(self.metadata) 54 | _extract_raw_dataset(self.metadata) 55 | os.chdir(curdir) 56 | 57 | @property 58 | def form_filenames_by_id(self): 59 | return {filename.stem: filename for filename in self.form_filenames} 60 | 61 | @cachedproperty 62 | def line_strings_by_id(self): 63 | """Return a dict from name of IAM form to a list of line texts in it.""" 64 | return {filename.stem: _get_line_strings_from_xml_file(filename) for filename in self.xml_filenames} 65 | 66 | @cachedproperty 67 | def line_regions_by_id(self): 68 | """Return a dict from name of IAM form to a list of (x1, x2, y1, y2) coordinates of all lines in it.""" 69 | return {filename.stem: _get_line_regions_from_xml_file(filename) for filename in self.xml_filenames} 70 | 71 | def __repr__(self): 72 | """Print info about the dataset.""" 73 | return "IAM Dataset\n" f"Num forms: {len(self.xml_filenames)}\n" 74 | 75 | 76 | def _extract_raw_dataset(metadata): 77 | print("Extracting IAM data") 78 | with zipfile.ZipFile(metadata["filename"], "r") as zip_file: 79 | zip_file.extractall() 80 | 81 | 82 | def _get_line_strings_from_xml_file(filename: str) -> List[str]: 83 | """Get the text content of each line. Note that we replace &quot; with ".""" 84 | xml_root_element = ElementTree.parse(filename).getroot() # nosec 85 | xml_line_elements = xml_root_element.findall("handwritten-part/line") 86 | return [el.attrib["text"].replace("&quot;", '"') for el in xml_line_elements] 87 | 88 | 89 | def _get_line_regions_from_xml_file(filename: str) -> List[Dict[str, int]]: 90 | """Get the line region dict for each line.""" 91 | xml_root_element = ElementTree.parse(filename).getroot() # nosec 92 | xml_line_elements = xml_root_element.findall("handwritten-part/line") 93 | return [_get_line_region_from_xml_element(el) for el in xml_line_elements] 94 | 95 | 96 | def _get_line_region_from_xml_element(xml_line) -> Dict[str, int]: 97 | """ 98 | Parameters 99 | ---------- 100 | xml_line 101 | xml element that has x, y, width, and height attributes 102 | """ 103 | word_elements = xml_line.findall("word/cmp") 104 | x1s = [int(el.attrib["x"]) for el in word_elements] 105 | y1s = [int(el.attrib["y"]) for el in word_elements] 106 | x2s = [int(el.attrib["x"]) + int(el.attrib["width"]) for el in word_elements] 107 | y2s = [int(el.attrib["y"]) + int(el.attrib["height"]) for el in word_elements] 108 | return { 109 | "x1": min(x1s) // DOWNSAMPLE_FACTOR - LINE_REGION_PADDING, 110 | "y1": min(y1s) // DOWNSAMPLE_FACTOR - LINE_REGION_PADDING, 111 | "x2": max(x2s) // DOWNSAMPLE_FACTOR + LINE_REGION_PADDING, 112 | "y2": max(y2s) // DOWNSAMPLE_FACTOR + LINE_REGION_PADDING, 113 | } 114 | 115 | 116 | def main(): 117 | dataset = IamDataset() 118 | dataset.load_or_generate_data() 119 | print(dataset) 120 | 121 | 122 | if __name__ == "__main__": 123 | main() 124 | -------------------------------------------------------------------------------- /text_recognizer/datasets/iam_lines_dataset.py:
-------------------------------------------------------------------------------- 1 | """ 2 | IamLinesDataset class. 3 | 4 | We will use a processed version of this dataset, without including code that did the processing. 5 | We will look at how to generate processed data from raw IAM data in the IamParagraphsDataset. 6 | """ 7 | 8 | from boltons.cacheutils import cachedproperty 9 | import h5py 10 | from tensorflow.keras.utils import to_categorical 11 | 12 | from text_recognizer import util 13 | from text_recognizer.datasets.dataset import Dataset, _parse_args 14 | from text_recognizer.datasets.emnist_dataset import EmnistDataset 15 | 16 | 17 | PROCESSED_DATA_DIRNAME = Dataset.data_dirname() / "processed" / "iam_lines" 18 | PROCESSED_DATA_FILENAME = PROCESSED_DATA_DIRNAME / "iam_lines.h5" 19 | PROCESSED_DATA_URL = "https://s3-us-west-2.amazonaws.com/fsdl-public-assets/iam_lines.h5" 20 | 21 | 22 | class IamLinesDataset(Dataset): 23 | """ 24 | 25 | Note that we use cachedproperty because data takes time to load. 26 | """ 27 | 28 | def __init__(self, subsample_fraction: float = None): 29 | self.mapping = EmnistDataset().mapping 30 | self.inverse_mapping = {v: k for k, v in self.mapping.items()} 31 | self.num_classes = len(self.mapping) 32 | self.input_shape = (28, 952) 33 | self.output_shape = (97, self.num_classes) 34 | 35 | self.subsample_fraction = subsample_fraction 36 | self.x_train = None 37 | self.x_test = None 38 | self.y_train_int = None 39 | self.y_test_int = None 40 | 41 | def load_or_generate_data(self): 42 | """Load or generate dataset data.""" 43 | if not PROCESSED_DATA_FILENAME.exists(): 44 | PROCESSED_DATA_DIRNAME.mkdir(parents=True, exist_ok=True) 45 | print("Downloading IAM lines...") 46 | util.download_url(PROCESSED_DATA_URL, PROCESSED_DATA_FILENAME) 47 | with h5py.File(PROCESSED_DATA_FILENAME, "r") as f: 48 | self.x_train = f["x_train"][:] 49 | self.y_train_int = f["y_train"][:] 50 | self.x_test = f["x_test"][:] 51 | self.y_test_int = f["y_test"][:] 52 | self._subsample() 53 | 54 | def _subsample(self): 55 | """Only this fraction of data will be loaded.""" 56 | if self.subsample_fraction is None: 57 | return 58 | num_train = int(self.x_train.shape[0] * self.subsample_fraction) 59 | num_test = int(self.x_test.shape[0] * self.subsample_fraction) 60 | self.x_train = self.x_train[:num_train] 61 | self.y_train_int = self.y_train_int[:num_train] 62 | self.x_test = self.x_test[:num_test] 63 | self.y_test_int = self.y_test_int[:num_test] 64 | 65 | @cachedproperty 66 | def y_train(self): 67 | """Return y_train""" 68 | return to_categorical(self.y_train_int, self.num_classes) 69 | 70 | @cachedproperty 71 | def y_test(self): 72 | """Return y_test""" 73 | return to_categorical(self.y_test_int, self.num_classes) 74 | 75 | def __repr__(self): 76 | """Print info about the dataset.""" 77 | return ( 78 | "IAM Lines Dataset\n" # pylint: disable=no-member 79 | f"Num classes: {self.num_classes}\n" 80 | f"Mapping: {self.mapping}\n" 81 | f"Train: {self.x_train.shape} {self.y_train.shape}\n" 82 | f"Test: {self.x_test.shape} {self.y_test.shape}\n" 83 | ) 84 | 85 | 86 | def main(): 87 | """Load dataset and print info.""" 88 | args = _parse_args() 89 | dataset = IamLinesDataset(subsample_fraction=args.subsample_fraction) 90 | dataset.load_or_generate_data() 91 | print(dataset) 92 | 93 | 94 | if __name__ == "__main__": 95 | main() 96 | -------------------------------------------------------------------------------- /text_recognizer/datasets/sentence_generator.py: 
-------------------------------------------------------------------------------- 1 | """SentenceGenerator class and supporting functions.""" 2 | import itertools 3 | import re 4 | import string 5 | from typing import Optional 6 | 7 | import nltk 8 | import numpy as np 9 | 10 | from text_recognizer.datasets.dataset import Dataset 11 | 12 | NLTK_DATA_DIRNAME = Dataset.data_dirname() / "raw" / "nltk" 13 | 14 | 15 | class SentenceGenerator: 16 | """Generate text sentences using the Brown corpus.""" 17 | 18 | def __init__(self, max_length: Optional[int] = None): 19 | self.text = brown_text() 20 | self.word_start_inds = [0] + [_.start(0) + 1 for _ in re.finditer(" ", self.text)] 21 | self.max_length = max_length 22 | 23 | def generate(self, max_length: Optional[int] = None) -> str: 24 | """ 25 | Sample a string from text of the Brown corpus of length at least one word and at most max_length, 26 | padding it to max_length with the '_' character. 27 | """ 28 | if max_length is None: 29 | max_length = self.max_length 30 | if max_length is None: 31 | raise ValueError("Must provide max_length to this method or when making this object.") 32 | 33 | ind = np.random.randint(0, len(self.word_start_inds) - 1) 34 | start_ind = self.word_start_inds[ind] 35 | end_ind_candidates = [] 36 | for ind in range(ind + 1, len(self.word_start_inds)): 37 | if self.word_start_inds[ind] - start_ind > max_length: 38 | break 39 | end_ind_candidates.append(self.word_start_inds[ind]) 40 | end_ind = np.random.choice(end_ind_candidates) 41 | sampled_text = self.text[start_ind:end_ind].strip() 42 | padding = "_" * (max_length - len(sampled_text)) 43 | return sampled_text + padding 44 | 45 | 46 | def brown_text(): 47 | """Return a single string with the Brown corpus with all punctuation stripped.""" 48 | sents = load_nltk_brown_corpus() 49 | text = " ".join(itertools.chain.from_iterable(sents)) 50 | text = text.translate({ord(c): None for c in string.punctuation}) 51 | text = re.sub(" +", " ", text) 52 | return text 53 | 54 | 55 | def load_nltk_brown_corpus(): 56 | """Load the Brown corpus using the NLTK library.""" 57 | nltk.data.path.append(NLTK_DATA_DIRNAME) 58 | try: 59 | nltk.corpus.brown.sents() 60 | except LookupError: 61 | NLTK_DATA_DIRNAME.mkdir(parents=True, exist_ok=True) 62 | nltk.download("brown", download_dir=NLTK_DATA_DIRNAME) 63 | return nltk.corpus.brown.sents() 64 | -------------------------------------------------------------------------------- /text_recognizer/line_predictor.py: -------------------------------------------------------------------------------- 1 | """LinePredictor class""" 2 | from typing import Tuple, Union 3 | 4 | import numpy as np 5 | 6 | from text_recognizer.models import LineModelCtc 7 | from text_recognizer.datasets import EmnistLinesDataset 8 | import text_recognizer.util as util 9 | 10 | 11 | class LinePredictor: 12 | """Given an image of a line of handwritten text, recognizes text contents.""" 13 | 14 | def __init__(self, dataset_cls=EmnistLinesDataset): 15 | self.model = LineModelCtc(dataset_cls=dataset_cls) 16 | self.model.load_weights() 17 | 18 | def predict(self, image_or_filename: Union[np.ndarray, str]) -> Tuple[str, float]: 19 | """Predict on a single image.""" 20 | if isinstance(image_or_filename, str): 21 | image = util.read_image(image_or_filename, grayscale=True) 22 | else: 23 | image = image_or_filename 24 | return self.model.predict_on_image(image) 25 | 26 | def evaluate(self, dataset): 27 | """Evaluate on a dataset.""" 28 | return self.model.evaluate(dataset.x_test, 
dataset.y_test) 29 | -------------------------------------------------------------------------------- /text_recognizer/models/__init__.py: -------------------------------------------------------------------------------- 1 | """Model modules.""" 2 | from .character_model import CharacterModel 3 | 4 | # Hide lines below until Lab 2 5 | from .line_model import LineModel 6 | 7 | # Hide lines above until Lab 2 8 | 9 | # Hide lines below until Lab 3 10 | from .line_model_ctc import LineModelCtc 11 | 12 | # Hide lines above until Lab 3 13 | 14 | # Hide lines below until Lab 5 15 | from .line_detector_model import LineDetectorModel 16 | 17 | # Hide lines above until Lab 5 18 | -------------------------------------------------------------------------------- /text_recognizer/models/base.py: -------------------------------------------------------------------------------- 1 | """Model class, to be extended by specific types of models.""" 2 | # pylint: disable=missing-function-docstring 3 | from pathlib import Path 4 | from typing import Callable, Dict, Optional 5 | 6 | from tensorflow.keras.models import Model as KerasModel 7 | from tensorflow.keras.optimizers import RMSprop 8 | import numpy as np 9 | 10 | from text_recognizer.datasets.dataset_sequence import DatasetSequence 11 | 12 | 13 | DIRNAME = Path(__file__).parents[1].resolve() / "weights" 14 | 15 | 16 | class Model: 17 | """Base class, to be subclassed by predictors for specific type of data.""" 18 | 19 | def __init__( 20 | self, 21 | dataset_cls: type, 22 | network_fn: Callable[..., KerasModel], 23 | dataset_args: Dict = None, 24 | network_args: Dict = None, 25 | ): 26 | self.name = f"{self.__class__.__name__}_{dataset_cls.__name__}_{network_fn.__name__}" 27 | 28 | if dataset_args is None: 29 | dataset_args = {} 30 | self.data = dataset_cls(**dataset_args) 31 | 32 | if network_args is None: 33 | network_args = {} 34 | self.network = network_fn(self.data.input_shape, self.data.output_shape, **network_args) 35 | self.network.summary() 36 | 37 | self.batch_augment_fn: Optional[Callable] = None 38 | self.batch_format_fn: Optional[Callable] = None 39 | 40 | @property 41 | def image_shape(self): 42 | return self.data.input_shape 43 | 44 | @property 45 | def weights_filename(self) -> str: 46 | DIRNAME.mkdir(parents=True, exist_ok=True) 47 | return str(DIRNAME / f"{self.name}_weights.h5") 48 | 49 | def fit( 50 | self, dataset, batch_size: int = 32, epochs: int = 10, augment_val: bool = True, callbacks: list = None, 51 | ): 52 | if callbacks is None: 53 | callbacks = [] 54 | 55 | self.network.compile(loss=self.loss(), optimizer=self.optimizer(), metrics=self.metrics()) 56 | 57 | train_sequence = DatasetSequence( 58 | dataset.x_train, 59 | dataset.y_train, 60 | batch_size, 61 | augment_fn=self.batch_augment_fn, 62 | format_fn=self.batch_format_fn, 63 | ) 64 | test_sequence = DatasetSequence( 65 | dataset.x_test, 66 | dataset.y_test, 67 | batch_size, 68 | augment_fn=self.batch_augment_fn if augment_val else None, 69 | format_fn=self.batch_format_fn, 70 | ) 71 | 72 | self.network.fit( 73 | train_sequence, 74 | epochs=epochs, 75 | callbacks=callbacks, 76 | validation_data=test_sequence, 77 | use_multiprocessing=False, 78 | workers=1, 79 | shuffle=True, 80 | ) 81 | 82 | def evaluate(self, x: np.ndarray, y: np.ndarray, batch_size: int = 16, _verbose: bool = False): 83 | # pylint: disable=unused-argument 84 | sequence = DatasetSequence(x, y, batch_size=batch_size) # Use a small batch size to use less memory 85 | preds = self.network.predict(sequence) 86 | 
return np.mean(np.argmax(preds, -1) == np.argmax(y, -1)) 87 | 88 | def loss(self): # pylint: disable=no-self-use 89 | return "categorical_crossentropy" 90 | 91 | def optimizer(self): # pylint: disable=no-self-use 92 | return RMSprop() 93 | 94 | def metrics(self): # pylint: disable=no-self-use 95 | return ["accuracy"] 96 | 97 | def load_weights(self): 98 | self.network.load_weights(self.weights_filename) 99 | 100 | def save_weights(self): 101 | self.network.save_weights(self.weights_filename) 102 | -------------------------------------------------------------------------------- /text_recognizer/models/character_model.py: -------------------------------------------------------------------------------- 1 | """CharacterModel class.""" 2 | from typing import Callable, Dict, Tuple 3 | 4 | import numpy as np 5 | 6 | from text_recognizer.models.base import Model 7 | from text_recognizer.datasets.emnist_dataset import EmnistDataset 8 | from text_recognizer.networks.mlp import mlp 9 | 10 | 11 | class CharacterModel(Model): 12 | """CharacterModel works on datasets providing images, with one-hot labels.""" 13 | 14 | def __init__( 15 | self, 16 | dataset_cls: type = EmnistDataset, 17 | network_fn: Callable = mlp, 18 | dataset_args: Dict = None, 19 | network_args: Dict = None, 20 | ): 21 | super().__init__(dataset_cls, network_fn, dataset_args, network_args) 22 | 23 | def predict_on_image(self, image: np.ndarray) -> Tuple[str, float]: 24 | if image.dtype == np.uint8: 25 | image = (image / 255).astype(np.float32) 26 | # NOTE: integer to character mapping dictionary is self.data.mapping[integer] 27 | # Your code below (Lab 1) 28 | pred_raw = self.network.predict(np.expand_dims(image, 0), batch_size=1).flatten() 29 | ind = np.argmax(pred_raw) 30 | confidence_of_prediction = pred_raw[ind] 31 | predicted_character = self.data.mapping[ind] 32 | # Your code above (Lab 1) 33 | return predicted_character, confidence_of_prediction 34 | -------------------------------------------------------------------------------- /text_recognizer/models/line_detector_model.py: -------------------------------------------------------------------------------- 1 | """Define LineDetectorModel class.""" 2 | from typing import Callable, Dict, Tuple 3 | 4 | import numpy as np 5 | 6 | from tensorflow.keras.optimizers import Adam 7 | from tensorflow.keras.preprocessing.image import ImageDataGenerator 8 | 9 | from text_recognizer.datasets.iam_paragraphs_dataset import IamParagraphsDataset 10 | from text_recognizer.models.base import Model 11 | from text_recognizer.networks import fcn 12 | 13 | 14 | _DATA_AUGMENTATION_PARAMS = { 15 | "width_shift_range": 0.06, 16 | "height_shift_range": 0.1, 17 | "horizontal_flip": True, 18 | "zoom_range": 0.1, 19 | "fill_mode": "constant", 20 | "cval": 0, 21 | "shear_range": 3, 22 | } 23 | 24 | 25 | class LineDetectorModel(Model): 26 | """Model to detect lines of text in an image.""" 27 | 28 | def __init__( 29 | self, 30 | dataset_cls: type = IamParagraphsDataset, 31 | network_fn: Callable = fcn, 32 | dataset_args: Dict = None, 33 | network_args: Dict = None, 34 | ): 35 | """Define the default dataset and network values for this model.""" 36 | super().__init__(dataset_cls, network_fn, dataset_args, network_args) 37 | 38 | self.data_augmentor = ImageDataGenerator(**_DATA_AUGMENTATION_PARAMS) 39 | self.batch_augment_fn = self.augment_batch 40 | 41 | def loss(self): # pylint: disable=no-self-use 42 | return "categorical_crossentropy" 43 | 44 | def optimizer(self): # pylint: disable=no-self-use 45 | return 
Adam(0.001 / 2) 46 | 47 | def metrics(self): # pylint: disable=no-self-use 48 | return None 49 | 50 | def augment_batch(self, x_batch: np.ndarray, y_batch: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: 51 | """Perform different random transformations on the whole batch of x, y samples.""" 52 | x_augment, y_augment = zip(*[self._augment_sample(x, y) for x, y in zip(x_batch, y_batch)]) 53 | return np.stack(x_augment, axis=0), np.stack(y_augment, axis=0) 54 | 55 | def _augment_sample(self, x: np.ndarray, y: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: 56 | """ 57 | Perform the same random image transformation on both x and y. 58 | x is a 2d image of shape self.image_shape, but self.data_augmentor needs the channel image too. 59 | """ 60 | x_3d = np.expand_dims(x, axis=-1) 61 | transform_parameters = self.data_augmentor.get_random_transform(x_3d.shape) 62 | x_augment = self.data_augmentor.apply_transform(x_3d, transform_parameters) 63 | y_augment = self.data_augmentor.apply_transform(y, transform_parameters) 64 | return np.squeeze(x_augment, axis=-1), y_augment 65 | 66 | def predict_on_image(self, x: np.ndarray) -> np.ndarray: 67 | """Predict on a single input.""" 68 | return self.network.predict(np.expand_dims(x, axis=0))[0] 69 | 70 | def evaluate(self, x: np.ndarray, y: np.ndarray, batch_size: int = 32, verbose: bool = False) -> float: 71 | """Evaluate the model.""" 72 | # pylint: disable=unused-argument 73 | return self.network.evaluate(x, y, batch_size=batch_size) 74 | -------------------------------------------------------------------------------- /text_recognizer/models/line_model.py: -------------------------------------------------------------------------------- 1 | """Define LineModel class.""" 2 | from typing import Callable, Dict, Tuple 3 | 4 | import editdistance 5 | import numpy as np 6 | 7 | from text_recognizer.datasets.emnist_lines_dataset import EmnistLinesDataset 8 | from text_recognizer.datasets.dataset_sequence import DatasetSequence 9 | from text_recognizer.models.base import Model 10 | from text_recognizer.networks import line_cnn_all_conv 11 | 12 | 13 | class LineModel(Model): 14 | """Model for predicting a string from an image of a handwritten line of text.""" 15 | 16 | def __init__( 17 | self, 18 | dataset_cls: type = EmnistLinesDataset, 19 | network_fn: Callable = line_cnn_all_conv, 20 | dataset_args: Dict = None, 21 | network_args: Dict = None, 22 | ): 23 | """Define the default dataset and network values for this model.""" 24 | super().__init__(dataset_cls, network_fn, dataset_args, network_args) 25 | 26 | def evaluate(self, x, y, batch_size=16, verbose=True): 27 | """Evaluate model.""" 28 | sequence = DatasetSequence(x, y) 29 | preds_raw = self.network.predict(sequence) 30 | trues = np.argmax(y, -1) 31 | preds = np.argmax(preds_raw, -1) 32 | pred_strings = ["".join(self.data.mapping.get(label, "") for label in pred).strip(" |_") for pred in preds] 33 | true_strings = ["".join(self.data.mapping.get(label, "") for label in true).strip(" |_") for true in trues] 34 | char_accuracies = [ 35 | 1 - editdistance.eval(true_string, pred_string) / len(true_string) 36 | for pred_string, true_string in zip(pred_strings, true_strings) 37 | ] 38 | if verbose: 39 | sorted_ind = np.argsort(char_accuracies) 40 | print("\nLeast accurate predictions:") 41 | for ind in sorted_ind[:5]: 42 | print(f"True: {true_strings[ind]}") 43 | print(f"Pred: {pred_strings[ind]}") 44 | print("\nMost accurate predictions:") 45 | for ind in sorted_ind[-5:]: 46 | print(f"True: {true_strings[ind]}") 47 | 
print(f"Pred: {pred_strings[ind]}") 48 | print("\nRandom predictions:") 49 | random_ind = np.random.randint(0, len(char_accuracies), 5) 50 | for ind in random_ind: # pylint: disable=not-an-iterable 51 | print(f"True: {true_strings[ind]}") 52 | print(f"Pred: {pred_strings[ind]}") 53 | mean_accuracy = np.mean(char_accuracies) 54 | return mean_accuracy 55 | 56 | def predict_on_image(self, image: np.ndarray) -> Tuple[str, float]: 57 | """Predict on a single input.""" 58 | if image.dtype == np.uint8: 59 | image = (image / 255).astype(np.float32) 60 | pred_raw = self.network.predict(np.expand_dims(image, 0), batch_size=1).squeeze() 61 | pred = "".join(self.data.mapping[label] for label in np.argmax(pred_raw, axis=-1).flatten()).strip() 62 | conf = np.min(np.max(pred_raw, axis=-1)) # The least confident of the predictions. 63 | return pred, conf 64 | -------------------------------------------------------------------------------- /text_recognizer/models/line_model_ctc.py: -------------------------------------------------------------------------------- 1 | """Define LineModelCtc class and associated functions.""" 2 | from typing import Callable, Dict, Tuple 3 | 4 | import editdistance 5 | import numpy as np 6 | import tensorflow.keras.backend as K 7 | from tensorflow.keras.models import Model as KerasModel 8 | 9 | from text_recognizer.datasets.dataset_sequence import DatasetSequence 10 | from text_recognizer.datasets import EmnistLinesDataset 11 | from text_recognizer.models.base import Model 12 | from text_recognizer.networks.line_lstm_ctc import line_lstm_ctc 13 | 14 | 15 | class LineModelCtc(Model): 16 | """Model for recognizing handwritten text in an image of a line, using CTC loss/decoding.""" 17 | 18 | def __init__( 19 | self, 20 | dataset_cls: type = EmnistLinesDataset, 21 | network_fn: Callable = line_lstm_ctc, 22 | dataset_args: Dict = None, 23 | network_args: Dict = None, 24 | ): 25 | """Define the default dataset and network values for this model.""" 26 | default_dataset_args: dict = {} 27 | if dataset_args is None: 28 | dataset_args = {} 29 | dataset_args = {**default_dataset_args, **dataset_args} 30 | 31 | default_network_args = {"window_width": 12, "window_stride": 5} 32 | if network_args is None: 33 | network_args = {} 34 | network_args = {**default_network_args, **network_args} 35 | super().__init__(dataset_cls, network_fn, dataset_args, network_args) 36 | self.batch_format_fn = format_batch_ctc 37 | 38 | def loss(self): 39 | """Simply pass through the loss that we computed in the network.""" 40 | return {"ctc_loss": lambda y_true, y_pred: y_pred} 41 | 42 | def metrics(self): 43 | """ 44 | Compute no metrics. 45 | 46 | TODO: We could probably pass in a custom character accuracy metric for 'ctc_decoded' output here. 47 | """ 48 | return None 49 | 50 | def evaluate(self, x, y, batch_size: int = 16, verbose: bool = True) -> float: 51 | """Evaluate model.""" 52 | test_sequence = DatasetSequence(x, y, batch_size, format_fn=self.batch_format_fn) 53 | 54 | # We can use the `ctc_decoded` layer that is part of our model here. 
55 | decoding_model = KerasModel(inputs=self.network.input, outputs=self.network.get_layer("ctc_decoded").output) 56 | preds = decoding_model.predict(test_sequence) 57 | 58 | trues = np.argmax(y, -1) 59 | pred_strings = ["".join(self.data.mapping.get(label, "") for label in pred).strip(" |_") for pred in preds] 60 | true_strings = ["".join(self.data.mapping.get(label, "") for label in true).strip(" |_") for true in trues] 61 | 62 | char_accuracies = [ 63 | 1 - editdistance.eval(true_string, pred_string) / len(true_string) 64 | for pred_string, true_string in zip(pred_strings, true_strings) 65 | ] 66 | if verbose: 67 | sorted_ind = np.argsort(char_accuracies) 68 | print("\nLeast accurate predictions:") 69 | for ind in sorted_ind[:5]: 70 | print(f"True: {true_strings[ind]}") 71 | print(f"Pred: {pred_strings[ind]}") 72 | print("\nMost accurate predictions:") 73 | for ind in sorted_ind[-5:]: 74 | print(f"True: {true_strings[ind]}") 75 | print(f"Pred: {pred_strings[ind]}") 76 | print("\nRandom predictions:") 77 | random_ind = np.random.randint(0, len(char_accuracies), 5) 78 | for ind in random_ind: # pylint: disable=not-an-iterable 79 | print(f"True: {true_strings[ind]}") 80 | print(f"Pred: {pred_strings[ind]}") 81 | mean_accuracy = np.mean(char_accuracies) 82 | return mean_accuracy 83 | 84 | def predict_on_image(self, image: np.ndarray) -> Tuple[str, float]: 85 | """Predict on a single input.""" 86 | softmax_output_fn = KerasModel( 87 | inputs=[self.network.get_layer("image").input], outputs=[self.network.get_layer("softmax_output").output], 88 | ) 89 | if image.dtype == np.uint8: 90 | image = (image / 255).astype(np.float32) 91 | 92 | # Get the prediction and confidence using softmax_output_fn, passing the right input into it. 93 | input_image = np.expand_dims(image, 0) 94 | softmax_output = softmax_output_fn.predict(input_image) 95 | 96 | input_length = [softmax_output.shape[1]] 97 | decoded, log_prob = K.ctc_decode(softmax_output, input_length, greedy=True) 98 | 99 | pred_raw = K.eval(decoded[0])[0] 100 | pred = "".join(self.data.mapping[label] for label in pred_raw).strip() 101 | 102 | neg_sum_logit = K.eval(log_prob)[0][0] 103 | conf = np.exp(-neg_sum_logit) 104 | # Your code above (Lab 3) 105 | 106 | return pred, conf 107 | 108 | 109 | def format_batch_ctc(batch_x, batch_y): 110 | """ 111 | Because CTC loss needs to be computed inside of the network, we include information about outputs in the inputs. 
112 | """ 113 | batch_size = batch_y.shape[0] 114 | y_true = np.argmax(batch_y, axis=-1) 115 | 116 | label_lengths = [] 117 | for ind in range(batch_size): 118 | # Find all of the indices in the label that are blank 119 | empty_at = np.where(batch_y[ind, :, -1] == 1)[0] 120 | # Length of the label is the pos of the first blank, or the max length 121 | if empty_at.shape[0] > 0: 122 | label_lengths.append(empty_at[0]) 123 | else: 124 | label_lengths.append(batch_y.shape[1]) 125 | 126 | batch_inputs = { 127 | "image": batch_x, 128 | "y_true": y_true, 129 | "input_length": np.ones((batch_size, 1)), # dummy, will be set to num_windows in network 130 | "label_length": np.array(label_lengths), 131 | } 132 | batch_outputs = {"ctc_loss": np.zeros(batch_size), "ctc_decoded": y_true} # dummy 133 | return batch_inputs, batch_outputs 134 | -------------------------------------------------------------------------------- /text_recognizer/networks/__init__.py: -------------------------------------------------------------------------------- 1 | """Neural network code modules.""" 2 | from .mlp import mlp 3 | from .lenet import lenet 4 | 5 | # Hide lines below until Lab 2 6 | from .line_cnn_all_conv import line_cnn_all_conv 7 | 8 | # Hide lines above until Lab 2 9 | 10 | # Hide lines below until Lab 3 11 | from .line_lstm_ctc import line_lstm_ctc 12 | 13 | # Hide lines above until Lab 3 14 | 15 | # Hide lines below until Lab 5 16 | from .fcn import fcn 17 | 18 | # Hide lines above until Lab 5 19 | -------------------------------------------------------------------------------- /text_recognizer/networks/ctc.py: -------------------------------------------------------------------------------- 1 | """Define ctc_decode function.""" 2 | import tensorflow as tf 3 | import tensorflow.keras.backend as K 4 | from tensorflow.python.ops import ctc_ops, sparse_ops # pylint: disable=no-name-in-module 5 | 6 | 7 | def ctc_decode(y_pred, input_length, max_output_length): 8 | """ 9 | Cut down from https://github.com/keras-team/keras/blob/master/keras/backend/tensorflow_backend.py#L4170 10 | 11 | Decodes the output of a softmax. 12 | Uses greedy (best path) search. 13 | 14 | # Arguments 15 | y_pred: tensor `(samples, time_steps, num_categories)` 16 | containing the prediction, or output of the softmax. 17 | input_length: tensor `(samples, )` containing the sequence length for 18 | each batch item in `y_pred`. 19 | max_output_length: int giving the max output sequence length 20 | 21 | # Returns 22 | List: list of one element that contains the decoded sequence. 23 | """ 24 | y_pred = tf.math.log(tf.transpose(y_pred, perm=[1, 0, 2]) + K.epsilon()) 25 | input_length = tf.cast((tf.squeeze(input_length, axis=-1)), tf.int32) 26 | 27 | (decoded, _) = ctc_ops.ctc_greedy_decoder(inputs=y_pred, sequence_length=input_length) 28 | 29 | sparse = decoded[0] 30 | decoded_dense = sparse_ops.sparse_to_dense(sparse.indices, sparse.dense_shape, sparse.values, default_value=-1) 31 | 32 | # Unfortunately, decoded_dense will be of different number of columns, depending on the decodings. 33 | # For use in `predict()`, we need to get it all in one standard shape, so let's pad if necessary. 
34 | max_length = max_output_length + 2 # giving 2 extra characters for CTC leeway 35 | cols = tf.shape(decoded_dense)[-1] 36 | 37 | def pad(): 38 | return tf.pad(decoded_dense, [[0, 0], [0, max_length - cols]], constant_values=-1) 39 | 40 | def noop(): 41 | return decoded_dense 42 | 43 | return tf.cond(tf.less(cols, max_length), pad, noop) 44 | -------------------------------------------------------------------------------- /text_recognizer/networks/fcn.py: -------------------------------------------------------------------------------- 1 | """Keras network code for the fully-convolutional network used for line detection.""" 2 | from typing import List, Tuple 3 | from tensorflow.keras.models import Model 4 | from tensorflow.keras.layers import Activation, Add, Conv2D, Input, Lambda, Layer 5 | from tensorflow.keras import backend as K 6 | 7 | 8 | def residual_conv_block( 9 | input_layer: Layer, kernel_sizes: List[int], num_filters: List[int], dilation_rates: List[int], activation: str, 10 | ) -> Layer: 11 | """Instantiate a Residual convolutional block.""" 12 | padding = "same" 13 | x = Conv2D( 14 | num_filters[0], 15 | kernel_size=kernel_sizes[0], 16 | dilation_rate=dilation_rates[0], 17 | padding=padding, 18 | activation=activation, 19 | )(input_layer) 20 | x = Conv2D(num_filters[1], kernel_size=kernel_sizes[1], dilation_rate=dilation_rates[1], padding=padding,)(x) 21 | y = Conv2D(num_filters[1], kernel_size=1, dilation_rate=1, padding=padding)(input_layer) 22 | x = Add()([x, y]) 23 | x = Activation(activation)(x) 24 | return x 25 | 26 | 27 | def fcn(_input_shape: Tuple[int, ...], output_shape: Tuple[int, ...]) -> Model: 28 | """Instantiate a fully convolutional residual network for line detection.""" 29 | num_filters = [16] * 14 30 | kernel_sizes = [7] * 14 31 | dilation_rates = [3] * 4 + [7] * 10 32 | 33 | num_classes = output_shape[-1] 34 | input_image = Input((None, None)) 35 | model_layer = Lambda(lambda x: K.expand_dims(x, axis=-1))(input_image) 36 | 37 | for i in range(0, len(num_filters), 2): 38 | model_layer = residual_conv_block( 39 | input_layer=model_layer, 40 | kernel_sizes=kernel_sizes[i : i + 2], 41 | num_filters=num_filters[i : i + 2], 42 | dilation_rates=dilation_rates[i : i + 2], 43 | activation="relu", 44 | ) 45 | output = Conv2D(num_classes, kernel_size=1, dilation_rate=1, padding="same", activation="softmax")(model_layer) 46 | 47 | model = Model(inputs=input_image, outputs=output) 48 | return model 49 | -------------------------------------------------------------------------------- /text_recognizer/networks/lenet.py: -------------------------------------------------------------------------------- 1 | """LeNet network.""" 2 | from typing import Tuple 3 | 4 | import tensorflow as tf 5 | from tensorflow.keras.layers import Conv2D, Dense, Dropout, Flatten, Lambda, MaxPooling2D 6 | from tensorflow.keras.models import Sequential, Model 7 | 8 | 9 | def lenet(input_shape: Tuple[int, ...], output_shape: Tuple[int, ...]) -> Model: 10 | """Return LeNet Keras model.""" 11 | num_classes = output_shape[0] 12 | 13 | # Your code below (Lab 2) 14 | model = Sequential() 15 | if len(input_shape) < 3: 16 | model.add(Lambda(lambda x: tf.expand_dims(x, -1), input_shape=input_shape, name='expand_dims')) 17 | input_shape = (input_shape[0], input_shape[1], 1) 18 | model.add(Conv2D(32, kernel_size=(3, 3), activation="relu", input_shape=input_shape, padding="valid")) 19 | model.add(Conv2D(64, (3, 3), activation="relu", padding="valid")) 20 | model.add(MaxPooling2D(pool_size=(2, 2), 
padding="valid")) 21 | model.add(Dropout(0.2)) 22 | model.add(Flatten()) 23 | model.add(Dense(128, activation="relu")) 24 | model.add(Dropout(0.2)) 25 | model.add(Dense(num_classes, activation="softmax")) 26 | # Your code above (Lab 2) 27 | 28 | return model 29 | -------------------------------------------------------------------------------- /text_recognizer/networks/line_cnn_all_conv.py: -------------------------------------------------------------------------------- 1 | """CNN-based model for recognizing handwritten text.""" 2 | from typing import Tuple 3 | 4 | import tensorflow as tf 5 | from tensorflow.keras.layers import Conv2D, Dropout, MaxPooling2D, Reshape, Lambda, Permute 6 | from tensorflow.keras.models import Sequential 7 | from tensorflow.keras.models import Model as KerasModel 8 | 9 | 10 | def line_cnn_all_conv( 11 | input_shape: Tuple[int, ...], output_shape: Tuple[int, ...], window_width: float = 28, window_stride: float = 14, 12 | ) -> KerasModel: 13 | image_height, image_width = input_shape 14 | output_length, num_classes = output_shape 15 | # Current shape is: (image_height, image_width, 1) 16 | 17 | model = Sequential() 18 | model.add(Reshape((image_height, image_width, 1), input_shape=input_shape)) 19 | model.add(Conv2D(32, kernel_size=(3, 3), activation="relu", padding="same")) 20 | model.add(Conv2D(64, (3, 3), activation="relu", padding="same")) 21 | model.add(MaxPooling2D(pool_size=(2, 2), padding="same")) 22 | model.add(Dropout(0.2)) 23 | # Current shape is: (image_height // 2, image_width // 2, 64) 24 | 25 | # So far, this is the same as LeNet. At this point, LeNet would flatten and Dense 128. 26 | # Instead, we are going to use a Conv2D to slide over these outputs with window_width and window_stride, 27 | # and output softmax activations of shape (output_length, num_classes)./ 28 | 29 | # Because of MaxPooling, everything is divided by 2 30 | new_height = image_height // 2 31 | new_width = image_width // 2 32 | new_window_width = window_width // 2 33 | new_window_stride = window_stride // 2 34 | 35 | # Your code below (Lab 2) 36 | model.add( 37 | Conv2D(128, (new_height, new_window_width), (new_height, new_window_stride), activation="relu", padding="same") 38 | ) 39 | model.add(Dropout(0.2)) 40 | # Your code above (Lab 2) 41 | # Shape is now (1, num_windows, 128) 42 | num_windows = new_width // new_window_stride 43 | 44 | model.add(Permute((2, 1, 3))) # We could instead do a Reshape((num_windows, 1, 128)) 45 | # Shape is now (num_windows, 1, 128) 46 | 47 | final_classifier_width = num_windows // output_length 48 | model.add( 49 | Conv2D( 50 | num_classes, (final_classifier_width, 1), (final_classifier_width, 1), activation="softmax", padding="same" 51 | ) 52 | ) 53 | # Shape is now (output_length, 1, num_classes) 54 | 55 | model.add(Lambda(lambda x: tf.squeeze(x, 2))) # We could instead do a Reshape((output_length, num_classes)) 56 | # Shape is now (output_length, num_classes) 57 | 58 | # Since we floor'd the calculation of width, we might have too many items in the sequence. Take only output_length. 
59 | # model.add(Lambda(lambda x: x[:, :output_length, :])) 60 | return model 61 | -------------------------------------------------------------------------------- /text_recognizer/networks/line_lstm_ctc.py: -------------------------------------------------------------------------------- 1 | """LSTM with CTC for handwritten text recognition within a line.""" 2 | from tensorflow.keras.layers import Dense, Input, Reshape, TimeDistributed, Lambda, LSTM 3 | from tensorflow.keras.models import Model as KerasModel 4 | import tensorflow.keras.backend as K 5 | 6 | from text_recognizer.networks.lenet import lenet 7 | from text_recognizer.networks.misc import slide_window 8 | from text_recognizer.networks.ctc import ctc_decode 9 | 10 | 11 | def line_lstm_ctc(input_shape, output_shape, window_width=28, window_stride=14): # pylint: disable=too-many-locals 12 | image_height, image_width = input_shape 13 | output_length, num_classes = output_shape 14 | 15 | num_windows = int((image_width - window_width) / window_stride) + 1 16 | if num_windows < output_length: 17 | raise ValueError(f"Window width/stride need to generate >= {output_length} windows (currently {num_windows})") 18 | 19 | image_input = Input(shape=input_shape, name="image") 20 | y_true = Input(shape=(output_length,), name="y_true") 21 | input_length = Input(shape=(1,), name="input_length") 22 | label_length = Input(shape=(1,), name="label_length") 23 | 24 | # Your code should use slide_window and extract image patches from image_input. 25 | # Pass a convolutional model over each image patch to generate a feature vector per window. 26 | # Pass these features through one or more LSTM layers. 27 | # Convert the lstm outputs to softmax outputs. 28 | # Note that lstms expect a input of shape (num_batch_size, num_timesteps, feature_length). 
29 | 30 | # Your code below (Lab 3) 31 | image_reshaped = Reshape((image_height, image_width, 1))(image_input) 32 | # (image_height, image_width, 1) 33 | 34 | image_patches = Lambda(slide_window, arguments={"window_width": window_width, "window_stride": window_stride})( 35 | image_reshaped 36 | ) 37 | # (num_windows, image_height, window_width, 1) 38 | 39 | # Make a LeNet and get rid of the last two layers (softmax and dropout) 40 | convnet = lenet((image_height, window_width, 1), (num_classes,)) 41 | convnet = KerasModel(inputs=convnet.inputs, outputs=convnet.layers[-2].output) 42 | convnet_outputs = TimeDistributed(convnet)(image_patches) 43 | # (num_windows, 128) 44 | 45 | lstm_output = LSTM(128, return_sequences=True)(convnet_outputs) 46 | # (num_windows, 128) 47 | 48 | softmax_output = Dense(num_classes, activation="softmax", name="softmax_output")(lstm_output) 49 | # (num_windows, num_classes) 50 | # Your code above (Lab 3) 51 | 52 | input_length_processed = Lambda( 53 | lambda x, num_windows=None: x * num_windows, arguments={"num_windows": num_windows} 54 | )(input_length) 55 | 56 | ctc_loss_output = Lambda(lambda x: K.ctc_batch_cost(x[0], x[1], x[2], x[3]), name="ctc_loss")( 57 | [y_true, softmax_output, input_length_processed, label_length] 58 | ) 59 | 60 | ctc_decoded_output = Lambda(lambda x: ctc_decode(x[0], x[1], output_length), name="ctc_decoded")( 61 | [softmax_output, input_length_processed] 62 | ) 63 | 64 | model = KerasModel( 65 | inputs=[image_input, y_true, input_length, label_length], outputs=[ctc_loss_output, ctc_decoded_output], 66 | ) 67 | return model 68 | -------------------------------------------------------------------------------- /text_recognizer/networks/misc.py: -------------------------------------------------------------------------------- 1 | """Misc neural network functionality.""" 2 | import numpy as np 3 | import tensorflow as tf 4 | 5 | 6 | def slide_window(image: np.ndarray, window_width: int, window_stride: int) -> np.ndarray: 7 | """ 8 | Parameters 9 | ---------- 10 | image 11 | (image_height, image_width, 1) input 12 | 13 | Returns 14 | ------- 15 | np.ndarray 16 | (num_windows, image_height, window_width, 1) output, where 17 | num_windows is floor((image_width - window_width) / window_stride) + 1 18 | """ 19 | kernel = [1, 1, window_width, 1] 20 | strides = [1, 1, window_stride, 1] 21 | patches = tf.image.extract_patches(image, kernel, strides, [1, 1, 1, 1], "VALID") 22 | patches = tf.transpose(patches, (0, 2, 1, 3)) 23 | patches = tf.expand_dims(patches, -1) 24 | return patches 25 | -------------------------------------------------------------------------------- /text_recognizer/networks/mlp.py: -------------------------------------------------------------------------------- 1 | """Define mlp network function.""" 2 | from typing import Tuple 3 | 4 | from tensorflow.keras.models import Model, Sequential 5 | from tensorflow.keras.layers import Dense, Dropout, Flatten 6 | 7 | 8 | def mlp( 9 | input_shape: Tuple[int, ...], 10 | output_shape: Tuple[int, ...], 11 | layer_size: int = 128, 12 | dropout_amount: float = 0.2, 13 | num_layers: int = 3, 14 | ) -> Model: 15 | """ 16 | Create a simple multi-layer perceptron: fully-connected layers with dropout between them, with softmax predictions. 17 | Creates num_layers layers. 
18 | """ 19 | num_classes = output_shape[0] 20 | 21 | model = Sequential() 22 | # Don't forget to pass input_shape to the first layer of the model 23 | # Your code below (Lab 1) 24 | model.add(Flatten(input_shape=input_shape)) 25 | for _ in range(num_layers): 26 | model.add(Dense(layer_size, activation="relu")) 27 | model.add(Dropout(dropout_amount)) 28 | model.add(Dense(num_classes, activation="softmax")) 29 | # Your code above (Lab 1) 30 | 31 | return model 32 | -------------------------------------------------------------------------------- /text_recognizer/paragraph_text_recognizer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Takes an image and returns all the text in it, by first segmenting the image with LineDetector, then extracting crops 3 | of the image corresponding to the line regions, and running each line region crop through LinePredictor. 4 | """ 5 | from typing import List, Tuple, Union 6 | import cv2 7 | import numpy as np 8 | from text_recognizer.datasets import IamLinesDataset 9 | from text_recognizer.models.line_detector_model import LineDetectorModel 10 | from text_recognizer.models.line_model_ctc import LineModelCtc 11 | import text_recognizer.util as util 12 | 13 | 14 | class ParagraphTextRecognizer: 15 | """Given an image of a single handwritten character, recognizes it.""" 16 | 17 | def __init__(self): 18 | self.line_detector_model = LineDetectorModel() 19 | self.line_detector_model.load_weights() 20 | self.line_predictor_model = LineModelCtc(dataset_cls=IamLinesDataset) 21 | self.line_predictor_model.load_weights() 22 | 23 | def predict(self, image_or_filename: Union[np.ndarray, str]): 24 | """ 25 | Take an image and return all the text in it. 26 | """ 27 | if isinstance(image_or_filename, str): 28 | image = util.read_image(image_or_filename, grayscale=True) 29 | else: 30 | image = image_or_filename 31 | 32 | line_region_crops = self._get_line_region_crops(image=image) 33 | print([a.shape for a in line_region_crops]) 34 | prepared_line_region_crops = [ 35 | self._prepare_image_for_line_predictor_model(image=crop) for crop in line_region_crops 36 | ] 37 | 38 | line_region_strings = [ 39 | self.line_predictor_model.predict_on_image(crop)[0] for crop in prepared_line_region_crops 40 | ] 41 | return " ".join(line_region_strings), line_region_crops 42 | 43 | def _get_line_region_crops(self, image: np.ndarray, min_crop_len_factor: float = 0.02) -> List[np.ndarray]: 44 | """Find all the line regions in square image and crop them out and return them.""" 45 | prepared_image, scale_down_factor = self._prepare_image_for_line_detector_model(image) 46 | line_segmentation = self.line_detector_model.predict_on_image(prepared_image) 47 | bounding_boxes_xywh = _find_line_bounding_boxes(line_segmentation) 48 | 49 | bounding_boxes_xywh = (bounding_boxes_xywh * scale_down_factor).astype(int) 50 | 51 | min_crop_length = int(min_crop_len_factor * min(image.shape[0], image.shape[1])) 52 | line_region_crops = [ 53 | image[y : y + h, x : x + w] 54 | for x, y, w, h in bounding_boxes_xywh 55 | if w >= min_crop_length and h >= min_crop_length 56 | ] 57 | return line_region_crops 58 | 59 | def _prepare_image_for_line_detector_model(self, image: np.ndarray) -> Tuple[np.ndarray, float]: 60 | """Convert uint8 image to float image with black background with shape self.line_detector_model.image_shape.""" 61 | resized_image, scale_down_factor = _resize_image_for_line_detector_model( 62 | image=image, max_shape=self.line_detector_model.image_shape 
63 | ) 64 | resized_image = (1.0 - resized_image / 255).astype("float32") 65 | return resized_image, scale_down_factor 66 | 67 | def _prepare_image_for_line_predictor_model(self, image: np.ndarray) -> np.ndarray: 68 | """ 69 | Convert uint8 image to float image with black background with shape self.line_predictor_model.image_shape 70 | while maintaining the image aspect ratio. 71 | """ 72 | expected_shape = self.line_predictor_model.image_shape 73 | scale_factor = (np.array(expected_shape) / np.array(image.shape)).min() 74 | scaled_image = cv2.resize(image, dsize=None, fx=scale_factor, fy=scale_factor, interpolation=cv2.INTER_AREA) 75 | 76 | pad_width = ( 77 | (0, expected_shape[0] - scaled_image.shape[0]), 78 | (0, expected_shape[1] - scaled_image.shape[1]), 79 | ) 80 | padded_image = np.pad(scaled_image, pad_width=pad_width, mode="constant", constant_values=255) 81 | return 1 - padded_image / 255 82 | 83 | 84 | def _find_line_bounding_boxes(line_segmentation: np.ndarray): 85 | """Given a line segmentation, find bounding boxes for connected-component regions corresponding to non-0 labels.""" 86 | 87 | def _find_line_bounding_boxes_in_channel(line_segmentation_channel: np.ndarray) -> np.ndarray: 88 | line_activation_image = cv2.dilate(line_segmentation_channel, kernel=np.ones((3, 3)), iterations=1) 89 | line_activation_image = (line_activation_image * 255).astype("uint8") 90 | line_activation_image = cv2.threshold(line_activation_image, 0.5, 1, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1] 91 | 92 | bounding_cnts, _ = cv2.findContours(line_activation_image, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) 93 | return np.array([cv2.boundingRect(cnt) for cnt in bounding_cnts]) 94 | 95 | bboxes_xywh = np.concatenate( 96 | [_find_line_bounding_boxes_in_channel(line_segmentation[:, :, i]) for i in [1, 2]], axis=0 97 | ) 98 | return bboxes_xywh[np.argsort(bboxes_xywh[:, 1])] 99 | 100 | 101 | def _resize_image_for_line_detector_model(image: np.ndarray, max_shape: Tuple[int, int]) -> Tuple[np.ndarray, float]: 102 | """Resize the image to less than the max_shape while maintaining aspect ratio.""" 103 | scale_down_factor = max(np.array(image.shape) / np.array(max_shape)) 104 | if scale_down_factor == 1: 105 | return image.copy(), scale_down_factor 106 | resized_image = cv2.resize( 107 | image, dsize=None, fx=1 / scale_down_factor, fy=1 / scale_down_factor, interpolation=cv2.INTER_AREA, 108 | ) 109 | return resized_image, scale_down_factor 110 | -------------------------------------------------------------------------------- /text_recognizer/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/the-full-stack/fsdl-text-recognizer/a99a3d3f0594dfceb249a56e8362337f9e12897e/text_recognizer/tests/__init__.py -------------------------------------------------------------------------------- /text_recognizer/tests/support/create_emnist_lines_support_files.py: -------------------------------------------------------------------------------- 1 | """Module for creating EMNIST Lines test support files.""" 2 | from pathlib import Path 3 | import shutil 4 | 5 | import numpy as np 6 | 7 | from text_recognizer.datasets import EmnistLinesDataset 8 | import text_recognizer.util as util 9 | 10 | 11 | SUPPORT_DIRNAME = Path(__file__).parents[0].resolve() / "emnist_lines" 12 | 13 | 14 | def create_emnist_lines_support_files(): 15 | shutil.rmtree(SUPPORT_DIRNAME, ignore_errors=True) 16 | SUPPORT_DIRNAME.mkdir() 17 | 18 | dataset = EmnistLinesDataset() 19 | 
dataset.load_or_generate_data() 20 | 21 | for ind in [0, 1, 3]: 22 | image = dataset.x_test[ind] 23 | print(image.sum(), image.dtype) 24 | label = "".join(dataset.mapping[label] for label in np.argmax(dataset.y_test[ind], axis=-1).flatten()).strip( 25 | " _" 26 | ) 27 | print(label) 28 | util.write_image(image, str(SUPPORT_DIRNAME / f"{label}.png")) 29 | # Hide lines below until Lab 8 30 | # Inverted version 31 | image = -(image - 255) 32 | util.write_image(image, str(SUPPORT_DIRNAME / f"{label}_i.png")) 33 | # Hide lines above until Lab 8 34 | 35 | 36 | if __name__ == "__main__": 37 | create_emnist_lines_support_files() 38 | -------------------------------------------------------------------------------- /text_recognizer/tests/support/create_emnist_support_files.py: -------------------------------------------------------------------------------- 1 | """Module for creating EMNIST test support files.""" 2 | from pathlib import Path 3 | import shutil 4 | 5 | import numpy as np 6 | 7 | from text_recognizer.datasets import EmnistDataset 8 | import text_recognizer.util as util 9 | 10 | SUPPORT_DIRNAME = Path(__file__).parents[0].resolve() / "emnist" 11 | 12 | 13 | def create_emnist_support_files(): 14 | shutil.rmtree(SUPPORT_DIRNAME, ignore_errors=True) 15 | SUPPORT_DIRNAME.mkdir() 16 | 17 | dataset = EmnistDataset() 18 | dataset.load_or_generate_data() 19 | 20 | for ind in [5, 7, 9]: 21 | image = dataset.x_test[ind] 22 | label = dataset.mapping[np.argmax(dataset.y_test[ind])] 23 | print(ind, label) 24 | util.write_image(image, str(SUPPORT_DIRNAME / f"{label}.png")) 25 | 26 | 27 | if __name__ == "__main__": 28 | create_emnist_support_files() 29 | -------------------------------------------------------------------------------- /text_recognizer/tests/support/create_iam_lines_support_files.py: -------------------------------------------------------------------------------- 1 | """Module for creating IAM Lines test support files.""" 2 | from pathlib import Path 3 | import shutil 4 | 5 | import numpy as np 6 | 7 | from text_recognizer.datasets import IamLinesDataset 8 | import text_recognizer.util as util 9 | 10 | 11 | SUPPORT_DIRNAME = Path(__file__).parents[0].resolve() / "iam_lines" 12 | 13 | 14 | def create_iam_lines_support_files(): 15 | shutil.rmtree(SUPPORT_DIRNAME, ignore_errors=True) 16 | SUPPORT_DIRNAME.mkdir() 17 | 18 | dataset = IamLinesDataset() 19 | dataset.load_or_generate_data() 20 | 21 | for ind in [0, 1, 3]: 22 | image = dataset.x_test[ind] 23 | label = "".join(dataset.mapping[label] for label in np.argmax(dataset.y_test[ind], axis=-1).flatten()).strip( 24 | " _" 25 | ) 26 | print(label) 27 | util.write_image(image, str(SUPPORT_DIRNAME / f"{label}.png")) 28 | 29 | 30 | if __name__ == "__main__": 31 | create_iam_lines_support_files() 32 | -------------------------------------------------------------------------------- /text_recognizer/tests/support/emnist/8.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:455c3788a677a33583aec467f49d1917d0b34c0785b3eee6867699f0d2ffbc1a 3 | size 498 4 | -------------------------------------------------------------------------------- /text_recognizer/tests/support/emnist/U.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:6c1490758a7d28fde2a2e0bdc0a644c19a828500901f4417f205def23c2ad3d5 3 | size 524 4 | 
-------------------------------------------------------------------------------- /text_recognizer/tests/support/emnist/e.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:001a7679be1c0c622354aebcbdcc0f2e992e1fc3295ee1d6fef1c1dd1613508e 3 | size 563 4 | -------------------------------------------------------------------------------- /text_recognizer/tests/support/emnist_lines/Corsi left for.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:2e6d26a81d593d7d37d9496763104717e24aa3885cfc993c685eedd29b02ce1f 3 | size 3763 4 | -------------------------------------------------------------------------------- /text_recognizer/tests/support/emnist_lines/do that In.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:42bff01a0fd9f03726f12069f19646374f239642ea51819d0687359712d45eb7 3 | size 2888 4 | -------------------------------------------------------------------------------- /text_recognizer/tests/support/emnist_lines/or if used the results.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:bea915331082580d7aaf129da096afd600a15eca4fa562fe78fb57c4f8e5a199 3 | size 5645 4 | -------------------------------------------------------------------------------- /text_recognizer/tests/support/iam_lines/He rose from his breakfast-nook bench.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:8841d95a0008748d5c557061ea59dac2e46a221e30b9c6e9fc6ceac16827094f 3 | size 4876 4 | -------------------------------------------------------------------------------- /text_recognizer/tests/support/iam_lines/and came into the livingroom, where.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:d2890307c91b9a25f2bec15fcbc15995d824a1c4d991a29f982ad4c09a9a1e6a 3 | size 3437 4 | -------------------------------------------------------------------------------- /text_recognizer/tests/support/iam_lines/his entrance. 
He came, almost falling.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:95f159bccf1acebb1c48eeeb5773748032dca76969695be2141b9fc8b28013c2 3 | size 3600 4 | -------------------------------------------------------------------------------- /text_recognizer/tests/support/iam_paragraphs/a01-000u-cropped.jpg: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:80d99e88bbb1f9be0f6a24ef75d6d2178043a0c1feba2b91e94c349b6b042bdd 3 | size 144556 4 | -------------------------------------------------------------------------------- /text_recognizer/tests/test_character_predictor.py: -------------------------------------------------------------------------------- 1 | """Tests for CharacterPredictor class.""" 2 | import os 3 | from pathlib import Path 4 | import unittest 5 | 6 | from text_recognizer.character_predictor import CharacterPredictor 7 | 8 | SUPPORT_DIRNAME = Path(__file__).parents[0].resolve() / "support" / "emnist" 9 | 10 | os.environ["CUDA_VISIBLE_DEVICES"] = "" 11 | 12 | 13 | class TestCharacterPredictor(unittest.TestCase): 14 | """Tests for the CharacterPredictor class.""" 15 | 16 | def test_filename(self): 17 | """Test that CharacterPredictor correctly predicts on a single image, for several test images.""" 18 | predictor = CharacterPredictor() 19 | 20 | for filename in SUPPORT_DIRNAME.glob("*.png"): 21 | pred, conf = predictor.predict(str(filename)) 22 | print(f"Prediction: {pred} at confidence: {conf} for image with character {filename.stem}") 23 | self.assertEqual(pred, filename.stem) 24 | self.assertGreater(conf, 0.7) 25 | -------------------------------------------------------------------------------- /text_recognizer/tests/test_line_predictor.py: -------------------------------------------------------------------------------- 1 | """Tests for LinePredictor class.""" 2 | import os 3 | from pathlib import Path 4 | import unittest 5 | 6 | import editdistance 7 | import numpy as np 8 | 9 | import text_recognizer.util as util 10 | from text_recognizer.line_predictor import LinePredictor 11 | 12 | # Hide lines below until Lab 4 13 | from text_recognizer.datasets import IamLinesDataset 14 | 15 | # Hide lines above until Lab 4 16 | 17 | SUPPORT_DIRNAME = Path(__file__).parents[0].resolve() / "support" 18 | 19 | os.environ["CUDA_VISIBLE_DEVICES"] = "" 20 | 21 | 22 | class TestEmnistLinePredictor(unittest.TestCase): 23 | """Test LinePredictor class on the EmnistLines dataset.""" 24 | 25 | def test_filename(self): 26 | """Test that LinePredictor correctly predicts on single images, for several test images.""" 27 | predictor = LinePredictor() 28 | 29 | for filename in (SUPPORT_DIRNAME / "emnist_lines").glob("*.png"): 30 | pred, conf = predictor.predict(str(filename)) 31 | true = str(filename.stem) 32 | edit_distance = editdistance.eval(pred, true) / len(pred) 33 | print(f'Pred: "{pred}" | Confidence: {conf} | True: {true} | Edit distance: {edit_distance}') 34 | self.assertLess(edit_distance, 0.2) 35 | 36 | 37 | class TestEmnistLinePredictorVariableImageWidth(unittest.TestCase): 38 | """Test LinePredictor class on the EmnistLines dataset, with variable images.""" 39 | 40 | def test_filename(self): 41 | """Test that LinePredictor correctly predicts on single images, for several test images.""" 42 | predictor = LinePredictor() 43 | for filename in SUPPORT_DIRNAME.glob("*.png"): 44 | image =
util.read_image(str(filename), grayscale=True) 45 | print("Saved image shape:", image.shape) 46 | image = image[:, : -np.random.randint(1, 150)] # pylint: disable=invalid-unary-operand-type 47 | print("Randomly cropped image shape:", image.shape) 48 | pred, conf = predictor.predict(image) 49 | true = str(filename.stem) 50 | edit_distance = editdistance.eval(pred, true) / len(pred) 51 | print(f'Pred: "{pred}" | Confidence: {conf} | True: {true} | Edit distance: {edit_distance}') 52 | self.assertLess(edit_distance, 0.2) 53 | 54 | 55 | # Hide lines below until Lab 4 56 | class TestIamLinePredictor(unittest.TestCase): 57 | """Test LinePredictor class on the IamLines dataset, with variable images.""" 58 | 59 | def test_filename(self): # pylint: disable=R0201 60 | """Test that LinePredictor correctly predicts on single images, for several test images.""" 61 | predictor = LinePredictor(IamLinesDataset) 62 | 63 | for filename in (SUPPORT_DIRNAME / "iam_lines").glob("*.png"): 64 | pred, conf = predictor.predict(str(filename)) 65 | true = filename.stem 66 | if pred: 67 | edit_distance = editdistance.eval(pred, true) / len(pred) 68 | else: 69 | edit_distance = 0 70 | print(f'Pred: "{pred}" | Confidence: {conf} | True: {true} | Edit distance: {edit_distance}') 71 | # self.assertLess(edit_distance, 0.2) 72 | 73 | 74 | # Hide lines above until Lab 4 75 | -------------------------------------------------------------------------------- /text_recognizer/tests/test_paragraph_text_recognizer.py: -------------------------------------------------------------------------------- 1 | """Tests for ParagraphTextRecognizer class.""" 2 | import os 3 | from pathlib import Path 4 | import unittest 5 | from text_recognizer.paragraph_text_recognizer import ParagraphTextRecognizer 6 | import text_recognizer.util as util 7 | 8 | 9 | SUPPORT_DIRNAME = Path(__file__).parents[0].resolve() / "support" / "iam_paragraphs" 10 | 11 | os.environ["CUDA_VISIBLE_DEVICES"] = "" 12 | 13 | 14 | class TestParagraphTextRecognizer(unittest.TestCase): 15 | """Test that it can take non-square images of max dimension larger than 256px.""" 16 | 17 | def test_filename(self): # pylint: disable=R0201 18 | predictor = ParagraphTextRecognizer() 19 | num_text_lines_by_name = {"a01-000u-cropped": 7} 20 | for filename in (SUPPORT_DIRNAME).glob("*.jpg"): 21 | full_image = util.read_image(str(filename), grayscale=True) 22 | predicted_text, line_region_crops = predictor.predict(full_image) 23 | print(predicted_text) 24 | assert len(line_region_crops) == num_text_lines_by_name[filename.stem] 25 | -------------------------------------------------------------------------------- /text_recognizer/util.py: -------------------------------------------------------------------------------- 1 | """Utility functions for text_recognizer module.""" 2 | # Hide lines below until Lab 8 3 | import base64 4 | 5 | # Hide lines above until Lab 8 6 | from concurrent.futures import as_completed, ThreadPoolExecutor 7 | from pathlib import Path 8 | from typing import Union 9 | from urllib.request import urlopen, urlretrieve 10 | import hashlib 11 | import os 12 | 13 | import numpy as np 14 | import cv2 15 | from tqdm import tqdm 16 | 17 | 18 | def read_image(image_uri: Union[Path, str], grayscale=False) -> np.array: 19 | """Read image_uri.""" 20 | 21 | def read_image_from_filename(image_filename, imread_flag): 22 | return cv2.imread(str(image_filename), imread_flag) 23 | 24 | def read_image_from_url(image_url, imread_flag): 25 | url_response = urlopen(str(image_url)) # nosec 26 | 
img_array = np.array(bytearray(url_response.read()), dtype=np.uint8) 27 | return cv2.imdecode(img_array, imread_flag) 28 | 29 | imread_flag = cv2.IMREAD_GRAYSCALE if grayscale else cv2.IMREAD_COLOR 30 | local_file = os.path.exists(image_uri) 31 | try: 32 | img = None 33 | if local_file: 34 | img = read_image_from_filename(image_uri, imread_flag) 35 | else: 36 | img = read_image_from_url(image_uri, imread_flag) 37 | assert img is not None 38 | except Exception as e: 39 | raise ValueError("Could not load image at {}: {}".format(image_uri, e)) 40 | return img 41 | 42 | 43 | # Hide lines below until Lab 8 44 | def read_b64_image(b64_string, grayscale=False): 45 | """Load base64-encoded images.""" 46 | imread_flag = cv2.IMREAD_GRAYSCALE if grayscale else cv2.IMREAD_COLOR 47 | try: 48 | _, b64_data = b64_string.split(",") 49 | return cv2.imdecode(np.frombuffer(base64.b64decode(b64_data), np.uint8), imread_flag) 50 | except Exception as e: 51 | raise ValueError("Could not load image from b64 {}: {}".format(b64_string, e)) 52 | 53 | 54 | # Hide lines above until Lab 8 55 | def write_image(image: np.ndarray, filename: Union[Path, str]) -> None: 56 | """Write image to file.""" 57 | cv2.imwrite(str(filename), image) 58 | 59 | 60 | def compute_sha256(filename: Union[Path, str]): 61 | """Return SHA256 checksum of a file.""" 62 | with open(filename, "rb") as f: 63 | return hashlib.sha256(f.read()).hexdigest() 64 | 65 | 66 | class TqdmUpTo(tqdm): 67 | """From https://github.com/tqdm/tqdm/blob/master/examples/tqdm_wget.py""" 68 | 69 | def update_to(self, blocks=1, bsize=1, tsize=None): 70 | """ 71 | Parameters 72 | ---------- 73 | blocks : int, optional 74 | Number of blocks transferred so far [default: 1]. 75 | bsize : int, optional 76 | Size of each block (in tqdm units) [default: 1]. 77 | tsize : int, optional 78 | Total size (in tqdm units). If [default: None] remains unchanged. 
79 | """ 80 | if tsize is not None: 81 | self.total = tsize # pylint: disable=attribute-defined-outside-init 82 | self.update(blocks * bsize - self.n) # will also set self.n = b * bsize 83 | 84 | 85 | def download_url(url, filename): 86 | """Download a file from url to filename, with a progress bar.""" 87 | with TqdmUpTo(unit="B", unit_scale=True, unit_divisor=1024, miniters=1) as t: 88 | urlretrieve(url, filename, reporthook=t.update_to, data=None) # nosec 89 | 90 | 91 | # Hide lines below until Lab 6 92 | def download_urls(urls, filenames): 93 | """Download urls to filenames in a multi-threaded way.""" 94 | with ThreadPoolExecutor() as executor: 95 | futures = [executor.submit(urlretrieve, url, filename) for url, filename in zip(urls, filenames)] 96 | for future in tqdm(as_completed(futures), total=len(futures)): 97 | try: 98 | future.result() 99 | except Exception as e: # pylint: disable=broad-except 100 | print("Error", e) 101 | 102 | 103 | # Hide lines above until Lab 6 104 | -------------------------------------------------------------------------------- /text_recognizer/weights/CharacterModel_EmnistDataset_mlp_weights.h5: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:9f4d3191391db8f3ba58c70e0e9578be68632e2cfc952794a5c81d735ccab530 3 | size 595520 4 | -------------------------------------------------------------------------------- /text_recognizer/weights/LineDetectorModel_IamParagraphsDataset_fcn_weights.h5: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:a4da991d0accee3b999ba55d5c6a2fde4150112bf935c4c10e237e2065427ab6 3 | size 745984 4 | -------------------------------------------------------------------------------- /text_recognizer/weights/LineModelCtc_EmnistLinesDataset_line_lstm_ctc_weights.h5: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:b2befc73c19e5f30a6eb5c83ad4c9261b64b3fe23cb8e64e7e4c9b13dce863db 3 | size 2243720 4 | -------------------------------------------------------------------------------- /text_recognizer/weights/LineModelCtc_IamLinesDataset_line_lstm_ctc_weights.h5: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:1948f55bbeb7a98b7ef643967c54000c63fb4c5fff32a1115cb7cd0e9e8da0e4 3 | size 2243720 4 | -------------------------------------------------------------------------------- /training/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/the-full-stack/fsdl-text-recognizer/a99a3d3f0594dfceb249a56e8362337f9e12897e/training/__init__.py -------------------------------------------------------------------------------- /training/experiments/cnn.json: -------------------------------------------------------------------------------- 1 | { 2 | "experiment_group": "Line CNN", 3 | "experiments": [ 4 | { 5 | "dataset": "EmnistLinesDataset", 6 | "model": "LineModel", 7 | "network": "line_cnn_all_conv", 8 | "network_args": { 9 | "window_width": 14, 10 | "window_stride": 14 11 | } 12 | }, 13 | { 14 | "dataset": "EmnistLinesDataset", 15 | "model": "LineModel", 16 | "network": "line_cnn_all_conv", 17 | "network_args": { 18 | "window_width": 7, 19 | "window_stride": 7 20 | } 21 | } 22 | ] 23 | } 24 | 
-------------------------------------------------------------------------------- /training/experiments/lstm_ctc.json: -------------------------------------------------------------------------------- 1 | { 2 | "experiment_group": "LineCtc2", 3 | "experiments": [ 4 | { 5 | "dataset": "EmnistLinesDataset", 6 | "model": "LineModelCtc", 7 | "network": "line_lstm_ctc", 8 | "network_args": { 9 | "window_width": 28, 10 | "window_stride": 14 11 | } 12 | }, 13 | { 14 | "dataset": "EmnistLinesDataset", 15 | "model": "LineModelCtc", 16 | "network": "line_lstm_ctc", 17 | "network_args": { 18 | "window_width": 14, 19 | "window_stride": 7 20 | } 21 | }, 22 | { 23 | "dataset": "EmnistLinesDataset", 24 | "dataset_args": { 25 | "max_length": 34 26 | }, 27 | "model": "LineModelCtc", 28 | "network": "line_lstm_ctc", 29 | "network_args": { 30 | "window_width": 28, 31 | "window_stride": 14 32 | } 33 | }, 34 | { 35 | "dataset": "EmnistLinesDataset", 36 | "dataset_args": { 37 | "max_length": 34 38 | }, 39 | "model": "LineModelCtc", 40 | "network": "line_lstm_ctc", 41 | "network_args": { 42 | "window_width": 14, 43 | "window_stride": 7 44 | } 45 | }, 46 | { 47 | "dataset": "IamLinesDataset", 48 | "model": "LineModelCtc", 49 | "network": "line_lstm_ctc", 50 | "network_args": { 51 | "window_width": 28, 52 | "window_stride": 14 53 | } 54 | }, 55 | { 56 | "dataset": "IamLinesDataset", 57 | "model": "LineModelCtc", 58 | "network": "line_lstm_ctc", 59 | "network_args": { 60 | "window_width": 14, 61 | "window_stride": 7 62 | } 63 | } 64 | ] 65 | } 66 | -------------------------------------------------------------------------------- /training/experiments/sample.json: -------------------------------------------------------------------------------- 1 | { 2 | "experiment_group": "Sample Experiments", 3 | "experiments": [ 4 | { 5 | "dataset": "EmnistDataset", 6 | "model": "CharacterModel", 7 | "network": "mlp", 8 | "network_args": { 9 | "num_layers": 2 10 | }, 11 | "train_args": { 12 | "batch_size": 256 13 | } 14 | }, 15 | { 16 | "dataset": "EmnistDataset", 17 | "model": "CharacterModel", 18 | "network": "mlp", 19 | "network_args": { 20 | "num_layers": 4 21 | }, 22 | "train_args": { 23 | "batch_size": 256 24 | } 25 | }, 26 | { 27 | "dataset": "EmnistDataset", 28 | "model": "CharacterModel", 29 | "network": "lenet", 30 | "train_args": { 31 | "batch_size": 128 32 | } 33 | } 34 | ] 35 | } 36 | -------------------------------------------------------------------------------- /training/gpu_manager.py: -------------------------------------------------------------------------------- 1 | """GPUManager class.""" 2 | import os 3 | import time 4 | 5 | import gpustat 6 | import numpy as np 7 | from redlock import Redlock 8 | 9 | 10 | GPU_LOCK_TIMEOUT = 5000 # ms 11 | 12 | 13 | class GPUManager: 14 | """Class for allocating GPUs.""" 15 | 16 | def __init__(self, verbose: bool = False): 17 | self.lock_manager = Redlock([{"host": "localhost", "port": 6379, "db": 0}]) 18 | self.verbose = verbose 19 | 20 | def get_free_gpu(self): 21 | """ 22 | If some GPUs are available, try reserving one by checking out an exclusive redis lock. 23 | If none available or can't get lock, sleep and check again. 
24 | """ 25 | while True: 26 | gpu_ind = self._get_free_gpu() 27 | if gpu_ind is not None: 28 | return gpu_ind 29 | if self.verbose: 30 | print(f"pid {os.getpid()} sleeping") 31 | time.sleep(GPU_LOCK_TIMEOUT / 1000) 32 | 33 | def _get_free_gpu(self): 34 | try: 35 | available_gpu_inds = [ 36 | gpu.index for gpu in gpustat.GPUStatCollection.new_query() if gpu.memory_used < 0.5 * gpu.memory_total 37 | ] 38 | except Exception: # pylint: disable=broad-except 39 | return [0] # Return dummy GPU index if no CUDA GPUs are installed 40 | 41 | if available_gpu_inds: 42 | gpu_ind = np.random.choice(available_gpu_inds) 43 | if self.verbose: 44 | print(f"pid {os.getpid()} picking gpu {gpu_ind}") 45 | if self.lock_manager.lock(f"gpu_{gpu_ind}", GPU_LOCK_TIMEOUT): 46 | return int(gpu_ind) 47 | if self.verbose: 48 | print(f"pid {os.getpid()} couldnt get lock") 49 | return None 50 | -------------------------------------------------------------------------------- /training/prepare_experiments.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Simple way to run experiments defined in a file.""" 3 | import argparse 4 | import json 5 | 6 | 7 | def run_experiments(experiments_filename): 8 | """Run experiments from file.""" 9 | with open(experiments_filename) as f: 10 | experiments_config = json.load(f) 11 | num_experiments = len(experiments_config["experiments"]) 12 | for ind in range(num_experiments): 13 | experiment_config = experiments_config["experiments"][ind] 14 | experiment_config["experiment_group"] = experiments_config["experiment_group"] 15 | print(f"python training/run_experiment.py --gpu=-1 '{json.dumps(experiment_config)}'") 16 | 17 | 18 | def main(): 19 | """Parse command-line arguments and run experiments from provided file.""" 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument("experiments_filename", type=str, help="Filename of JSON file of experiments to run.") 22 | args = parser.parse_args() 23 | run_experiments(args.experiments_filename) 24 | 25 | 26 | if __name__ == "__main__": 27 | main() 28 | -------------------------------------------------------------------------------- /training/run_experiment.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Script to run an experiment.""" 3 | import argparse 4 | import json 5 | import importlib 6 | from typing import Dict 7 | import os 8 | 9 | # Hide lines below until Lab 3 10 | import wandb 11 | 12 | from training.gpu_manager import GPUManager 13 | 14 | # Hide lines above until Lab 3 15 | from training.util import train_model 16 | 17 | DEFAULT_TRAIN_ARGS = {"batch_size": 64, "epochs": 16} 18 | 19 | 20 | def run_experiment(experiment_config: Dict, save_weights: bool, gpu_ind: int, use_wandb: bool = True): 21 | """ 22 | Run a training experiment. 
23 | 24 | Parameters 25 | ---------- 26 | experiment_config (dict) 27 | Of the form 28 | { 29 | "dataset": "EmnistLinesDataset", 30 | "dataset_args": { 31 | "max_overlap": 0.4, 32 | "subsample_fraction": 0.2 33 | }, 34 | "model": "LineModel", 35 | "network": "line_cnn_all_conv", 36 | "network_args": { 37 | "window_width": 14, 38 | "window_stride": 7 39 | }, 40 | "train_args": { 41 | "batch_size": 128, 42 | "epochs": 10 43 | } 44 | } 45 | save_weights (bool) 46 | If True, will save the final model weights to a canonical location (see Model in models/base.py) 47 | gpu_ind (int) 48 | specifies which gpu to use (or -1 for first available) 49 | use_wandb (bool) 50 | sync training run to wandb 51 | """ 52 | print(f"Running experiment with config {experiment_config} on GPU {gpu_ind}") 53 | 54 | datasets_module = importlib.import_module("text_recognizer.datasets") 55 | dataset_class_ = getattr(datasets_module, experiment_config["dataset"]) 56 | dataset_args = experiment_config.get("dataset_args", {}) 57 | dataset = dataset_class_(**dataset_args) 58 | dataset.load_or_generate_data() 59 | print(dataset) 60 | 61 | models_module = importlib.import_module("text_recognizer.models") 62 | model_class_ = getattr(models_module, experiment_config["model"]) 63 | 64 | networks_module = importlib.import_module("text_recognizer.networks") 65 | network_fn_ = getattr(networks_module, experiment_config["network"]) 66 | network_args = experiment_config.get("network_args", {}) 67 | model = model_class_( 68 | dataset_cls=dataset_class_, network_fn=network_fn_, dataset_args=dataset_args, network_args=network_args, 69 | ) 70 | print(model) 71 | 72 | experiment_config["train_args"] = { 73 | **DEFAULT_TRAIN_ARGS, 74 | **experiment_config.get("train_args", {}), 75 | } 76 | experiment_config["experiment_group"] = experiment_config.get("experiment_group", None) 77 | experiment_config["gpu_ind"] = gpu_ind 78 | 79 | # Hide lines below until Lab 3 80 | if use_wandb: 81 | wandb.init(config=experiment_config) 82 | # Hide lines above until Lab 3 83 | 84 | train_model( 85 | model, 86 | dataset, 87 | epochs=experiment_config["train_args"]["epochs"], 88 | batch_size=experiment_config["train_args"]["batch_size"], 89 | use_wandb=use_wandb, 90 | ) 91 | score = model.evaluate(dataset.x_test, dataset.y_test) 92 | print(f"Test evaluation: {score}") 93 | 94 | # Hide lines below until Lab 3 95 | if use_wandb: 96 | wandb.log({"test_metric": score}) 97 | # Hide lines above until Lab 3 98 | 99 | if save_weights: 100 | model.save_weights() 101 | 102 | 103 | def _parse_args(): 104 | """Parse command-line arguments.""" 105 | parser = argparse.ArgumentParser() 106 | parser.add_argument("--gpu", type=int, default=0, help="Provide index of GPU to use.") 107 | parser.add_argument( 108 | "--save", 109 | default=False, 110 | dest="save", 111 | action="store_true", 112 | help="If true, then final weights will be saved to canonical, version-controlled location", 113 | ) 114 | parser.add_argument( 115 | "experiment_config", 116 | type=str, 117 | help='Experimenet JSON (\'{"dataset": "EmnistDataset", "model": "CharacterModel", "network": "mlp"}\'', 118 | ) 119 | parser.add_argument( 120 | "--nowandb", default=False, action="store_true", help="If true, do not use wandb for this run", 121 | ) 122 | args = parser.parse_args() 123 | return args 124 | 125 | 126 | def main(): 127 | """Run experiment.""" 128 | args = _parse_args() 129 | # Hide lines below until Lab 3 130 | if args.gpu < 0: 131 | gpu_manager = GPUManager() 132 | args.gpu = 
gpu_manager.get_free_gpu() # Blocks until one is available 133 | # Hide lines above until Lab 3 134 | 135 | experiment_config = json.loads(args.experiment_config) 136 | os.environ["CUDA_VISIBLE_DEVICES"] = f"{args.gpu}" 137 | run_experiment(experiment_config, args.save, args.gpu, use_wandb=not args.nowandb) 138 | 139 | 140 | if __name__ == "__main__": 141 | main() 142 | -------------------------------------------------------------------------------- /training/run_sweep.py: -------------------------------------------------------------------------------- 1 | """W&B Sweep Functionality.""" 2 | import os 3 | import signal 4 | import subprocess 5 | import sys 6 | import json 7 | from typing import Tuple 8 | from ast import literal_eval 9 | 10 | DEFAULT_CONFIG = { 11 | "dataset": "IamLinesDataset", 12 | "dataset_args": {"subsample_fraction": 0.33}, 13 | "model": "LineModel", 14 | "network": "line_lstm_ctc", 15 | "train_args": {"batch_size": 128, "epochs": 10}, 16 | } 17 | 18 | 19 | def args_to_json(default_config: dict, preserve_args: tuple = ("gpu", "save")) -> Tuple[dict, list]: 20 | """Convert command line arguments to nested config values 21 | 22 | i.e. run_sweep.py --dataset_args.foo=1.7 23 | 24 | { 25 | "dataset_args": { 26 | "foo": 1.7 27 | } 28 | } 29 | 30 | """ 31 | args = [] 32 | config = default_config.copy() 33 | key, val = None, None 34 | for arg in sys.argv[1:]: 35 | if "=" in arg: 36 | key, val = arg.split("=") 37 | elif key: 38 | val = arg 39 | else: 40 | key = arg 41 | if key and val: 42 | parsed_key = key.lstrip("-").split(".") 43 | if parsed_key[0] in preserve_args: 44 | args.append("--{}={}".format(parsed_key[0], val)) 45 | else: 46 | nested = config 47 | for level in parsed_key[:-1]: 48 | nested[level] = config.get(level, {}) 49 | nested = nested[level] 50 | try: 51 | # Convert numerics to floats / ints 52 | val = literal_eval(val) 53 | except ValueError: 54 | pass 55 | nested[parsed_key[-1]] = val 56 | key, val = None, None 57 | return config, args 58 | 59 | 60 | def main(): 61 | config, args = args_to_json(DEFAULT_CONFIG) 62 | env = {k: v for k, v in os.environ.items() if k not in ("WANDB_PROGRAM", "WANDB_ARGS")} 63 | # pylint: disable=subprocess-popen-preexec-fn 64 | run = subprocess.Popen( 65 | ["python", "training/run_experiment.py", *args, json.dumps(config)], env=env, preexec_fn=os.setsid, 66 | ) # nosec 67 | signal.signal(signal.SIGTERM, lambda *args: run.terminate()) 68 | run.wait() 69 | 70 | 71 | if __name__ == "__main__": 72 | main() 73 | -------------------------------------------------------------------------------- /training/sweep_emnist.yaml: -------------------------------------------------------------------------------- 1 | program: training/run_sweep.py 2 | method: grid 3 | metric: 4 | name: val_loss 5 | goal: minimize 6 | parameters: 7 | dataset: 8 | value: EmnistDataset 9 | model: 10 | value: CharacterModel 11 | network: 12 | value: mlp 13 | network_args.layer_size: 14 | values: [128, 256] 15 | network_args.dropout_amount: 16 | values: [0.2, 0.4] 17 | network_args.num_layers: 18 | values: [3, 6] 19 | train_args.batch_size: 20 | values: [64, 128] 21 | train_args.epochs: 22 | value: 5 23 | -------------------------------------------------------------------------------- /training/sweep_iam.yaml: -------------------------------------------------------------------------------- 1 | program: training/run_sweep.py 2 | method: grid 3 | metric: 4 | name: val_loss 5 | goal: minimize 6 | parameters: 7 | dataset: 8 | value: IamLinesDataset 9 | model: 10 | value: 
LineModelCtc 11 | network: 12 | value: line_lstm_ctc 13 | network_args.window_width: 14 | values: [14, 18] 15 | network_args.window_stride: 16 | values: [5, 7] # careful with these 17 | train_args.batch_size: 18 | values: [64, 128] 19 | -------------------------------------------------------------------------------- /training/update_metadata.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Update metadata.toml with SHA-256 hash of the current file.""" 3 | from pathlib import Path 4 | import argparse 5 | 6 | import toml 7 | 8 | from text_recognizer import util 9 | 10 | 11 | def _get_metadata_filename(): 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument("filename", type=str, help="Path to the metadata.toml file to update.") 14 | args = parser.parse_args() 15 | return Path(args.filename).resolve() 16 | 17 | 18 | def main(): 19 | metadata_filename = _get_metadata_filename() 20 | 21 | metadata = toml.load(metadata_filename) 22 | 23 | data_filename = metadata_filename.parents[0] / metadata["filename"] 24 | supposed_data_sha256 = metadata["sha256"] 25 | actual_data_sha256 = util.compute_sha256(data_filename) 26 | 27 | if supposed_data_sha256 == actual_data_sha256: 28 | print("Nothing to update: SHA-256 matches") 29 | return 30 | 31 | print("Updating metadata SHA-256") 32 | metadata["sha256"] = actual_data_sha256 33 | with open(metadata_filename, "w") as f: 34 | toml.dump(metadata, f) 35 | 36 | 37 | if __name__ == "__main__": 38 | main() 39 | -------------------------------------------------------------------------------- /training/util.py: -------------------------------------------------------------------------------- 1 | """Function to train a model.""" 2 | from time import time 3 | 4 | from tensorflow.keras.callbacks import EarlyStopping, Callback 5 | 6 | # Hide lines below until Lab 3 7 | import wandb 8 | from wandb.keras import WandbCallback 9 | 10 | # Hide lines above until Lab 3 11 | 12 | from text_recognizer.datasets.dataset import Dataset 13 | from text_recognizer.models.base import Model 14 | 15 | EARLY_STOPPING = True 16 | 17 | 18 | # Hide lines below until Lab 3 19 | class WandbImageLogger(Callback): 20 | """Custom callback for logging image predictions""" 21 | 22 | def __init__(self, model_wrapper: Model, dataset: Dataset, example_count: int = 4): 23 | super().__init__() 24 | self.model_wrapper = model_wrapper 25 | self.val_images = dataset.x_test[:example_count] # type: ignore 26 | 27 | def on_epoch_end(self, epoch, logs=None): 28 | images = [ 29 | wandb.Image(image, caption="{}: {}".format(*self.model_wrapper.predict_on_image(image))) 30 | for i, image in enumerate(self.val_images) 31 | ] 32 | wandb.log({"examples": images}, commit=False) 33 | 34 | 35 | # Hide lines above until Lab 3 36 | 37 | 38 | def train_model(model: Model, dataset: Dataset, epochs: int, batch_size: int, use_wandb: bool = False) -> Model: 39 | """Train model.""" 40 | callbacks = [] 41 | 42 | if EARLY_STOPPING: 43 | early_stopping = EarlyStopping(monitor="val_loss", min_delta=0.01, patience=3, verbose=1, mode="auto") 44 | callbacks.append(early_stopping) 45 | 46 | # Hide lines below until Lab 3 47 | if use_wandb: 48 | image_callback = WandbImageLogger(model, dataset) 49 | wandb_callback = WandbCallback() 50 | callbacks.append(image_callback) 51 | callbacks.append(wandb_callback) 52 | # Hide lines above until Lab 3 53 | 54 | model.network.summary() 55 | 56 | t = time() 57 | _history = model.fit(dataset=dataset, 
batch_size=batch_size, epochs=epochs, callbacks=callbacks) 58 | print("Training took {:.2f} s".format(time() - t)) 59 | 60 | return model 61 | -------------------------------------------------------------------------------- /wandb/settings: -------------------------------------------------------------------------------- 1 | [default] 2 | entity = fsdl 3 | project = fsdl-text-recognizer-nov2019 4 | base_url = https://api.wandb.ai 5 | 6 | --------------------------------------------------------------------------------
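The sweep configurations above (training/sweep_emnist.yaml and training/sweep_iam.yaml) point W&B at training/run_sweep.py, and wandb/settings supplies the default entity and project. A plausible way to launch one of them with the standard wandb CLI is sketched below; the sweep ID is a placeholder printed by the first command:

wandb sweep training/sweep_emnist.yaml                       # registers the sweep and prints its ID
wandb agent fsdl/fsdl-text-recognizer-nov2019/<sweep_id>     # starts a worker that invokes run_sweep.py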