├── .gitignore ├── .pylintrc ├── README.md ├── lambda_function.py ├── package-lock.json ├── package.json ├── requirements ├── requirements-dev.in └── requirements.in ├── serverless.yml ├── tesseract-layer ├── Dockerfile ├── build.sh └── serverless.yml └── test.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | 3 | tesseract-layer/layer 4 | 5 | ## Serverless 6 | node_modules 7 | **/.serverless 8 | 9 | *.png 10 | *.zip 11 | *.ipynb 12 | 13 | .vscode/ 14 | 15 | .envrc 16 | 17 | #pyenv 18 | tessenv/ 19 | -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [BASIC] 2 | 3 | # Good variable names which should always be accepted, separated by a comma 4 | good-names=i,j,k,ex,Run,_ 5 | 6 | # Bad variable names which should always be refused, separated by a comma 7 | bad-names=foo,bar,baz,toto,tutu,tata 8 | 9 | # Colon-delimited sets of names that determine each other's naming style when 10 | # the name regexes allow several styles. 11 | name-group= 12 | 13 | # Include a hint for the correct naming format with invalid-name 14 | include-naming-hint=no 15 | 16 | # List of decorators that produce properties, such as abc.abstractproperty. Add 17 | # to this list to register other decorators that produce valid properties. 18 | property-classes=abc.abstractproperty 19 | 20 | # Regular expression matching correct function names 21 | function-rgx=[a-z_][a-z0-9_]{2,30}$ 22 | 23 | # Naming hint for function names 24 | function-name-hint=[a-z_][a-z0-9_]{2,30}$ 25 | 26 | # Regular expression matching correct variable names 27 | variable-rgx=[a-z_][a-z0-9_]{2,30}$ 28 | 29 | # Naming hint for variable names 30 | variable-name-hint=[a-z_][a-z0-9_]{2,30}$ 31 | 32 | # Regular expression matching correct constant names 33 | const-rgx=(([A-Z_][A-Z0-9_]*)|(__.*__))$ 34 | 35 | # Naming hint for constant names 36 | const-name-hint=(([A-Z_][A-Z0-9_]*)|(__.*__))$ 37 | 38 | # Regular expression matching correct attribute names 39 | attr-rgx=[a-z_][a-z0-9_]{2,30}$ 40 | 41 | # Naming hint for attribute names 42 | attr-name-hint=[a-z_][a-z0-9_]{2,30}$ 43 | 44 | # Regular expression matching correct argument names 45 | argument-rgx=[a-z_][a-z0-9_]{2,30}$ 46 | 47 | # Naming hint for argument names 48 | argument-name-hint=[a-z_][a-z0-9_]{2,30}$ 49 | 50 | # Regular expression matching correct class attribute names 51 | class-attribute-rgx=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$ 52 | 53 | # Naming hint for class attribute names 54 | class-attribute-name-hint=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$ 55 | 56 | # Regular expression matching correct inline iteration names 57 | inlinevar-rgx=[A-Za-z_][A-Za-z0-9_]*$ 58 | 59 | # Naming hint for inline iteration names 60 | inlinevar-name-hint=[A-Za-z_][A-Za-z0-9_]*$ 61 | 62 | # Regular expression matching correct class names 63 | class-rgx=[A-Z_][a-zA-Z0-9]+$ 64 | 65 | # Naming hint for class names 66 | class-name-hint=[A-Z_][a-zA-Z0-9]+$ 67 | 68 | # Regular expression matching correct module names 69 | module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$ 70 | 71 | # Naming hint for module names 72 | module-name-hint=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$ 73 | 74 | # Regular expression matching correct method names 75 | method-rgx=[a-z_][a-z0-9_]{2,30}$ 76 | 77 | # Naming hint for method names 78 | method-name-hint=[a-z_][a-z0-9_]{2,30}$ 79 | 80 | # Regular expression which should only match function or class names that do 81 | # not require a docstring. 82 | no-docstring-rgx=^_ 83 | 84 | # Minimum line length for functions/classes that require docstrings, shorter 85 | # ones are exempt. 86 | docstring-min-length=-1 87 | 88 | [ELIF] 89 | 90 | # Maximum number of nested blocks for function / method body 91 | max-nested-blocks=5 92 | 93 | 94 | [TYPECHECK] 95 | 96 | # Tells whether missing members accessed in mixin class should be ignored. A 97 | # mixin class is detected if its name ends with "mixin" (case insensitive). 98 | ignore-mixin-members=yes 99 | 100 | # List of module names for which member attributes should not be checked 101 | # (useful for modules/projects where namespaces are manipulated during runtime 102 | # and thus existing member attributes cannot be deduced by static analysis. It 103 | # supports qualified module names, as well as Unix pattern matching. 104 | ignored-modules= 105 | 106 | # List of class names for which member attributes should not be checked (useful 107 | # for classes with dynamically set attributes). This supports the use of 108 | # qualified names. 109 | ignored-classes=optparse.Values,thread._local,_thread._local 110 | 111 | # List of members which are set dynamically and missed by pylint inference 112 | # system, and so shouldn't trigger E1101 when accessed. Python regular 113 | # expressions are accepted. 114 | generated-members= 115 | 116 | # List of decorators that produce context managers, such as 117 | # contextlib.contextmanager. Add to this list to register other decorators that 118 | # produce valid context managers. 119 | contextmanager-decorators=contextlib.contextmanager 120 | 121 | 122 | [FORMAT] 123 | 124 | # Maximum number of characters on a single line. 125 | max-line-length=100 126 | 127 | # Regexp for a line that is allowed to be longer than the limit. 128 | ignore-long-lines=^\s*(# )??$ 129 | 130 | # Allow the body of an if to be on the same line as the test if there is no 131 | # else. 132 | single-line-if-stmt=no 133 | 134 | # List of optional constructs for which whitespace checking is disabled. `dict- 135 | # separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}. 136 | # `trailing-comma` allows a space between comma and closing bracket: (a, ). 137 | # `empty-line` allows space-only lines. 138 | no-space-check=trailing-comma,dict-separator 139 | 140 | # Maximum number of lines in a module 141 | max-module-lines=1000 142 | 143 | # String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 144 | # tab). 145 | # Use 2 spaces consistent with TensorFlow style. 146 | indent-string=' ' 147 | 148 | # Number of spaces of indent required inside a hanging or continued line. 149 | indent-after-paren=4 150 | 151 | # Expected format of line ending, e.g. empty (any line ending), LF or CRLF. 152 | expected-line-ending-format= 153 | 154 | 155 | [MISCELLANEOUS] 156 | 157 | # List of note tags to take in consideration, separated by a comma. 158 | notes=FIXME,XXX,TODO 159 | 160 | 161 | [VARIABLES] 162 | 163 | # Tells whether we should check for unused import in __init__ files. 164 | init-import=no 165 | 166 | # A regular expression matching the name of dummy variables (i.e. expectedly 167 | # not used). 168 | dummy-variables-rgx=(_+[a-zA-Z0-9]*?$)|dummy 169 | 170 | # List of additional names supposed to be defined in builtins. Remember that 171 | # you should avoid to define new builtins when possible. 172 | additional-builtins= 173 | 174 | # List of strings which can identify a callback function by name. A callback 175 | # name must start or end with one of those strings. 176 | callbacks=cb_,_cb 177 | 178 | # List of qualified module names which can have objects that can redefine 179 | # builtins. 180 | redefining-builtins-modules=six.moves,future.builtins 181 | 182 | 183 | [LOGGING] 184 | 185 | # Logging modules to check that the string format arguments are in logging 186 | # function parameter format 187 | logging-modules=logging 188 | 189 | 190 | [SIMILARITIES] 191 | 192 | # Minimum lines number of a similarity. 193 | min-similarity-lines=4 194 | 195 | # Ignore comments when computing similarities. 196 | ignore-comments=yes 197 | 198 | # Ignore docstrings when computing similarities. 199 | ignore-docstrings=yes 200 | 201 | # Ignore imports when computing similarities. 202 | ignore-imports=no 203 | 204 | 205 | [SPELLING] 206 | 207 | # Spelling dictionary name. Available dictionaries: none. To make it working 208 | # install python-enchant package. 209 | spelling-dict= 210 | 211 | # List of comma separated words that should not be checked. 212 | spelling-ignore-words= 213 | 214 | # A path to a file that contains private dictionary; one word per line. 215 | spelling-private-dict-file= 216 | 217 | # Tells whether to store unknown words to indicated private dictionary in 218 | # --spelling-private-dict-file option instead of raising a message. 219 | spelling-store-unknown-words=no 220 | 221 | 222 | [IMPORTS] 223 | 224 | # Deprecated modules which should not be used, separated by a comma 225 | deprecated-modules=regsub,TERMIOS,Bastion,rexec 226 | 227 | # Create a graph of every (i.e. internal and external) dependencies in the 228 | # given file (report RP0402 must not be disabled) 229 | import-graph= 230 | 231 | # Create a graph of external dependencies in the given file (report RP0402 must 232 | # not be disabled) 233 | ext-import-graph= 234 | 235 | # Create a graph of internal dependencies in the given file (report RP0402 must 236 | # not be disabled) 237 | int-import-graph= 238 | 239 | # Force import order to recognize a module as part of the standard 240 | # compatibility libraries. 241 | known-standard-library= 242 | 243 | # Force import order to recognize a module as part of a third party library. 244 | known-third-party=enchant 245 | 246 | # Analyse import fallback blocks. This can be used to support both Python 2 and 247 | # 3 compatible code, which means that the block might have code that exists 248 | # only in one or another interpreter, leading to false positives when analysed. 249 | analyse-fallback-blocks=no 250 | 251 | 252 | [DESIGN] 253 | 254 | # Maximum number of arguments for function / method 255 | max-args=7 256 | 257 | # Argument names that match this expression will be ignored. Default to name 258 | # with leading underscore 259 | ignored-argument-names=_.* 260 | 261 | # Maximum number of locals for function / method body 262 | max-locals=15 263 | 264 | # Maximum number of return / yield for function / method body 265 | max-returns=6 266 | 267 | # Maximum number of branch for function / method body 268 | max-branches=12 269 | 270 | # Maximum number of statements in function / method body 271 | max-statements=50 272 | 273 | # Maximum number of parents for a class (see R0901). 274 | max-parents=7 275 | 276 | # Maximum number of attributes for a class (see R0902). 277 | max-attributes=7 278 | 279 | # Minimum number of public methods for a class (see R0903). 280 | min-public-methods=0 281 | 282 | # Maximum number of public methods for a class (see R0904). 283 | max-public-methods=20 284 | 285 | # Maximum number of boolean expressions in a if statement 286 | max-bool-expr=5 287 | 288 | 289 | [CLASSES] 290 | 291 | # List of method names used to declare (i.e. assign) instance attributes. 292 | defining-attr-methods=__init__,__new__,setUp 293 | 294 | # List of valid names for the first argument in a class method. 295 | valid-classmethod-first-arg=cls 296 | 297 | # List of valid names for the first argument in a metaclass class method. 298 | valid-metaclass-classmethod-first-arg=mcs 299 | 300 | # List of member names, which should be excluded from the protected access 301 | # warning. 302 | exclude-protected=_asdict,_fields,_replace,_source,_make 303 | 304 | 305 | [EXCEPTIONS] 306 | 307 | # Exceptions that will emit a warning when being caught. Defaults to 308 | # "Exception" 309 | overgeneral-exceptions=Exception -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Tesseract OCR on AWS Lambda 2 | 3 | AWS Lambda function to run tesseract OCR 4 | 5 | ## Getting Started 6 | 7 | These instructions will get you a copy of the project up and running on your local machine for development and testing purposes. See deployment for notes on how to deploy the project on a live system. 8 | 9 | The idea is to use a docker container to simulate an AWS lambda environment this allows to build binaries against AWS lambda linux env. 10 | In this example I have build [leptonica](http://www.leptonica.com/) and [Tesseract Open Source OCR Engine](https://github.com/tesseract-ocr/tesseract). 11 | 12 | The whole idea is leveraged from [here](https://gist.github.com/barbolo/e59aa45ec8e425a26ec4da1086acfbc7) 13 | 14 | ### Prerequisites 15 | 16 | In order to get started you need docker. 17 | This is a very basic lamdba example and was tested on AWS Lambda Python3.6 environment in 11/2018. 18 | AWS deployment will be automated using [serverless framework](https://serverless.com/) 19 | 20 | ### Installing 21 | 22 | #### Install Node.js (Ubuntu) 23 | 24 | Add latest release, add this PPA 25 | 26 | ```bash 27 | curl -sL https://deb.nodesource.com/setup_10.x | sudo bash - 28 | ``` 29 | 30 | To install the LTS release, use this PPA 31 | 32 | ```bash 33 | curl -sL https://deb.nodesource.com/setup_8.x | sudo bash - 34 | ``` 35 | 36 | Install Nodejs and nvm 37 | 38 | ```bash 39 | sudo apt install nodejs 40 | ``` 41 | 42 | Verify installation 43 | 44 | ```bash 45 | node -v 46 | npm -v 47 | ``` 48 | 49 | Other OS installation guides can be found [here](https://nodejs.org/en/download/package-manager/) 50 | 51 | #### Install Serverless 52 | 53 | ```bash 54 | # Install serverless globally 55 | npm install serverless -g 56 | ``` 57 | 58 | #### Clone Repository 59 | 60 | Clone the repository and follow the install dependencies steps. 61 | 62 | #### Install aws-cli 63 | 64 | ##### Using Python3 venv 65 | 66 | In the project directory create python3 venv 67 | 68 | ```bash 69 | # create venv with name tessenv 70 | python3 -m venv tessenv 71 | ``` 72 | 73 | activate the virtual env 74 | 75 | ```bash 76 | source ./tessenv/bin/activate 77 | ``` 78 | 79 | verify venv is active pip 80 | 81 | ```bash 82 | which pip 83 | #result somepath/tessenv/bin/pip 84 | ``` 85 | 86 | Install aws-cli 87 | 88 | ```bash 89 | pip install awscli 90 | ``` 91 | 92 | ##### Generate AWS access keys 93 | 94 | Follow the AWS [tutorial](https://aws.amazon.com/premiumsupport/knowledge-center/create-access-key/) to create access keys 95 | for your user. 96 | 97 | ##### Setup AWS access keys 98 | 99 | ```bash 100 | $ aws configure 101 | AWS Access Key ID [None]: AKIAIOSFODNN7EXAMPLE(sample) 102 | AWS Secret Access Key [None]: wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY(sample) 103 | Default region name [None]: us-west-2 104 | Default output format [None]: json 105 | ``` 106 | 107 | Test aws access to list available s3 buckets 108 | 109 | ```bash 110 | aws s3 ls 111 | ``` 112 | 113 | Additional [documentation](https://serverless.com/framework/docs/providers/aws/guide/credentials/) 114 | 115 | ### Tesseract lamda layer 116 | 117 | #### Build custom lamda layer 118 | 119 | A previous version of that example packaged all dependencies 120 | into a zip file which made the deployment slow due to the large size. 121 | 122 | One solution is using lambda layer to decouple binary dependencies from the actual lambda code. Both component could be defined in one serverless file but to really leverage decoupelling seperation is recommended. 123 | 124 | [AWS Lambda Layer](https://docs.aws.amazon.com/lambda/latest/dg/configuration-layers.html) 125 | 126 | ```bash 127 | cd tesseract-layer 128 | ``` 129 | 130 | Build lambda layer using lambci/lambda docker container. 131 | 132 | ```bash 133 | ./build.sh 134 | ``` 135 | 136 | By default English best (slow) tesseract model will be 137 | bundled into Lambda layer, but you can override it using 138 | `-m` parameter (for model type) and `-l` parameter (comma-separated 139 | list of languages), for example: 140 | 141 | ```bash 142 | ./build.sh -l eng,por -m fast # downloads FAST models for English and Portugese 143 | ``` 144 | 145 | Verify the folder layer has been created and contains the following folders 146 | 147 | ```bash 148 | $ ls layer 149 | bin #compiled tesseract binary 150 | data #tesseract language package eng 151 | lib #compiled lib dependencies 152 | python #python dependencies 153 | ``` 154 | 155 | Package the lambda layer 156 | 157 | ```bash 158 | serverless package 159 | ``` 160 | 161 | Verify the tesseract-layer/.serverless directory has been created and contains a 38MB file `tesseractPython36.zip`. 162 | 163 | #### Deploy lambda layer 164 | 165 | Deploy tesseractPython36 layer to AWS (requires AWS-CLI with valid AWS access keys) 166 | 167 | ```bash 168 | $ serverless deploy 169 | Serverless: Packaging service... 170 | ... 171 | ... 172 | ... 173 | functions: 174 | None 175 | layers: 176 | tesseractPython36: arn:aws:lambda:ap-southeast-2:***************:layer:tesseractPython36:17 177 | ``` 178 | 179 | #### Update lambda function layer reference 180 | 181 | Every lambda layer deployment will bump up the lambda layer version. 182 | To make sure the lambda function is referencing the correct version update the version part of 183 | returned layer reference. The reference is output of the layer deployment. 184 | 185 | ```bash 186 | $ serverless deploy 187 | ... 188 | ... 189 | ... 190 | layers: 191 | tesseractPython36: arn:aws:lambda:ap-southeast-2:***************:layer:tesseractPython36:17 192 | ``` 193 | 194 | Alternative query the layer verion 195 | 196 | ```bash 197 | aws lambda get-layer-version --layer-name tesseractPython36 --version-number #versionNumber eg. 1 198 | ``` 199 | 200 | Find the and update the version number in the `serverless.yml` file in the root directory 201 | 202 | ```yml 203 | # serverless.yml 204 | . 205 | . 206 | . 207 | tesseract-layer: 208 | name: tesseractPython36 209 | version: 1 210 | . 211 | . 212 | . 213 | ``` 214 | 215 | ### Lambda Deployment 216 | 217 | Switch to the project root directory 218 | 219 | #### Install serverless plugin dependencies 220 | 221 | ```bash 222 | nmp install 223 | ``` 224 | 225 | #### Package lambda function 226 | 227 | ```bash 228 | serverless package 229 | ``` 230 | 231 | #### Deploy lambda function 232 | 233 | ```bash 234 | serverless deploy 235 | ``` 236 | 237 | ### Test OCR Lambda function 238 | 239 | The lambda function is accepting json post request 240 | 241 | ```json 242 | { 243 | "image64": "base64 endcoded image" 244 | } 245 | ``` 246 |
Lambda Test function 247 |

248 | 249 | #### Lambda Test function 250 | 251 | ```json 252 | { 253 | "image64": "" 254 | } 255 | ``` 256 |

257 |
258 | 259 | ## Built With 260 | 261 | * [Tesseract Open Source OCR Engine](https://github.com/tesseract-ocr/tesseract) 262 | * [leptonica](http://www.leptonica.com/) 263 | * [Docker](https://www.docker.com/) 264 | * [Serverless](https://serverless.com/) 265 | 266 | ## Contributing 267 | 268 | Please feel free to comment or contribute especially if your integrating with [serverless](https://serverless.com/) or [AWS SAM](https://docs.aws.amazon.com/lambda/latest/dg/deploying-lambda-apps.html) 269 | 270 | ## Authors 271 | 272 | * **Gerd Wittchen** - *Initial work* - [Idea](https://gist.github.com/barbolo/e59aa45ec8e425a26ec4da1086acfbc7) 273 | 274 | ## License 275 | 276 | This project is licensed under the MIT License - see the [LICENSE.md](LICENSE.md) file for details 277 | -------------------------------------------------------------------------------- /lambda_function.py: -------------------------------------------------------------------------------- 1 | import pytesseract 2 | import PIL.Image 3 | import io 4 | import os 5 | import json 6 | from base64 import b64decode 7 | 8 | 9 | LAMBDA_TASK_ROOT = os.environ.get('LAMBDA_TASK_ROOT', os.path.dirname(os.path.abspath(__file__))) 10 | os.environ["PATH"] += os.pathsep + LAMBDA_TASK_ROOT 11 | 12 | 13 | def call(event, context): 14 | print("Event Passed to Handler: " + json.dumps(event)) 15 | image_base64 = json.loads(event['body'])['image64'] 16 | binary = b64decode(image_base64) 17 | image = PIL.Image.open(io.BytesIO(binary)) 18 | text = pytesseract.image_to_string(image, config='--psm 6') 19 | 20 | message = { 21 | 'test': text 22 | } 23 | return { 24 | 'statusCode': 200, 25 | 'headers': {'Content-Type': 'application/json'}, 26 | 'body': json.dumps(message) 27 | } 28 | -------------------------------------------------------------------------------- /package-lock.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "ocr-test", 3 | "version": "0.1.0", 4 | "lockfileVersion": 1, 5 | "requires": true, 6 | "dependencies": { 7 | "appdirectory": { 8 | "version": "0.1.0", 9 | "resolved": "https://registry.npmjs.org/appdirectory/-/appdirectory-0.1.0.tgz", 10 | "integrity": "sha1-62yBYyDnsqsW9e2ZfyjYIF31Y3U=", 11 | "dev": true 12 | }, 13 | "array-filter": { 14 | "version": "0.0.1", 15 | "resolved": "https://registry.npmjs.org/array-filter/-/array-filter-0.0.1.tgz", 16 | "integrity": "sha1-fajPLiZijtcygDWB/SH2fKzS7uw=", 17 | "dev": true 18 | }, 19 | "array-map": { 20 | "version": "0.0.0", 21 | "resolved": "https://registry.npmjs.org/array-map/-/array-map-0.0.0.tgz", 22 | "integrity": "sha1-iKK6tz0c97zVwbEYoAP2b2ZfpmI=", 23 | "dev": true 24 | }, 25 | "array-reduce": { 26 | "version": "0.0.0", 27 | "resolved": "https://registry.npmjs.org/array-reduce/-/array-reduce-0.0.0.tgz", 28 | "integrity": "sha1-FziZ0//Rx9k4PkR5Ul2+J4yrXys=", 29 | "dev": true 30 | }, 31 | "balanced-match": { 32 | "version": "1.0.0", 33 | "resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.0.tgz", 34 | "integrity": "sha1-ibTRmasr7kneFk6gK4nORi1xt2c=", 35 | "dev": true 36 | }, 37 | "bluebird": { 38 | "version": "3.5.3", 39 | "resolved": "https://registry.npmjs.org/bluebird/-/bluebird-3.5.3.tgz", 40 | "integrity": "sha512-/qKPUQlaW1OyR51WeCPBvRnAlnZFUJkCSG5HzGnuIqhgyJtF+T94lFnn33eiazjRm2LAHVy2guNnaq48X9SJuw==", 41 | "dev": true 42 | }, 43 | "brace-expansion": { 44 | "version": "1.1.11", 45 | "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.11.tgz", 46 | "integrity": "sha512-iCuPHDFgrHX7H2vEI/5xpz07zSHB00TpugqhmYtVmMO6518mCuRMoOYFldEBl0g187ufozdaHgWKcYFb61qGiA==", 47 | "dev": true, 48 | "requires": { 49 | "balanced-match": "^1.0.0", 50 | "concat-map": "0.0.1" 51 | } 52 | }, 53 | "concat-map": { 54 | "version": "0.0.1", 55 | "resolved": "https://registry.npmjs.org/concat-map/-/concat-map-0.0.1.tgz", 56 | "integrity": "sha1-2Klr13/Wjfd5OnMDajug1UBdR3s=", 57 | "dev": true 58 | }, 59 | "core-js": { 60 | "version": "2.3.0", 61 | "resolved": "https://registry.npmjs.org/core-js/-/core-js-2.3.0.tgz", 62 | "integrity": "sha1-+rg/uwstjchfpjbEudNMdUIMbWU=", 63 | "dev": true 64 | }, 65 | "core-util-is": { 66 | "version": "1.0.2", 67 | "resolved": "https://registry.npmjs.org/core-util-is/-/core-util-is-1.0.2.tgz", 68 | "integrity": "sha1-tf1UIgqivFq1eqtxQMlAdUUDwac=", 69 | "dev": true 70 | }, 71 | "es6-promise": { 72 | "version": "3.0.2", 73 | "resolved": "https://registry.npmjs.org/es6-promise/-/es6-promise-3.0.2.tgz", 74 | "integrity": "sha1-AQ1YWEI6XxGJeWZfRkhqlcbuK7Y=", 75 | "dev": true 76 | }, 77 | "fs-extra": { 78 | "version": "7.0.1", 79 | "resolved": "https://registry.npmjs.org/fs-extra/-/fs-extra-7.0.1.tgz", 80 | "integrity": "sha512-YJDaCJZEnBmcbw13fvdAM9AwNOJwOzrE4pqMqBq5nFiEqXUqHwlK4B+3pUw6JNvfSPtX05xFHtYy/1ni01eGCw==", 81 | "dev": true, 82 | "requires": { 83 | "graceful-fs": "^4.1.2", 84 | "jsonfile": "^4.0.0", 85 | "universalify": "^0.1.0" 86 | } 87 | }, 88 | "fs.realpath": { 89 | "version": "1.0.0", 90 | "resolved": "https://registry.npmjs.org/fs.realpath/-/fs.realpath-1.0.0.tgz", 91 | "integrity": "sha1-FQStJSMVjKpA20onh8sBQRmU6k8=", 92 | "dev": true 93 | }, 94 | "glob": { 95 | "version": "7.1.3", 96 | "resolved": "https://registry.npmjs.org/glob/-/glob-7.1.3.tgz", 97 | "integrity": "sha512-vcfuiIxogLV4DlGBHIUOwI0IbrJ8HWPc4MU7HzviGeNho/UJDfi6B5p3sHeWIQ0KGIU0Jpxi5ZHxemQfLkkAwQ==", 98 | "dev": true, 99 | "requires": { 100 | "fs.realpath": "^1.0.0", 101 | "inflight": "^1.0.4", 102 | "inherits": "2", 103 | "minimatch": "^3.0.4", 104 | "once": "^1.3.0", 105 | "path-is-absolute": "^1.0.0" 106 | } 107 | }, 108 | "glob-all": { 109 | "version": "3.1.0", 110 | "resolved": "https://registry.npmjs.org/glob-all/-/glob-all-3.1.0.tgz", 111 | "integrity": "sha1-iRPd+17hrHgSZWJBsD1SF8ZLAqs=", 112 | "dev": true, 113 | "requires": { 114 | "glob": "^7.0.5", 115 | "yargs": "~1.2.6" 116 | } 117 | }, 118 | "graceful-fs": { 119 | "version": "4.1.15", 120 | "resolved": "https://registry.npmjs.org/graceful-fs/-/graceful-fs-4.1.15.tgz", 121 | "integrity": "sha512-6uHUhOPEBgQ24HM+r6b/QwWfZq+yiFcipKFrOFiBEnWdy5sdzYoi+pJeQaPI5qOLRFqWmAXUPQNsielzdLoecA==", 122 | "dev": true 123 | }, 124 | "immediate": { 125 | "version": "3.0.6", 126 | "resolved": "https://registry.npmjs.org/immediate/-/immediate-3.0.6.tgz", 127 | "integrity": "sha1-nbHb0Pr43m++D13V5Wu2BigN5ps=", 128 | "dev": true 129 | }, 130 | "inflight": { 131 | "version": "1.0.6", 132 | "resolved": "https://registry.npmjs.org/inflight/-/inflight-1.0.6.tgz", 133 | "integrity": "sha1-Sb1jMdfQLQwJvJEKEHW6gWW1bfk=", 134 | "dev": true, 135 | "requires": { 136 | "once": "^1.3.0", 137 | "wrappy": "1" 138 | } 139 | }, 140 | "inherits": { 141 | "version": "2.0.3", 142 | "resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.3.tgz", 143 | "integrity": "sha1-Yzwsg+PaQqUC9SRmAiSA9CCCYd4=", 144 | "dev": true 145 | }, 146 | "is-wsl": { 147 | "version": "1.1.0", 148 | "resolved": "https://registry.npmjs.org/is-wsl/-/is-wsl-1.1.0.tgz", 149 | "integrity": "sha1-HxbkqiKwTRM2tmGIpmrzxgDDpm0=", 150 | "dev": true 151 | }, 152 | "isarray": { 153 | "version": "1.0.0", 154 | "resolved": "https://registry.npmjs.org/isarray/-/isarray-1.0.0.tgz", 155 | "integrity": "sha1-u5NdSFgsuhaMBoNJV6VKPgcSTxE=", 156 | "dev": true 157 | }, 158 | "jsonfile": { 159 | "version": "4.0.0", 160 | "resolved": "https://registry.npmjs.org/jsonfile/-/jsonfile-4.0.0.tgz", 161 | "integrity": "sha1-h3Gq4HmbZAdrdmQPygWPnBDjPss=", 162 | "dev": true, 163 | "requires": { 164 | "graceful-fs": "^4.1.6" 165 | } 166 | }, 167 | "jsonify": { 168 | "version": "0.0.0", 169 | "resolved": "https://registry.npmjs.org/jsonify/-/jsonify-0.0.0.tgz", 170 | "integrity": "sha1-LHS27kHZPKUbe1qu6PUDYx0lKnM=", 171 | "dev": true 172 | }, 173 | "jszip": { 174 | "version": "3.1.5", 175 | "resolved": "https://registry.npmjs.org/jszip/-/jszip-3.1.5.tgz", 176 | "integrity": "sha512-5W8NUaFRFRqTOL7ZDDrx5qWHJyBXy6velVudIzQUSoqAAYqzSh2Z7/m0Rf1QbmQJccegD0r+YZxBjzqoBiEeJQ==", 177 | "dev": true, 178 | "requires": { 179 | "core-js": "~2.3.0", 180 | "es6-promise": "~3.0.2", 181 | "lie": "~3.1.0", 182 | "pako": "~1.0.2", 183 | "readable-stream": "~2.0.6" 184 | } 185 | }, 186 | "lie": { 187 | "version": "3.1.1", 188 | "resolved": "https://registry.npmjs.org/lie/-/lie-3.1.1.tgz", 189 | "integrity": "sha1-mkNrLMd0bKWd56QfpGmz77dr2H4=", 190 | "dev": true, 191 | "requires": { 192 | "immediate": "~3.0.5" 193 | } 194 | }, 195 | "lodash.get": { 196 | "version": "4.4.2", 197 | "resolved": "https://registry.npmjs.org/lodash.get/-/lodash.get-4.4.2.tgz", 198 | "integrity": "sha1-LRd/ZS+jHpObRDjVNBSZ36OCXpk=", 199 | "dev": true 200 | }, 201 | "lodash.set": { 202 | "version": "4.3.2", 203 | "resolved": "https://registry.npmjs.org/lodash.set/-/lodash.set-4.3.2.tgz", 204 | "integrity": "sha1-2HV7HagH3eJIFrDWqEvqGnYjCyM=", 205 | "dev": true 206 | }, 207 | "lodash.uniqby": { 208 | "version": "4.7.0", 209 | "resolved": "https://registry.npmjs.org/lodash.uniqby/-/lodash.uniqby-4.7.0.tgz", 210 | "integrity": "sha1-2ZwHpmnp5tJOE2Lf4mbGdhavEwI=", 211 | "dev": true 212 | }, 213 | "lodash.values": { 214 | "version": "4.3.0", 215 | "resolved": "https://registry.npmjs.org/lodash.values/-/lodash.values-4.3.0.tgz", 216 | "integrity": "sha1-o6bCsOvsxcLLocF+bmIP6BtT00c=", 217 | "dev": true 218 | }, 219 | "md5-file": { 220 | "version": "4.0.0", 221 | "resolved": "https://registry.npmjs.org/md5-file/-/md5-file-4.0.0.tgz", 222 | "integrity": "sha512-UC0qFwyAjn4YdPpKaDNw6gNxRf7Mcx7jC1UGCY4boCzgvU2Aoc1mOGzTtrjjLKhM5ivsnhoKpQVxKPp+1j1qwg==", 223 | "dev": true 224 | }, 225 | "minimatch": { 226 | "version": "3.0.4", 227 | "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.0.4.tgz", 228 | "integrity": "sha512-yJHVQEhyqPLUTgt9B83PXu6W3rx4MvvHvSUvToogpwoGDOUQ+yDrR0HRot+yOCdCO7u4hX3pWft6kWBBcqh0UA==", 229 | "dev": true, 230 | "requires": { 231 | "brace-expansion": "^1.1.7" 232 | } 233 | }, 234 | "minimist": { 235 | "version": "0.1.0", 236 | "resolved": "https://registry.npmjs.org/minimist/-/minimist-0.1.0.tgz", 237 | "integrity": "sha1-md9lelJXTCHJBXSX33QnkLK0wN4=", 238 | "dev": true 239 | }, 240 | "once": { 241 | "version": "1.4.0", 242 | "resolved": "https://registry.npmjs.org/once/-/once-1.4.0.tgz", 243 | "integrity": "sha1-WDsap3WWHUsROsF9nFC6753Xa9E=", 244 | "dev": true, 245 | "requires": { 246 | "wrappy": "1" 247 | } 248 | }, 249 | "pako": { 250 | "version": "1.0.7", 251 | "resolved": "https://registry.npmjs.org/pako/-/pako-1.0.7.tgz", 252 | "integrity": "sha512-3HNK5tW4x8o5mO8RuHZp3Ydw9icZXx0RANAOMzlMzx7LVXhMJ4mo3MOBpzyd7r/+RUu8BmndP47LXT+vzjtWcQ==", 253 | "dev": true 254 | }, 255 | "path-is-absolute": { 256 | "version": "1.0.1", 257 | "resolved": "https://registry.npmjs.org/path-is-absolute/-/path-is-absolute-1.0.1.tgz", 258 | "integrity": "sha1-F0uSaHNVNP+8es5r9TpanhtcX18=", 259 | "dev": true 260 | }, 261 | "process-nextick-args": { 262 | "version": "1.0.7", 263 | "resolved": "https://registry.npmjs.org/process-nextick-args/-/process-nextick-args-1.0.7.tgz", 264 | "integrity": "sha1-FQ4gt1ZZCtP5EJPyWk8q2L/zC6M=", 265 | "dev": true 266 | }, 267 | "readable-stream": { 268 | "version": "2.0.6", 269 | "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-2.0.6.tgz", 270 | "integrity": "sha1-j5A0HmilPMySh4jaz80Rs265t44=", 271 | "dev": true, 272 | "requires": { 273 | "core-util-is": "~1.0.0", 274 | "inherits": "~2.0.1", 275 | "isarray": "~1.0.0", 276 | "process-nextick-args": "~1.0.6", 277 | "string_decoder": "~0.10.x", 278 | "util-deprecate": "~1.0.1" 279 | } 280 | }, 281 | "rimraf": { 282 | "version": "2.6.3", 283 | "resolved": "https://registry.npmjs.org/rimraf/-/rimraf-2.6.3.tgz", 284 | "integrity": "sha512-mwqeW5XsA2qAejG46gYdENaxXjx9onRNCfn7L0duuP4hCuTIi/QO7PDK07KJfp1d+izWPrzEJDcSqBa0OZQriA==", 285 | "dev": true, 286 | "requires": { 287 | "glob": "^7.1.3" 288 | } 289 | }, 290 | "serverless-plugin-existing-s3": { 291 | "version": "2.3.0", 292 | "resolved": "https://registry.npmjs.org/serverless-plugin-existing-s3/-/serverless-plugin-existing-s3-2.3.0.tgz", 293 | "integrity": "sha512-lFj/QHvfiYQgCiW6FwZNL537mS5BgM3GyzyzhlMaKulGPQs0Ja4vO/l9eQgjZXQeShLeGDrHlFKY0thVLyFQeg==" 294 | }, 295 | "serverless-pseudo-parameters": { 296 | "version": "2.4.0", 297 | "resolved": "https://registry.npmjs.org/serverless-pseudo-parameters/-/serverless-pseudo-parameters-2.4.0.tgz", 298 | "integrity": "sha512-lb9R62PUFdEAbbYH7pe1wzR7vtIpa8YI8OVcQ5LlLyE0+AxWG4bwEw33X5LE8+5oLwTy57Y/EevnxKnMeyiXxw==" 299 | }, 300 | "serverless-python-requirements": { 301 | "version": "4.2.5", 302 | "resolved": "https://registry.npmjs.org/serverless-python-requirements/-/serverless-python-requirements-4.2.5.tgz", 303 | "integrity": "sha512-dmdgrSLWgJv2g/LIfFdBIkJFn2duNFRZc7De94AHf2ylZS7s+gKJ8Ga090YdaigWLhH65tMA50Ub06SUk0p6EQ==", 304 | "dev": true, 305 | "requires": { 306 | "appdirectory": "^0.1.0", 307 | "bluebird": "^3.0.6", 308 | "fs-extra": "^7.0.0", 309 | "glob-all": "^3.1.0", 310 | "is-wsl": "^1.1.0", 311 | "jszip": "^3.1.0", 312 | "lodash.get": "^4.4.2", 313 | "lodash.set": "^4.3.2", 314 | "lodash.uniqby": "^4.0.0", 315 | "lodash.values": "^4.3.0", 316 | "md5-file": "^4.0.0", 317 | "rimraf": "^2.6.2", 318 | "shell-quote": "^1.6.1" 319 | } 320 | }, 321 | "shell-quote": { 322 | "version": "1.6.1", 323 | "resolved": "https://registry.npmjs.org/shell-quote/-/shell-quote-1.6.1.tgz", 324 | "integrity": "sha1-9HgZSczkAmlxJ0MOo7PFR29IF2c=", 325 | "dev": true, 326 | "requires": { 327 | "array-filter": "~0.0.0", 328 | "array-map": "~0.0.0", 329 | "array-reduce": "~0.0.0", 330 | "jsonify": "~0.0.0" 331 | } 332 | }, 333 | "string_decoder": { 334 | "version": "0.10.31", 335 | "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-0.10.31.tgz", 336 | "integrity": "sha1-YuIDvEF2bGwoyfyEMB2rHFMQ+pQ=", 337 | "dev": true 338 | }, 339 | "universalify": { 340 | "version": "0.1.2", 341 | "resolved": "https://registry.npmjs.org/universalify/-/universalify-0.1.2.tgz", 342 | "integrity": "sha512-rBJeI5CXAlmy1pV+617WB9J63U6XcazHHF2f2dbJix4XzpUF0RS3Zbj0FGIOCAva5P/d/GBOYaACQ1w+0azUkg==", 343 | "dev": true 344 | }, 345 | "util-deprecate": { 346 | "version": "1.0.2", 347 | "resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz", 348 | "integrity": "sha1-RQ1Nyfpw3nMnYvvS1KKJgUGaDM8=", 349 | "dev": true 350 | }, 351 | "wrappy": { 352 | "version": "1.0.2", 353 | "resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz", 354 | "integrity": "sha1-tSQ9jz7BqjXxNkYFvA0QNuMKtp8=", 355 | "dev": true 356 | }, 357 | "yargs": { 358 | "version": "1.2.6", 359 | "resolved": "https://registry.npmjs.org/yargs/-/yargs-1.2.6.tgz", 360 | "integrity": "sha1-nHtKgv1dWVsr8Xq23MQxNUMv40s=", 361 | "dev": true, 362 | "requires": { 363 | "minimist": "^0.1.0" 364 | } 365 | } 366 | } 367 | } 368 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "ocr-test", 3 | "description": "", 4 | "version": "0.1.0", 5 | "dependencies": { 6 | "serverless-plugin-existing-s3": "^2.2.2", 7 | "serverless-pseudo-parameters": "^2.4.0" 8 | }, 9 | "devDependencies": { 10 | "serverless-python-requirements": "^4.2.5" 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /requirements/requirements-dev.in: -------------------------------------------------------------------------------- 1 | cython==0.29.1 2 | pillow==5.3.0 3 | pytesseract==0.2.4 4 | tesserocr==2.3.1 5 | 6 | argparse 7 | requests 8 | 9 | -r requirements.in -------------------------------------------------------------------------------- /requirements/requirements.in: -------------------------------------------------------------------------------- 1 | # tessleract dependencies are provided from lambda-layer 2 | # for local development and testing install requirements-dev.txt -------------------------------------------------------------------------------- /serverless.yml: -------------------------------------------------------------------------------- 1 | service: ocr-test 2 | 3 | package: 4 | exclude: 5 | - .venv/** 6 | - node_modules/** 7 | - .git/** 8 | - '**/*.pyc' 9 | - tesseract-layer/** 10 | - requirements/** 11 | - tests/** 12 | - test* 13 | - .vscode/** 14 | - tessenv/** 15 | 16 | plugins: 17 | - serverless-python-requirements 18 | - serverless-plugin-existing-s3 19 | - serverless-pseudo-parameters 20 | custom: 21 | pythonRequirements: 22 | fileName: requirements/requirements.in 23 | zip: false 24 | dockerizePip: true 25 | app_acronym: ocr-test 26 | default_stage: dev 27 | tesseract-layer: 28 | name: tesseractPython36 29 | version: 1 30 | tessdata: /opt/data/tessdata 31 | stage: ${opt:stage, self:custom.default_stage} 32 | stack_name: ${self:custom.app_acronym}-${self:custom.stage} 33 | region: ${opt:region, self:provider.region} 34 | account_id: 35 | Value: Ref! "AWS::AccountId" 36 | # Put this here rather than in code (presigned URL TTL) 37 | url_default_ttl: 60 38 | 39 | provider: 40 | name: aws 41 | runtime: python3.6 42 | stage: ${opt:stage, 'dev'} 43 | region: ap-southeast-2 44 | environment: 45 | REGION: ${self:custom.region} 46 | URL_DEFAULT_TTL: ${self:custom.url_default_ttl} 47 | TESSDATA_PREFIX: ${self:custom.tessdata} 48 | 49 | functions: 50 | ocr-test: 51 | handler: lambda_function.call 52 | name: ${self:custom.stack_name}-ocr-test 53 | timeout: 60 54 | events: 55 | - http: 56 | method: post 57 | path: /ocr 58 | cors: true 59 | layers: 60 | - "arn:aws:lambda:#{AWS::Region}:#{AWS::AccountId}:layer:${self:custom.tesseract-layer.name}:${self:custom.tesseract-layer.version}" 61 | -------------------------------------------------------------------------------- /tesseract-layer/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM lambci/lambda:build-python3.6 2 | 3 | # define env 4 | ENV LEPTONICA_VERSION=leptonica-1.76.0 5 | ENV TESSERACT_VERSION=4.0.0 6 | ENV PYTHON_VERSION=3.6.1 7 | 8 | # tesseract data parameters 9 | ARG TESSERACT_LANG 10 | ARG TESSERACT_MODE 11 | 12 | ## install dependencies 13 | RUN yum -y clean expire-cache && yum -y makecache fast && yum -y update && yum -y install tar xz gcc gcc-c++ make autoconf aclocal automake libtool findutils \ 14 | libjpeg-devel libpng-devel libtiff-devel zlib-devel \ 15 | libzip-devel freetype-devel lcms2-devel libwebp-devel \ 16 | tcl-devel tk-devel wget tar diffutils autoconf automake \ 17 | libjpeg8-devel libtiff5-devel zlib1g-devel zip 18 | 19 | ## build python 20 | RUN curl -O https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tar.xz 21 | RUN tar xf Python-${PYTHON_VERSION}.tar.xz 22 | RUN mkdir -p /var/task/python 23 | WORKDIR Python-${PYTHON_VERSION} 24 | RUN ./configure --prefix=/var/task/python 25 | RUN make -j9 && make altinstall 26 | 27 | ## build leptonica 28 | RUN mkdir -p "/tmp/${LEPTONICA_VERSION}-build" 29 | WORKDIR "/tmp/${LEPTONICA_VERSION}-build" 30 | RUN curl -L -o "${LEPTONICA_VERSION}.tar.gz" "http://www.leptonica.org/source/${LEPTONICA_VERSION}.tar.gz" \ 31 | && tar -zxvf ${LEPTONICA_VERSION}.tar.gz \ 32 | && cd ${LEPTONICA_VERSION} && ./configure && make && make install 33 | 34 | # build tesseract 35 | RUN mkdir -p "/tmp/tesseract-${TESSERACT_VERSION}-build" 36 | WORKDIR "/tmp/${TESSERACT_VERSION}-build" 37 | RUN wget https://github.com/tesseract-ocr/tesseract/archive/${TESSERACT_VERSION}.tar.gz && tar -zxvf ${TESSERACT_VERSION}.tar.gz \ 38 | && cd tesseract-${TESSERACT_VERSION} && ./autogen.sh && \ 39 | LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib" \ 40 | LIBLEPT_HEADERSDIR="/usr/local/include/leptonica" \ 41 | PKG_CONFIG_PATH="$PKG_CONFIG_PATH:/usr/local/lib/pkgconfig" ./configure \ 42 | && LDFLAGS="-L/usr/local/lib" CFLAGS="-I/usr/local/include" make && make install 43 | 44 | RUN pip install \ 45 | --target=/var/task/python \ 46 | --global-option=build_ext --global-option="-L/var/lang/lib:/var/task/lib" \ 47 | --global-option=build_ext --global-option="-I/var/lang/include/python3.6m:/var/task/lib" \ tesserocr==2.3.1 48 | 49 | RUN pip install \ 50 | --target=/var/task/python/ pytesseract==0.2.5 51 | RUN pip install \ 52 | --target=/var/task/python/ --upgrade cython==0.29.1 53 | RUN pip install \ 54 | --target=/var/task/python/ --upgrade pillow==5.4.0 55 | 56 | RUN mkdir -p /var/task/tessdata 57 | RUN for lang in $TESSERACT_LANG; do wget https://github.com/tesseract-ocr/tessdata_${TESSERACT_MODE}/raw/master/$lang.traineddata -P /var/task/tessdata; done 58 | 59 | RUN rm -rf /var/task/Python-3.6.1* 60 | RUN ls /var/task/python/bin 61 | -------------------------------------------------------------------------------- /tesseract-layer/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | showHelp() { 4 | # `cat << EOF` This means that cat should stop reading when EOF is detected 5 | cat << EOF 6 | Usage: ./build -vnc [-hvnc] 7 | 8 | -h, Display help 9 | 10 | -c, clean rebuild docker container using no-cache 11 | 12 | -l, list of space-delimited tesseract languages (default: eng) 13 | 14 | -m, tesseract model: fast/best (default: best) 15 | 16 | EOF 17 | # EOF is found above and hence cat command stops reading. This is equivalent to echo but much neater when printing out. 18 | } 19 | 20 | 21 | 22 | export DOCKER_ARG="" 23 | TESSERACT_LANG="eng" 24 | TESSERACT_MODE="best" 25 | while getopts "m:l:hc" opt; do 26 | case ${opt} in 27 | h ) showHelp 28 | exit 0 29 | ;; 30 | c ) export DOCKER_ARG="--no-cache" 31 | ;; 32 | m ) TESSERACT_MODE=$OPTARG 33 | ;; 34 | l ) TESSERACT_LANG=$(echo $OPTARG | tr ',' ' ') 35 | ;; 36 | \? ) showHelp 37 | ;; 38 | esac 39 | done 40 | set -e 41 | 42 | echo "$DOCKER_ARG" 43 | 44 | # define required libs 45 | 46 | libarray=(/usr/local/lib/libtesseract.so.4 \ 47 | /usr/local/lib/liblept.so.5 \ 48 | /usr/lib64/libgomp.so.1 \ 49 | /usr/lib64/librt.so \ 50 | /usr/lib64/libz.so \ 51 | /usr/lib64/libm.so \ 52 | /usr/lib64/libpng12.so.0 \ 53 | /usr/lib64/libjpeg.so.62 \ 54 | /usr/lib64/libtiff.so.5 \ 55 | /usr/lib64/libpthread.so \ 56 | /usr/lib64/libstdc++.so.6 \ 57 | /usr/lib64/libjbig.so.2.0 \ 58 | /usr/lib64/libwebp.so.4) 59 | 60 | pythondeps=(tesserocr.cpython-36m-x86_64-linux-gnu.so \ 61 | pytesseract-0.2.5.dist-info \ 62 | pytesseract \ 63 | Pillow-5.4.0.dist-info \ 64 | PIL \ 65 | Cython \ 66 | Cython-0.29.1.dist-info \ 67 | cython.py \ 68 | pyximport) 69 | 70 | binaries=(/var/task/python/bin/cythonize \ 71 | /var/task/python/bin/cython \ 72 | /var/task/python/bin/cygdb ) 73 | 74 | 75 | LAMBDA_DIR=layer 76 | 77 | rm -rf layer 78 | mkdir -p layer/python/bin 79 | mkdir -p layer/{lib,bin,data} 80 | docker build $DOCKER_ARG --build-arg TESSERACT_LANG="$TESSERACT_LANG" --build-arg TESSERACT_MODE="$TESSERACT_MODE" -t tessleract-builder -f Dockerfile . 81 | CONTAINER=$(docker run -d tessleract-builder false) 82 | 83 | # copy libs 84 | for lib in "${libarray[@]}" 85 | do 86 | docker cp -L \ 87 | $CONTAINER:$lib $LAMBDA_DIR/lib 88 | done 89 | 90 | # copy python dependencies 91 | for dep in "${pythondeps[@]}" 92 | do 93 | docker cp \ 94 | $CONTAINER:/var/task/python/$dep $LAMBDA_DIR/python/. 95 | done 96 | 97 | 98 | # copy python binaries 99 | for bin in "${binaries[@]}" 100 | do 101 | docker cp -a \ 102 | $CONTAINER:$bin $LAMBDA_DIR/python/bin 103 | done 104 | 105 | 106 | docker cp \ 107 | $CONTAINER:/var/task/tessdata $LAMBDA_DIR/data/tessdata 108 | 109 | docker cp \ 110 | $CONTAINER:/usr/local/bin/tesseract $LAMBDA_DIR/bin/ 111 | 112 | docker rm $CONTAINER 113 | -------------------------------------------------------------------------------- /tesseract-layer/serverless.yml: -------------------------------------------------------------------------------- 1 | service: python-tesseract-layers 2 | frameworkVersion: ">=1.34.0 <2.0.0" 3 | 4 | custom: 5 | default_stage: dev 6 | stage: ${opt:stage, self:custom.default_stage} 7 | tessdata: tessdata 8 | region: ap-southeast-2 9 | 10 | provider: 11 | name: aws 12 | stage: ${opt:stage, 'dev'} 13 | region: ap-southeast-2 14 | environment: 15 | REGION: ${self:custom.region} 16 | TESSDATA_PREFIX: ${self:custom.tessdata} 17 | 18 | layers: 19 | tesseractPython36: 20 | path: layer 21 | description: A Layer to support tesseract in Python 3.6 Lambda functions 22 | compatibleRuntimes: 23 | - python3.6 -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import argparse 3 | 4 | parser = argparse.ArgumentParser() 5 | parser.add_argument("--url", help="aws lambda url") 6 | args = parser.parse_args() 7 | print(args.url) 8 | 9 | url = args.url 10 | 11 | payload = "{\"image64\": \"\"\n}" 12 | headers = { 13 | 'content-type': "application/json", 14 | 'cache-control': "no-cache", 15 | 'postman-token': "97476aaa-a889-85e0-e409-d4010c29a662" 16 | } 17 | 18 | response = requests.request("POST", url, data=payload, headers=headers) 19 | 20 | print(response.text) --------------------------------------------------------------------------------