├── .gitattributes
├── .gitignore
├── Dockerfile
├── LICENCE
├── README.md
├── __init__.py
├── app.env
├── app.py
├── helpers
│   ├── __init__.py
│   ├── logger.py
│   └── temp.py
├── lib
│   ├── __init__.py
│   ├── models
│   │   ├── jar-nojar
│   │   │   ├── deploy.prototxt
│   │   │   └── weights.caffemodel
│   │   └── text-notext
│   │       ├── deploy.prototxt
│   │       └── weights.caffemodel
│   └── selectivesearch.py
├── requirements.txt
├── set_env.sh
└── stages
    ├── __init__.py
    ├── region_proposal
    │   ├── __init__.py
    │   ├── extract_seal.py
    │   ├── region_grouping.py
    │   └── region_search.py
    ├── symbol_classification.py
    ├── symbol_segmentation.py
    └── text_region_extraction
        ├── __init__.py
        ├── region_classification.py
        └── text_region_formulation.py

--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
*.png filter=lfs diff=lfs merge=lfs -text
*.jpg filter=lfs diff=lfs merge=lfs -text
*.jpeg filter=lfs diff=lfs merge=lfs -text
*.gif filter=lfs diff=lfs merge=lfs -text
*.caffemodel filter=lfs diff=lfs merge=lfs -text
*.solverstate filter=lfs diff=lfs merge=lfs -text

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.pyc
__pycache__
*.log
.DS_Store

--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
FROM koallen/anaconda-caffe:gpu

# create an alias for caffe
RUN echo "alias caffe='/root/caffe/build/tools/caffe'" >> ~/.bashrc

# install OpenCV
RUN conda install -y opencv

# change working directory
WORKDIR /root/workspace/

# install python requirements
COPY requirements.txt /root/workspace/requirements.txt
RUN pip install -r requirements.txt

# launch command line
ENTRYPOINT ["/bin/bash"]

--------------------------------------------------------------------------------
/LICENCE:
--------------------------------------------------------------------------------
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "{}"
      replaced with your own identifying information. (Don't include
      the brackets!) The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright 2017 Satish Palaniappan, Ronojoy Adhikari

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Indus Script OCR

Given images of Indus seals from archaeological sites, this project automatically locates text patches/regions, segments the individual symbols/characters within those regions, and identifies each symbol/character of the Indus script, using image processing and deep learning techniques. [WIP]

View our research article titled "__Deep Learning the Indus Script__", available on arXiv: [arXiv:1702.00523v1](https://arxiv.org/abs/1702.00523v1)

## Deploying the app

- Set up the GPU machine to run the service:
  - Install the latest NVIDIA drivers from `http://www.geforce.com/drivers`
  - Install the nvidia-docker plug-in on top of Docker, from `https://github.com/NVIDIA/nvidia-docker/releases`
  - Make sure you have `git-lfs` installed (https://git-lfs.github.com/)

- Launch the service:
  - Build the docker image: `nvidia-docker build --no-cache=true -t indus-script-ocr:latest .`
  - Launch a docker container: `nvidia-docker run -it -v "$PWD":/root/workspace --rm --env-file app.env --name indus-script-ocr-service indus-script-ocr:latest`

## Press Coverage

- [The Verge](http://www.theverge.com/2017/1/25/14371450/indus-valley-civilization-ancient-seals-symbols-language-algorithms-ai#EQQA6r)
- [The Hindu](http://www.thehindu.com/sci-tech/science/chennai-team-taps-ai-to-read-indus-script/article17448690.ece)
- [Times of India](http://timesofindia.indiatimes.com/city/chennai/app-may-help-decipher-indus-valley-symbols/articleshow/57281369.cms)
- [SBS Radio, Australia](http://www.sbs.com.au/yourlanguage/tamil/en/content/app-decipher-ancient-symbols?language=en)

## Talks

- **Indian Deep Learning Initiative (IDLI):** [slide deck](https://github.com/tpsatish95/talks/blob/master/Deep\%20learning\%20based\%20OCR\%20engine\%20for\%20the\%20Indus\%20script\%20-\%20IDLI\%20Talk.pdf), [video](https://www.youtube.com/watch?v=qPF1oR9yMNY), [link](https://www.facebook.com/groups/idliai/)
- **ThoughtWorks Geek Night:** [slide deck](https://github.com/tpsatish95/talks/blob/master/Deep\%20learning\%20based\%20OCR\%20engine\%20for\%20the\%20Indus\%20script\%20-\%20TW\%20Geek\%20Night.pdf), [video](https://www.youtube.com/watch?v=g7v4QaCD-UQ), [link](https://twchennai.github.io/geeknight/edition-43.html)
- **ChennaiPy:** [link](http://chennaipy.org/may-2017-meet-minutes.html)
- **Anthill Inside 2017:** [proposal](https://anthillinside.talkfunnel.com/2017/15-deep-learning-based-ocr-engine-for-the-indus-scrip)

## Citation

Please cite `indus-script-ocr` in your publications if it helps your research:

    @article{palaniappan2017deep,
      title={Deep Learning the Indus Script},
      author={Palaniappan, Satish and Adhikari, Ronojoy},
      journal={arXiv preprint arXiv:1702.00523},
      year={2017}
    }
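Once the container is up, `app.py` (included below) drives the full pipeline. A minimal usage sketch, assuming the `app.env` variables are exported into the process (the `--env-file` flag above does this); the image path is a placeholder:

    # inside the container, from /root/workspace:
    #   python app.py seals/example_seal.jpg
    # or programmatically:
    from app import process

    # runs region proposal -> text extraction -> segmentation -> classification
    process("seals/example_seal.jpg")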
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tpsatish95/indus-script-ocr/3bc0c2f92c7e7926dab9efb1af29d92753d20672/__init__.py
--------------------------------------------------------------------------------
/app.env:
--------------------------------------------------------------------------------
# environment variables

# paths
CAFFE_PATH=/root/caffe/build/tools/caffe
TEXT_NOTEXT_MODELS_DIR=./lib/models/text-notext
JAR_NOJAR_MODELS_DIR=./lib/models/jar-nojar

# gpu computations switch (1 -> GPU computations, 0 -> CPU computations)
IS_GPU=1

# set the logger level (10 -> DEBUG, 20 -> INFO)
LOG_LEVEL=10

# suppress caffe logs
# GLOG_minloglevel=2
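A sketch of how these switches are plausibly consumed at runtime: `helpers/logger.py` (further below) really does read `LOG_LEVEL` from `os.environ`; the `CAFFE_PATH`/`IS_GPU` usage shown here is an assumption about the stage modules, which are not included in this section:

    import os

    import caffe  # pycaffe, available inside the anaconda-caffe image

    caffe_binary = os.environ["CAFFE_PATH"]             # caffe CLI tool
    text_models = os.environ["TEXT_NOTEXT_MODELS_DIR"]  # text/no-text CNN
    jar_models = os.environ["JAR_NOJAR_MODELS_DIR"]     # jar/no-jar CNN

    # 1 -> GPU computations, 0 -> CPU computations
    if os.environ.get("IS_GPU", "0") == "1":
        caffe.set_mode_gpu()
    else:
        caffe.set_mode_cpu()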
--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
import sys

import skimage.io

from helpers import logger

import stages.region_proposal.extract_seal as region_proposal_extract_seal
import stages.region_proposal.region_grouping as region_proposal_region_grouping
import stages.region_proposal.region_search as region_proposal_region_search

import stages.text_region_extraction.region_classification as text_region_extraction_region_classification
import stages.text_region_extraction.text_region_formulation as text_region_extraction_text_region_formulation

from stages import symbol_segmentation, symbol_classification

LOGGER = logger.create_logger(__name__)


def get_new_image_dimensions(image):

    LOGGER.info("Calculating the new image dimensions ...")

    img = skimage.io.imread(image.name)
    width = len(img[0])
    height = len(img)

    # default, so new_size is always bound (the original elif chain could
    # fall through for some dimension ranges and raise a NameError)
    new_size = 256

    if width * height < 256 * 256 * (0.95) and abs(width - height) <= 3:
        new_size = 512
    elif width * height < 220 * 220 * (1.11):
        new_size = 256
    elif width * height < 256 * 256:
        new_size = 256
    elif width * height > 512 * 512 * (0.99) and width < 800 and height < 800:
        new_size = 512
    elif width * height < 512 * 512 * (0.95) and width * height > 256 * 256 * (1.15):
        new_size = 512

    # rescale to new_size wide, preserving the aspect ratio
    new_height = int(new_size * height / width)
    new_width = new_size

    return new_width, new_height


def get_text_regions(seal, new_width, new_height):
    # region_proposal
    candidate_regions = \
        region_proposal_region_search.get_candidate_regions(seal, new_width, new_height)
    LOGGER.info(candidate_regions)
    grouped_regions = \
        region_proposal_region_grouping.group_candidate_regions(candidate_regions, new_width, new_height)
    LOGGER.info(grouped_regions)

    # text_region_extraction
    text_regions, no_text_regions, both_regions = \
        text_region_extraction_region_classification.process_regions(seal, grouped_regions, new_width, new_height)
    formulated_text_regions = \
        text_region_extraction_text_region_formulation.process_regions(text_regions, no_text_regions, both_regions, new_width, new_height)

    return formulated_text_regions


def get_best_text_regions(seal, new_width, new_height):
    orig_image = skimage.io.imread(seal.name)
    orig_width = len(orig_image[0])
    orig_height = len(orig_image)

    all_dimensions = set([256, 512, orig_width])
    tried_dimensions = set()

    while True:
        tried_dimensions.add(new_width)
        text_regions = get_text_regions(seal, new_width, new_height)

        # min area check
        is_less_min_area = False
        for x, y, w, h in text_regions:
            if w * h < new_width * new_height * 0.20 and (w < new_width * 0.20 or h < new_height * 0.20):
                is_less_min_area = True

        # retry with an untried dimension if nothing usable was found
        if (len(text_regions) == 0 or is_less_min_area) and len(tried_dimensions) < 3:
            new_width = list(all_dimensions - tried_dimensions)[0]
            new_height = int(new_width * orig_height / orig_width)
            LOGGER.info("New size being tried: " + str(new_width))
        else:
            return text_regions, new_width, new_height


def process(image_path):
    seal = region_proposal_extract_seal.crop_white(image_path)
    new_width, new_height = get_new_image_dimensions(seal)
    best_text_regions, updated_width, updated_height = get_best_text_regions(seal, new_width, new_height)
    symbols = symbol_segmentation.get_symbols(seal, best_text_regions, updated_width, updated_height)
    symbol_sequence = symbol_classification.process_symbols(symbols)

    LOGGER.info("The symbol sequence: " + str([s[1] for s in symbol_sequence]))


if __name__ == "__main__":
    input_artifact_image_path = sys.argv[1]
    process(input_artifact_image_path)

--------------------------------------------------------------------------------
/helpers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tpsatish95/indus-script-ocr/3bc0c2f92c7e7926dab9efb1af29d92753d20672/helpers/__init__.py
--------------------------------------------------------------------------------
/helpers/logger.py:
--------------------------------------------------------------------------------
import logging
import logging.handlers
import os
import signal
import sys

from pythonjsonlogger import jsonlogger


CONSOLE_FORMATTER = logging.Formatter('[%(asctime)s] %(levelname)s --- %(message)s ' +
                                      '(%(filename)s:%(lineno)d - %(funcName)s())',
                                      datefmt='%Y-%m-%d %H:%M:%S')

JSON_FORMATTER = jsonlogger.JsonFormatter('%(asctime)s %(levelname)s %(message)s ' +
                                          '%(filename)s %(module)s %(funcName)s %(lineno)d ',
                                          datefmt='%Y-%m-%d %H:%M:%S')


LOG_HANDLER_CONSOLE = logging.StreamHandler(stream=sys.stdout)
LOG_HANDLER_CONSOLE.setLevel(logging.INFO)
LOG_HANDLER_CONSOLE.setFormatter(CONSOLE_FORMATTER)

LOG_HANDLER_FILE = logging.FileHandler("indus_script_ocr.log", mode="a")
LOG_HANDLER_FILE.setLevel(logging.DEBUG)
LOG_HANDLER_FILE.setFormatter(JSON_FORMATTER)


def create_logger(caller_name):

    logger = logging.getLogger(caller_name)
    logger.propagate = False
    logger.setLevel(int(os.environ["LOG_LEVEL"]))
    logger.addHandler(LOG_HANDLER_CONSOLE)
    logger.addHandler(LOG_HANDLER_FILE)

    return logger


def signal_handler(signum, frame):  # renamed from "signal" to avoid shadowing the module
    logger = create_logger(__name__)
    logger.info("App terminated!")
    logging.shutdown()
    sys.exit(0)


signal.signal(signal.SIGINT, signal_handler)

--------------------------------------------------------------------------------
/helpers/temp.py:
--------------------------------------------------------------------------------
import os
import shutil
import tempfile


class TemporaryDirectory(object):
    def __init__(self):
        self.name = tempfile.mkdtemp()

    def cleanup(self):
        shutil.rmtree(self.name)


class TemporaryFile(object):
    def __init__(self, ext=""):
        # create the file with the extension directly; the original appended
        # ext to .name after creation, which left the real file on disk
        # without the extension and cleanup() pointing at a non-existent path
        temp_file = tempfile.NamedTemporaryFile(suffix=ext, delete=False)

        self.fd = temp_file
        self.name = temp_file.name

    def cleanup(self):
        os.unlink(self.name)
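A short usage sketch for these helpers (hypothetical caller): unlike an auto-deleting `tempfile.NamedTemporaryFile`, these wrappers hand out plain filesystem paths that survive until `cleanup()` is called explicitly, which suits passing intermediate artifacts to external tools such as the `caffe` binary:

    import numpy as np
    import skimage.io

    from helpers.temp import TemporaryDirectory, TemporaryFile

    crop = TemporaryFile(ext=".png")   # e.g. an intermediate region crop
    scratch = TemporaryDirectory()     # scratch space for a pipeline stage
    try:
        img = np.zeros((64, 64), dtype=np.uint8)  # stand-in payload
        skimage.io.imsave(crop.name, img)         # external tools see a real path
    finally:
        crop.cleanup()      # unlinks the file
        scratch.cleanup()   # removes the directory tree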
-------------------------------------------------------------------------------- /lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tpsatish95/indus-script-ocr/3bc0c2f92c7e7926dab9efb1af29d92753d20672/lib/__init__.py -------------------------------------------------------------------------------- /lib/models/jar-nojar/deploy.prototxt: -------------------------------------------------------------------------------- 1 | name: "indus" 2 | 3 | input: "data" 4 | input_dim: 1 5 | input_dim: 1 6 | input_dim: 64 7 | input_dim: 64 8 | 9 | layers { 10 | name: "conv1" 11 | type: CONVOLUTION 12 | bottom: "data" 13 | top: "conv1" 14 | blobs_lr: 1 15 | blobs_lr: 2 16 | convolution_param { 17 | num_output: 20 18 | kernel_size: 5 19 | stride: 1 20 | weight_filler { 21 | type: "xavier" 22 | } 23 | bias_filler { 24 | type: "constant" 25 | } 26 | } 27 | } 28 | layers { 29 | name: "conv2" 30 | type: CONVOLUTION 31 | bottom: "conv1" 32 | top: "conv2" 33 | blobs_lr: 1 34 | blobs_lr: 2 35 | convolution_param { 36 | num_output: 50 37 | kernel_size: 5 38 | stride: 1 39 | weight_filler { 40 | type: "xavier" 41 | } 42 | bias_filler { 43 | type: "constant" 44 | } 45 | } 46 | } 47 | layers{ 48 | name: "dropout" 49 | type: DROPOUT 50 | bottom: "conv2" 51 | top: "dropout" 52 | } 53 | layers { 54 | name: "ip1" 55 | type: INNER_PRODUCT 56 | bottom: "dropout" 57 | top: "ip1" 58 | blobs_lr: 1 59 | blobs_lr: 2 60 | inner_product_param { 61 | num_output: 500 62 | weight_filler { 63 | type: "xavier" 64 | } 65 | bias_filler { 66 | type: "constant" 67 | } 68 | } 69 | } 70 | layers { 71 | name: "relu1" 72 | type: RELU 73 | bottom: "ip1" 74 | top: "ip1" 75 | } 76 | layers { 77 | name: "ip2" 78 | type: INNER_PRODUCT 79 | bottom: "ip1" 80 | top: "ip2" 81 | blobs_lr: 1 82 | blobs_lr: 2 83 | inner_product_param { 84 | num_output: 2 85 | weight_filler { 86 | type: "xavier" 87 | } 88 | bias_filler { 89 | type: "constant" 90 | } 91 | } 92 | } 93 | layers { 94 | name: "prob" 95 | type: SOFTMAX 96 | bottom: "ip2" 97 | top: "prob" 98 | } 99 | -------------------------------------------------------------------------------- /lib/models/jar-nojar/weights.caffemodel: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:1aee444aa39b782fc575cb3cb1ee700de5a23c67e6f0949a72cbf9f8df3b5c5b 3 | size 313708914 4 | -------------------------------------------------------------------------------- /lib/models/text-notext/deploy.prototxt: -------------------------------------------------------------------------------- 1 | name: "GoogleNet" 2 | input: "data" 3 | input_shape { 4 | dim: 1 5 | dim: 3 6 | dim: 224 7 | dim: 224 8 | } 9 | layer { 10 | name: "conv1/7x7_s2" 11 | type: "Convolution" 12 | bottom: "data" 13 | top: "conv1/7x7_s2" 14 | param { 15 | lr_mult: 0 16 | decay_mult: 0 17 | } 18 | param { 19 | lr_mult: 0 20 | decay_mult: 0 21 | } 22 | convolution_param { 23 | num_output: 64 24 | pad: 3 25 | kernel_size: 7 26 | stride: 2 27 | weight_filler { 28 | type: "xavier" 29 | std: 0.1 30 | } 31 | bias_filler { 32 | type: "constant" 33 | value: 0.2 34 | } 35 | } 36 | } 37 | layer { 38 | name: "conv1/relu_7x7" 39 | type: "ReLU" 40 | bottom: "conv1/7x7_s2" 41 | top: "conv1/7x7_s2" 42 | } 43 | layer { 44 | name: "pool1/3x3_s2" 45 | type: "Pooling" 46 | bottom: "conv1/7x7_s2" 47 | top: "pool1/3x3_s2" 48 | pooling_param { 49 | pool: MAX 50 | kernel_size: 3 51 | stride: 2 52 | } 53 | } 54 | 
layer { 55 | name: "pool1/norm1" 56 | type: "LRN" 57 | bottom: "pool1/3x3_s2" 58 | top: "pool1/norm1" 59 | lrn_param { 60 | local_size: 5 61 | alpha: 0.0001 62 | beta: 0.75 63 | } 64 | } 65 | layer { 66 | name: "conv2/3x3_reduce" 67 | type: "Convolution" 68 | bottom: "pool1/norm1" 69 | top: "conv2/3x3_reduce" 70 | param { 71 | lr_mult: 0 72 | decay_mult: 0 73 | } 74 | param { 75 | lr_mult: 0 76 | decay_mult: 0 77 | } 78 | convolution_param { 79 | num_output: 64 80 | kernel_size: 1 81 | weight_filler { 82 | type: "xavier" 83 | std: 0.1 84 | } 85 | bias_filler { 86 | type: "constant" 87 | value: 0.2 88 | } 89 | } 90 | } 91 | layer { 92 | name: "conv2/relu_3x3_reduce" 93 | type: "ReLU" 94 | bottom: "conv2/3x3_reduce" 95 | top: "conv2/3x3_reduce" 96 | } 97 | layer { 98 | name: "conv2/3x3" 99 | type: "Convolution" 100 | bottom: "conv2/3x3_reduce" 101 | top: "conv2/3x3" 102 | param { 103 | lr_mult: 0 104 | decay_mult: 0 105 | } 106 | param { 107 | lr_mult: 0 108 | decay_mult: 0 109 | } 110 | convolution_param { 111 | num_output: 192 112 | pad: 1 113 | kernel_size: 3 114 | weight_filler { 115 | type: "xavier" 116 | std: 0.03 117 | } 118 | bias_filler { 119 | type: "constant" 120 | value: 0.2 121 | } 122 | } 123 | } 124 | layer { 125 | name: "conv2/relu_3x3" 126 | type: "ReLU" 127 | bottom: "conv2/3x3" 128 | top: "conv2/3x3" 129 | } 130 | layer { 131 | name: "conv2/norm2" 132 | type: "LRN" 133 | bottom: "conv2/3x3" 134 | top: "conv2/norm2" 135 | lrn_param { 136 | local_size: 5 137 | alpha: 0.0001 138 | beta: 0.75 139 | } 140 | } 141 | layer { 142 | name: "pool2/3x3_s2" 143 | type: "Pooling" 144 | bottom: "conv2/norm2" 145 | top: "pool2/3x3_s2" 146 | pooling_param { 147 | pool: MAX 148 | kernel_size: 3 149 | stride: 2 150 | } 151 | } 152 | layer { 153 | name: "inception_3a/1x1" 154 | type: "Convolution" 155 | bottom: "pool2/3x3_s2" 156 | top: "inception_3a/1x1" 157 | param { 158 | lr_mult: 0 159 | decay_mult: 0 160 | } 161 | param { 162 | lr_mult: 0 163 | decay_mult: 0 164 | } 165 | convolution_param { 166 | num_output: 64 167 | kernel_size: 1 168 | weight_filler { 169 | type: "xavier" 170 | std: 0.03 171 | } 172 | bias_filler { 173 | type: "constant" 174 | value: 0.2 175 | } 176 | } 177 | } 178 | layer { 179 | name: "inception_3a/relu_1x1" 180 | type: "ReLU" 181 | bottom: "inception_3a/1x1" 182 | top: "inception_3a/1x1" 183 | } 184 | layer { 185 | name: "inception_3a/3x3_reduce" 186 | type: "Convolution" 187 | bottom: "pool2/3x3_s2" 188 | top: "inception_3a/3x3_reduce" 189 | param { 190 | lr_mult: 0 191 | decay_mult: 0 192 | } 193 | param { 194 | lr_mult: 0 195 | decay_mult: 0 196 | } 197 | convolution_param { 198 | num_output: 96 199 | kernel_size: 1 200 | weight_filler { 201 | type: "xavier" 202 | std: 0.09 203 | } 204 | bias_filler { 205 | type: "constant" 206 | value: 0.2 207 | } 208 | } 209 | } 210 | layer { 211 | name: "inception_3a/relu_3x3_reduce" 212 | type: "ReLU" 213 | bottom: "inception_3a/3x3_reduce" 214 | top: "inception_3a/3x3_reduce" 215 | } 216 | layer { 217 | name: "inception_3a/3x3" 218 | type: "Convolution" 219 | bottom: "inception_3a/3x3_reduce" 220 | top: "inception_3a/3x3" 221 | param { 222 | lr_mult: 0 223 | decay_mult: 0 224 | } 225 | param { 226 | lr_mult: 0 227 | decay_mult: 0 228 | } 229 | convolution_param { 230 | num_output: 128 231 | pad: 1 232 | kernel_size: 3 233 | weight_filler { 234 | type: "xavier" 235 | std: 0.03 236 | } 237 | bias_filler { 238 | type: "constant" 239 | value: 0.2 240 | } 241 | } 242 | } 243 | layer { 244 | name: "inception_3a/relu_3x3" 245 | 
type: "ReLU" 246 | bottom: "inception_3a/3x3" 247 | top: "inception_3a/3x3" 248 | } 249 | layer { 250 | name: "inception_3a/5x5_reduce" 251 | type: "Convolution" 252 | bottom: "pool2/3x3_s2" 253 | top: "inception_3a/5x5_reduce" 254 | param { 255 | lr_mult: 0 256 | decay_mult: 0 257 | } 258 | param { 259 | lr_mult: 0 260 | decay_mult: 0 261 | } 262 | convolution_param { 263 | num_output: 16 264 | kernel_size: 1 265 | weight_filler { 266 | type: "xavier" 267 | std: 0.2 268 | } 269 | bias_filler { 270 | type: "constant" 271 | value: 0.2 272 | } 273 | } 274 | } 275 | layer { 276 | name: "inception_3a/relu_5x5_reduce" 277 | type: "ReLU" 278 | bottom: "inception_3a/5x5_reduce" 279 | top: "inception_3a/5x5_reduce" 280 | } 281 | layer { 282 | name: "inception_3a/5x5" 283 | type: "Convolution" 284 | bottom: "inception_3a/5x5_reduce" 285 | top: "inception_3a/5x5" 286 | param { 287 | lr_mult: 0 288 | decay_mult: 0 289 | } 290 | param { 291 | lr_mult: 0 292 | decay_mult: 0 293 | } 294 | convolution_param { 295 | num_output: 32 296 | pad: 2 297 | kernel_size: 5 298 | weight_filler { 299 | type: "xavier" 300 | std: 0.03 301 | } 302 | bias_filler { 303 | type: "constant" 304 | value: 0.2 305 | } 306 | } 307 | } 308 | layer { 309 | name: "inception_3a/relu_5x5" 310 | type: "ReLU" 311 | bottom: "inception_3a/5x5" 312 | top: "inception_3a/5x5" 313 | } 314 | layer { 315 | name: "inception_3a/pool" 316 | type: "Pooling" 317 | bottom: "pool2/3x3_s2" 318 | top: "inception_3a/pool" 319 | pooling_param { 320 | pool: MAX 321 | kernel_size: 3 322 | stride: 1 323 | pad: 1 324 | } 325 | } 326 | layer { 327 | name: "inception_3a/pool_proj" 328 | type: "Convolution" 329 | bottom: "inception_3a/pool" 330 | top: "inception_3a/pool_proj" 331 | param { 332 | lr_mult: 0 333 | decay_mult: 0 334 | } 335 | param { 336 | lr_mult: 0 337 | decay_mult: 0 338 | } 339 | convolution_param { 340 | num_output: 32 341 | kernel_size: 1 342 | weight_filler { 343 | type: "xavier" 344 | std: 0.1 345 | } 346 | bias_filler { 347 | type: "constant" 348 | value: 0.2 349 | } 350 | } 351 | } 352 | layer { 353 | name: "inception_3a/relu_pool_proj" 354 | type: "ReLU" 355 | bottom: "inception_3a/pool_proj" 356 | top: "inception_3a/pool_proj" 357 | } 358 | layer { 359 | name: "inception_3a/output" 360 | type: "Concat" 361 | bottom: "inception_3a/1x1" 362 | bottom: "inception_3a/3x3" 363 | bottom: "inception_3a/5x5" 364 | bottom: "inception_3a/pool_proj" 365 | top: "inception_3a/output" 366 | } 367 | layer { 368 | name: "inception_3b/1x1" 369 | type: "Convolution" 370 | bottom: "inception_3a/output" 371 | top: "inception_3b/1x1" 372 | param { 373 | lr_mult: 0 374 | decay_mult: 0 375 | } 376 | param { 377 | lr_mult: 0 378 | decay_mult: 0 379 | } 380 | convolution_param { 381 | num_output: 128 382 | kernel_size: 1 383 | weight_filler { 384 | type: "xavier" 385 | std: 0.03 386 | } 387 | bias_filler { 388 | type: "constant" 389 | value: 0.2 390 | } 391 | } 392 | } 393 | layer { 394 | name: "inception_3b/relu_1x1" 395 | type: "ReLU" 396 | bottom: "inception_3b/1x1" 397 | top: "inception_3b/1x1" 398 | } 399 | layer { 400 | name: "inception_3b/3x3_reduce" 401 | type: "Convolution" 402 | bottom: "inception_3a/output" 403 | top: "inception_3b/3x3_reduce" 404 | param { 405 | lr_mult: 0 406 | decay_mult: 0 407 | } 408 | param { 409 | lr_mult: 0 410 | decay_mult: 0 411 | } 412 | convolution_param { 413 | num_output: 128 414 | kernel_size: 1 415 | weight_filler { 416 | type: "xavier" 417 | std: 0.09 418 | } 419 | bias_filler { 420 | type: "constant" 421 | value: 
0.2 422 | } 423 | } 424 | } 425 | layer { 426 | name: "inception_3b/relu_3x3_reduce" 427 | type: "ReLU" 428 | bottom: "inception_3b/3x3_reduce" 429 | top: "inception_3b/3x3_reduce" 430 | } 431 | layer { 432 | name: "inception_3b/3x3" 433 | type: "Convolution" 434 | bottom: "inception_3b/3x3_reduce" 435 | top: "inception_3b/3x3" 436 | param { 437 | lr_mult: 0 438 | decay_mult: 0 439 | } 440 | param { 441 | lr_mult: 0 442 | decay_mult: 0 443 | } 444 | convolution_param { 445 | num_output: 192 446 | pad: 1 447 | kernel_size: 3 448 | weight_filler { 449 | type: "xavier" 450 | std: 0.03 451 | } 452 | bias_filler { 453 | type: "constant" 454 | value: 0.2 455 | } 456 | } 457 | } 458 | layer { 459 | name: "inception_3b/relu_3x3" 460 | type: "ReLU" 461 | bottom: "inception_3b/3x3" 462 | top: "inception_3b/3x3" 463 | } 464 | layer { 465 | name: "inception_3b/5x5_reduce" 466 | type: "Convolution" 467 | bottom: "inception_3a/output" 468 | top: "inception_3b/5x5_reduce" 469 | param { 470 | lr_mult: 0 471 | decay_mult: 0 472 | } 473 | param { 474 | lr_mult: 0 475 | decay_mult: 0 476 | } 477 | convolution_param { 478 | num_output: 32 479 | kernel_size: 1 480 | weight_filler { 481 | type: "xavier" 482 | std: 0.2 483 | } 484 | bias_filler { 485 | type: "constant" 486 | value: 0.2 487 | } 488 | } 489 | } 490 | layer { 491 | name: "inception_3b/relu_5x5_reduce" 492 | type: "ReLU" 493 | bottom: "inception_3b/5x5_reduce" 494 | top: "inception_3b/5x5_reduce" 495 | } 496 | layer { 497 | name: "inception_3b/5x5" 498 | type: "Convolution" 499 | bottom: "inception_3b/5x5_reduce" 500 | top: "inception_3b/5x5" 501 | param { 502 | lr_mult: 0 503 | decay_mult: 0 504 | } 505 | param { 506 | lr_mult: 0 507 | decay_mult: 0 508 | } 509 | convolution_param { 510 | num_output: 96 511 | pad: 2 512 | kernel_size: 5 513 | weight_filler { 514 | type: "xavier" 515 | std: 0.03 516 | } 517 | bias_filler { 518 | type: "constant" 519 | value: 0.2 520 | } 521 | } 522 | } 523 | layer { 524 | name: "inception_3b/relu_5x5" 525 | type: "ReLU" 526 | bottom: "inception_3b/5x5" 527 | top: "inception_3b/5x5" 528 | } 529 | layer { 530 | name: "inception_3b/pool" 531 | type: "Pooling" 532 | bottom: "inception_3a/output" 533 | top: "inception_3b/pool" 534 | pooling_param { 535 | pool: MAX 536 | kernel_size: 3 537 | stride: 1 538 | pad: 1 539 | } 540 | } 541 | layer { 542 | name: "inception_3b/pool_proj" 543 | type: "Convolution" 544 | bottom: "inception_3b/pool" 545 | top: "inception_3b/pool_proj" 546 | param { 547 | lr_mult: 0 548 | decay_mult: 0 549 | } 550 | param { 551 | lr_mult: 0 552 | decay_mult: 0 553 | } 554 | convolution_param { 555 | num_output: 64 556 | kernel_size: 1 557 | weight_filler { 558 | type: "xavier" 559 | std: 0.1 560 | } 561 | bias_filler { 562 | type: "constant" 563 | value: 0.2 564 | } 565 | } 566 | } 567 | layer { 568 | name: "inception_3b/relu_pool_proj" 569 | type: "ReLU" 570 | bottom: "inception_3b/pool_proj" 571 | top: "inception_3b/pool_proj" 572 | } 573 | layer { 574 | name: "inception_3b/output" 575 | type: "Concat" 576 | bottom: "inception_3b/1x1" 577 | bottom: "inception_3b/3x3" 578 | bottom: "inception_3b/5x5" 579 | bottom: "inception_3b/pool_proj" 580 | top: "inception_3b/output" 581 | } 582 | layer { 583 | name: "pool3/3x3_s2" 584 | type: "Pooling" 585 | bottom: "inception_3b/output" 586 | top: "pool3/3x3_s2" 587 | pooling_param { 588 | pool: MAX 589 | kernel_size: 3 590 | stride: 2 591 | } 592 | } 593 | layer { 594 | name: "inception_4a/1x1" 595 | type: "Convolution" 596 | bottom: "pool3/3x3_s2" 597 | top: 
"inception_4a/1x1" 598 | param { 599 | lr_mult: 0 600 | decay_mult: 0 601 | } 602 | param { 603 | lr_mult: 0 604 | decay_mult: 0 605 | } 606 | convolution_param { 607 | num_output: 192 608 | kernel_size: 1 609 | weight_filler { 610 | type: "xavier" 611 | std: 0.03 612 | } 613 | bias_filler { 614 | type: "constant" 615 | value: 0.2 616 | } 617 | } 618 | } 619 | layer { 620 | name: "inception_4a/relu_1x1" 621 | type: "ReLU" 622 | bottom: "inception_4a/1x1" 623 | top: "inception_4a/1x1" 624 | } 625 | layer { 626 | name: "inception_4a/3x3_reduce" 627 | type: "Convolution" 628 | bottom: "pool3/3x3_s2" 629 | top: "inception_4a/3x3_reduce" 630 | param { 631 | lr_mult: 0 632 | decay_mult: 0 633 | } 634 | param { 635 | lr_mult: 0 636 | decay_mult: 0 637 | } 638 | convolution_param { 639 | num_output: 96 640 | kernel_size: 1 641 | weight_filler { 642 | type: "xavier" 643 | std: 0.09 644 | } 645 | bias_filler { 646 | type: "constant" 647 | value: 0.2 648 | } 649 | } 650 | } 651 | layer { 652 | name: "inception_4a/relu_3x3_reduce" 653 | type: "ReLU" 654 | bottom: "inception_4a/3x3_reduce" 655 | top: "inception_4a/3x3_reduce" 656 | } 657 | layer { 658 | name: "inception_4a/3x3" 659 | type: "Convolution" 660 | bottom: "inception_4a/3x3_reduce" 661 | top: "inception_4a/3x3" 662 | param { 663 | lr_mult: 0 664 | decay_mult: 0 665 | } 666 | param { 667 | lr_mult: 0 668 | decay_mult: 0 669 | } 670 | convolution_param { 671 | num_output: 208 672 | pad: 1 673 | kernel_size: 3 674 | weight_filler { 675 | type: "xavier" 676 | std: 0.03 677 | } 678 | bias_filler { 679 | type: "constant" 680 | value: 0.2 681 | } 682 | } 683 | } 684 | layer { 685 | name: "inception_4a/relu_3x3" 686 | type: "ReLU" 687 | bottom: "inception_4a/3x3" 688 | top: "inception_4a/3x3" 689 | } 690 | layer { 691 | name: "inception_4a/5x5_reduce" 692 | type: "Convolution" 693 | bottom: "pool3/3x3_s2" 694 | top: "inception_4a/5x5_reduce" 695 | param { 696 | lr_mult: 0 697 | decay_mult: 0 698 | } 699 | param { 700 | lr_mult: 0 701 | decay_mult: 0 702 | } 703 | convolution_param { 704 | num_output: 16 705 | kernel_size: 1 706 | weight_filler { 707 | type: "xavier" 708 | std: 0.2 709 | } 710 | bias_filler { 711 | type: "constant" 712 | value: 0.2 713 | } 714 | } 715 | } 716 | layer { 717 | name: "inception_4a/relu_5x5_reduce" 718 | type: "ReLU" 719 | bottom: "inception_4a/5x5_reduce" 720 | top: "inception_4a/5x5_reduce" 721 | } 722 | layer { 723 | name: "inception_4a/5x5" 724 | type: "Convolution" 725 | bottom: "inception_4a/5x5_reduce" 726 | top: "inception_4a/5x5" 727 | param { 728 | lr_mult: 0 729 | decay_mult: 0 730 | } 731 | param { 732 | lr_mult: 0 733 | decay_mult: 0 734 | } 735 | convolution_param { 736 | num_output: 48 737 | pad: 2 738 | kernel_size: 5 739 | weight_filler { 740 | type: "xavier" 741 | std: 0.03 742 | } 743 | bias_filler { 744 | type: "constant" 745 | value: 0.2 746 | } 747 | } 748 | } 749 | layer { 750 | name: "inception_4a/relu_5x5" 751 | type: "ReLU" 752 | bottom: "inception_4a/5x5" 753 | top: "inception_4a/5x5" 754 | } 755 | layer { 756 | name: "inception_4a/pool" 757 | type: "Pooling" 758 | bottom: "pool3/3x3_s2" 759 | top: "inception_4a/pool" 760 | pooling_param { 761 | pool: MAX 762 | kernel_size: 3 763 | stride: 1 764 | pad: 1 765 | } 766 | } 767 | layer { 768 | name: "inception_4a/pool_proj" 769 | type: "Convolution" 770 | bottom: "inception_4a/pool" 771 | top: "inception_4a/pool_proj" 772 | param { 773 | lr_mult: 0 774 | decay_mult: 0 775 | } 776 | param { 777 | lr_mult: 0 778 | decay_mult: 0 779 | } 780 | 
convolution_param { 781 | num_output: 64 782 | kernel_size: 1 783 | weight_filler { 784 | type: "xavier" 785 | std: 0.1 786 | } 787 | bias_filler { 788 | type: "constant" 789 | value: 0.2 790 | } 791 | } 792 | } 793 | layer { 794 | name: "inception_4a/relu_pool_proj" 795 | type: "ReLU" 796 | bottom: "inception_4a/pool_proj" 797 | top: "inception_4a/pool_proj" 798 | } 799 | layer { 800 | name: "inception_4a/output" 801 | type: "Concat" 802 | bottom: "inception_4a/1x1" 803 | bottom: "inception_4a/3x3" 804 | bottom: "inception_4a/5x5" 805 | bottom: "inception_4a/pool_proj" 806 | top: "inception_4a/output" 807 | } 808 | layer { 809 | name: "inception_4b/1x1" 810 | type: "Convolution" 811 | bottom: "inception_4a/output" 812 | top: "inception_4b/1x1" 813 | param { 814 | lr_mult: 0 815 | decay_mult: 0 816 | } 817 | param { 818 | lr_mult: 0 819 | decay_mult: 0 820 | } 821 | convolution_param { 822 | num_output: 160 823 | kernel_size: 1 824 | weight_filler { 825 | type: "xavier" 826 | std: 0.03 827 | } 828 | bias_filler { 829 | type: "constant" 830 | value: 0.2 831 | } 832 | } 833 | } 834 | layer { 835 | name: "inception_4b/relu_1x1" 836 | type: "ReLU" 837 | bottom: "inception_4b/1x1" 838 | top: "inception_4b/1x1" 839 | } 840 | layer { 841 | name: "inception_4b/3x3_reduce" 842 | type: "Convolution" 843 | bottom: "inception_4a/output" 844 | top: "inception_4b/3x3_reduce" 845 | param { 846 | lr_mult: 0 847 | decay_mult: 0 848 | } 849 | param { 850 | lr_mult: 0 851 | decay_mult: 0 852 | } 853 | convolution_param { 854 | num_output: 112 855 | kernel_size: 1 856 | weight_filler { 857 | type: "xavier" 858 | std: 0.09 859 | } 860 | bias_filler { 861 | type: "constant" 862 | value: 0.2 863 | } 864 | } 865 | } 866 | layer { 867 | name: "inception_4b/relu_3x3_reduce" 868 | type: "ReLU" 869 | bottom: "inception_4b/3x3_reduce" 870 | top: "inception_4b/3x3_reduce" 871 | } 872 | layer { 873 | name: "inception_4b/3x3" 874 | type: "Convolution" 875 | bottom: "inception_4b/3x3_reduce" 876 | top: "inception_4b/3x3" 877 | param { 878 | lr_mult: 0 879 | decay_mult: 0 880 | } 881 | param { 882 | lr_mult: 0 883 | decay_mult: 0 884 | } 885 | convolution_param { 886 | num_output: 224 887 | pad: 1 888 | kernel_size: 3 889 | weight_filler { 890 | type: "xavier" 891 | std: 0.03 892 | } 893 | bias_filler { 894 | type: "constant" 895 | value: 0.2 896 | } 897 | } 898 | } 899 | layer { 900 | name: "inception_4b/relu_3x3" 901 | type: "ReLU" 902 | bottom: "inception_4b/3x3" 903 | top: "inception_4b/3x3" 904 | } 905 | layer { 906 | name: "inception_4b/5x5_reduce" 907 | type: "Convolution" 908 | bottom: "inception_4a/output" 909 | top: "inception_4b/5x5_reduce" 910 | param { 911 | lr_mult: 0 912 | decay_mult: 0 913 | } 914 | param { 915 | lr_mult: 0 916 | decay_mult: 0 917 | } 918 | convolution_param { 919 | num_output: 24 920 | kernel_size: 1 921 | weight_filler { 922 | type: "xavier" 923 | std: 0.2 924 | } 925 | bias_filler { 926 | type: "constant" 927 | value: 0.2 928 | } 929 | } 930 | } 931 | layer { 932 | name: "inception_4b/relu_5x5_reduce" 933 | type: "ReLU" 934 | bottom: "inception_4b/5x5_reduce" 935 | top: "inception_4b/5x5_reduce" 936 | } 937 | layer { 938 | name: "inception_4b/5x5" 939 | type: "Convolution" 940 | bottom: "inception_4b/5x5_reduce" 941 | top: "inception_4b/5x5" 942 | param { 943 | lr_mult: 0 944 | decay_mult: 0 945 | } 946 | param { 947 | lr_mult: 0 948 | decay_mult: 0 949 | } 950 | convolution_param { 951 | num_output: 64 952 | pad: 2 953 | kernel_size: 5 954 | weight_filler { 955 | type: "xavier" 956 | 
std: 0.03 957 | } 958 | bias_filler { 959 | type: "constant" 960 | value: 0.2 961 | } 962 | } 963 | } 964 | layer { 965 | name: "inception_4b/relu_5x5" 966 | type: "ReLU" 967 | bottom: "inception_4b/5x5" 968 | top: "inception_4b/5x5" 969 | } 970 | layer { 971 | name: "inception_4b/pool" 972 | type: "Pooling" 973 | bottom: "inception_4a/output" 974 | top: "inception_4b/pool" 975 | pooling_param { 976 | pool: MAX 977 | kernel_size: 3 978 | stride: 1 979 | pad: 1 980 | } 981 | } 982 | layer { 983 | name: "inception_4b/pool_proj" 984 | type: "Convolution" 985 | bottom: "inception_4b/pool" 986 | top: "inception_4b/pool_proj" 987 | param { 988 | lr_mult: 0 989 | decay_mult: 0 990 | } 991 | param { 992 | lr_mult: 0 993 | decay_mult: 0 994 | } 995 | convolution_param { 996 | num_output: 64 997 | kernel_size: 1 998 | weight_filler { 999 | type: "xavier" 1000 | std: 0.1 1001 | } 1002 | bias_filler { 1003 | type: "constant" 1004 | value: 0.2 1005 | } 1006 | } 1007 | } 1008 | layer { 1009 | name: "inception_4b/relu_pool_proj" 1010 | type: "ReLU" 1011 | bottom: "inception_4b/pool_proj" 1012 | top: "inception_4b/pool_proj" 1013 | } 1014 | layer { 1015 | name: "inception_4b/output" 1016 | type: "Concat" 1017 | bottom: "inception_4b/1x1" 1018 | bottom: "inception_4b/3x3" 1019 | bottom: "inception_4b/5x5" 1020 | bottom: "inception_4b/pool_proj" 1021 | top: "inception_4b/output" 1022 | } 1023 | layer { 1024 | name: "inception_4c/1x1" 1025 | type: "Convolution" 1026 | bottom: "inception_4b/output" 1027 | top: "inception_4c/1x1" 1028 | param { 1029 | lr_mult: 0 1030 | decay_mult: 0 1031 | } 1032 | param { 1033 | lr_mult: 0 1034 | decay_mult: 0 1035 | } 1036 | convolution_param { 1037 | num_output: 128 1038 | kernel_size: 1 1039 | weight_filler { 1040 | type: "xavier" 1041 | std: 0.03 1042 | } 1043 | bias_filler { 1044 | type: "constant" 1045 | value: 0.2 1046 | } 1047 | } 1048 | } 1049 | layer { 1050 | name: "inception_4c/relu_1x1" 1051 | type: "ReLU" 1052 | bottom: "inception_4c/1x1" 1053 | top: "inception_4c/1x1" 1054 | } 1055 | layer { 1056 | name: "inception_4c/3x3_reduce" 1057 | type: "Convolution" 1058 | bottom: "inception_4b/output" 1059 | top: "inception_4c/3x3_reduce" 1060 | param { 1061 | lr_mult: 0 1062 | decay_mult: 0 1063 | } 1064 | param { 1065 | lr_mult: 0 1066 | decay_mult: 0 1067 | } 1068 | convolution_param { 1069 | num_output: 128 1070 | kernel_size: 1 1071 | weight_filler { 1072 | type: "xavier" 1073 | std: 0.09 1074 | } 1075 | bias_filler { 1076 | type: "constant" 1077 | value: 0.2 1078 | } 1079 | } 1080 | } 1081 | layer { 1082 | name: "inception_4c/relu_3x3_reduce" 1083 | type: "ReLU" 1084 | bottom: "inception_4c/3x3_reduce" 1085 | top: "inception_4c/3x3_reduce" 1086 | } 1087 | layer { 1088 | name: "inception_4c/3x3" 1089 | type: "Convolution" 1090 | bottom: "inception_4c/3x3_reduce" 1091 | top: "inception_4c/3x3" 1092 | param { 1093 | lr_mult: 0 1094 | decay_mult: 0 1095 | } 1096 | param { 1097 | lr_mult: 0 1098 | decay_mult: 0 1099 | } 1100 | convolution_param { 1101 | num_output: 256 1102 | pad: 1 1103 | kernel_size: 3 1104 | weight_filler { 1105 | type: "xavier" 1106 | std: 0.03 1107 | } 1108 | bias_filler { 1109 | type: "constant" 1110 | value: 0.2 1111 | } 1112 | } 1113 | } 1114 | layer { 1115 | name: "inception_4c/relu_3x3" 1116 | type: "ReLU" 1117 | bottom: "inception_4c/3x3" 1118 | top: "inception_4c/3x3" 1119 | } 1120 | layer { 1121 | name: "inception_4c/5x5_reduce" 1122 | type: "Convolution" 1123 | bottom: "inception_4b/output" 1124 | top: "inception_4c/5x5_reduce" 1125 | 
param { 1126 | lr_mult: 0 1127 | decay_mult: 0 1128 | } 1129 | param { 1130 | lr_mult: 0 1131 | decay_mult: 0 1132 | } 1133 | convolution_param { 1134 | num_output: 24 1135 | kernel_size: 1 1136 | weight_filler { 1137 | type: "xavier" 1138 | std: 0.2 1139 | } 1140 | bias_filler { 1141 | type: "constant" 1142 | value: 0.2 1143 | } 1144 | } 1145 | } 1146 | layer { 1147 | name: "inception_4c/relu_5x5_reduce" 1148 | type: "ReLU" 1149 | bottom: "inception_4c/5x5_reduce" 1150 | top: "inception_4c/5x5_reduce" 1151 | } 1152 | layer { 1153 | name: "inception_4c/5x5" 1154 | type: "Convolution" 1155 | bottom: "inception_4c/5x5_reduce" 1156 | top: "inception_4c/5x5" 1157 | param { 1158 | lr_mult: 0 1159 | decay_mult: 0 1160 | } 1161 | param { 1162 | lr_mult: 0 1163 | decay_mult: 0 1164 | } 1165 | convolution_param { 1166 | num_output: 64 1167 | pad: 2 1168 | kernel_size: 5 1169 | weight_filler { 1170 | type: "xavier" 1171 | std: 0.03 1172 | } 1173 | bias_filler { 1174 | type: "constant" 1175 | value: 0.2 1176 | } 1177 | } 1178 | } 1179 | layer { 1180 | name: "inception_4c/relu_5x5" 1181 | type: "ReLU" 1182 | bottom: "inception_4c/5x5" 1183 | top: "inception_4c/5x5" 1184 | } 1185 | layer { 1186 | name: "inception_4c/pool" 1187 | type: "Pooling" 1188 | bottom: "inception_4b/output" 1189 | top: "inception_4c/pool" 1190 | pooling_param { 1191 | pool: MAX 1192 | kernel_size: 3 1193 | stride: 1 1194 | pad: 1 1195 | } 1196 | } 1197 | layer { 1198 | name: "inception_4c/pool_proj" 1199 | type: "Convolution" 1200 | bottom: "inception_4c/pool" 1201 | top: "inception_4c/pool_proj" 1202 | param { 1203 | lr_mult: 0 1204 | decay_mult: 0 1205 | } 1206 | param { 1207 | lr_mult: 0 1208 | decay_mult: 0 1209 | } 1210 | convolution_param { 1211 | num_output: 64 1212 | kernel_size: 1 1213 | weight_filler { 1214 | type: "xavier" 1215 | std: 0.1 1216 | } 1217 | bias_filler { 1218 | type: "constant" 1219 | value: 0.2 1220 | } 1221 | } 1222 | } 1223 | layer { 1224 | name: "inception_4c/relu_pool_proj" 1225 | type: "ReLU" 1226 | bottom: "inception_4c/pool_proj" 1227 | top: "inception_4c/pool_proj" 1228 | } 1229 | layer { 1230 | name: "inception_4c/output" 1231 | type: "Concat" 1232 | bottom: "inception_4c/1x1" 1233 | bottom: "inception_4c/3x3" 1234 | bottom: "inception_4c/5x5" 1235 | bottom: "inception_4c/pool_proj" 1236 | top: "inception_4c/output" 1237 | } 1238 | layer { 1239 | name: "inception_4d/1x1" 1240 | type: "Convolution" 1241 | bottom: "inception_4c/output" 1242 | top: "inception_4d/1x1" 1243 | param { 1244 | lr_mult: 0 1245 | decay_mult: 0 1246 | } 1247 | param { 1248 | lr_mult: 0 1249 | decay_mult: 0 1250 | } 1251 | convolution_param { 1252 | num_output: 112 1253 | kernel_size: 1 1254 | weight_filler { 1255 | type: "xavier" 1256 | std: 0.03 1257 | } 1258 | bias_filler { 1259 | type: "constant" 1260 | value: 0.2 1261 | } 1262 | } 1263 | } 1264 | layer { 1265 | name: "inception_4d/relu_1x1" 1266 | type: "ReLU" 1267 | bottom: "inception_4d/1x1" 1268 | top: "inception_4d/1x1" 1269 | } 1270 | layer { 1271 | name: "inception_4d/3x3_reduce" 1272 | type: "Convolution" 1273 | bottom: "inception_4c/output" 1274 | top: "inception_4d/3x3_reduce" 1275 | param { 1276 | lr_mult: 0 1277 | decay_mult: 0 1278 | } 1279 | param { 1280 | lr_mult: 0 1281 | decay_mult: 0 1282 | } 1283 | convolution_param { 1284 | num_output: 144 1285 | kernel_size: 1 1286 | weight_filler { 1287 | type: "xavier" 1288 | std: 0.09 1289 | } 1290 | bias_filler { 1291 | type: "constant" 1292 | value: 0.2 1293 | } 1294 | } 1295 | } 1296 | layer { 1297 | name: 
"inception_4d/relu_3x3_reduce" 1298 | type: "ReLU" 1299 | bottom: "inception_4d/3x3_reduce" 1300 | top: "inception_4d/3x3_reduce" 1301 | } 1302 | layer { 1303 | name: "inception_4d/3x3" 1304 | type: "Convolution" 1305 | bottom: "inception_4d/3x3_reduce" 1306 | top: "inception_4d/3x3" 1307 | param { 1308 | lr_mult: 0 1309 | decay_mult: 0 1310 | } 1311 | param { 1312 | lr_mult: 0 1313 | decay_mult: 0 1314 | } 1315 | convolution_param { 1316 | num_output: 288 1317 | pad: 1 1318 | kernel_size: 3 1319 | weight_filler { 1320 | type: "xavier" 1321 | std: 0.03 1322 | } 1323 | bias_filler { 1324 | type: "constant" 1325 | value: 0.2 1326 | } 1327 | } 1328 | } 1329 | layer { 1330 | name: "inception_4d/relu_3x3" 1331 | type: "ReLU" 1332 | bottom: "inception_4d/3x3" 1333 | top: "inception_4d/3x3" 1334 | } 1335 | layer { 1336 | name: "inception_4d/5x5_reduce" 1337 | type: "Convolution" 1338 | bottom: "inception_4c/output" 1339 | top: "inception_4d/5x5_reduce" 1340 | param { 1341 | lr_mult: 0 1342 | decay_mult: 0 1343 | } 1344 | param { 1345 | lr_mult: 0 1346 | decay_mult: 0 1347 | } 1348 | convolution_param { 1349 | num_output: 32 1350 | kernel_size: 1 1351 | weight_filler { 1352 | type: "xavier" 1353 | std: 0.2 1354 | } 1355 | bias_filler { 1356 | type: "constant" 1357 | value: 0.2 1358 | } 1359 | } 1360 | } 1361 | layer { 1362 | name: "inception_4d/relu_5x5_reduce" 1363 | type: "ReLU" 1364 | bottom: "inception_4d/5x5_reduce" 1365 | top: "inception_4d/5x5_reduce" 1366 | } 1367 | layer { 1368 | name: "inception_4d/5x5" 1369 | type: "Convolution" 1370 | bottom: "inception_4d/5x5_reduce" 1371 | top: "inception_4d/5x5" 1372 | param { 1373 | lr_mult: 0 1374 | decay_mult: 0 1375 | } 1376 | param { 1377 | lr_mult: 0 1378 | decay_mult: 0 1379 | } 1380 | convolution_param { 1381 | num_output: 64 1382 | pad: 2 1383 | kernel_size: 5 1384 | weight_filler { 1385 | type: "xavier" 1386 | std: 0.03 1387 | } 1388 | bias_filler { 1389 | type: "constant" 1390 | value: 0.2 1391 | } 1392 | } 1393 | } 1394 | layer { 1395 | name: "inception_4d/relu_5x5" 1396 | type: "ReLU" 1397 | bottom: "inception_4d/5x5" 1398 | top: "inception_4d/5x5" 1399 | } 1400 | layer { 1401 | name: "inception_4d/pool" 1402 | type: "Pooling" 1403 | bottom: "inception_4c/output" 1404 | top: "inception_4d/pool" 1405 | pooling_param { 1406 | pool: MAX 1407 | kernel_size: 3 1408 | stride: 1 1409 | pad: 1 1410 | } 1411 | } 1412 | layer { 1413 | name: "inception_4d/pool_proj" 1414 | type: "Convolution" 1415 | bottom: "inception_4d/pool" 1416 | top: "inception_4d/pool_proj" 1417 | param { 1418 | lr_mult: 0 1419 | decay_mult: 0 1420 | } 1421 | param { 1422 | lr_mult: 0 1423 | decay_mult: 0 1424 | } 1425 | convolution_param { 1426 | num_output: 64 1427 | kernel_size: 1 1428 | weight_filler { 1429 | type: "xavier" 1430 | std: 0.1 1431 | } 1432 | bias_filler { 1433 | type: "constant" 1434 | value: 0.2 1435 | } 1436 | } 1437 | } 1438 | layer { 1439 | name: "inception_4d/relu_pool_proj" 1440 | type: "ReLU" 1441 | bottom: "inception_4d/pool_proj" 1442 | top: "inception_4d/pool_proj" 1443 | } 1444 | layer { 1445 | name: "inception_4d/output" 1446 | type: "Concat" 1447 | bottom: "inception_4d/1x1" 1448 | bottom: "inception_4d/3x3" 1449 | bottom: "inception_4d/5x5" 1450 | bottom: "inception_4d/pool_proj" 1451 | top: "inception_4d/output" 1452 | } 1453 | layer { 1454 | name: "inception_4e/1x1" 1455 | type: "Convolution" 1456 | bottom: "inception_4d/output" 1457 | top: "inception_4e/1x1" 1458 | param { 1459 | lr_mult: 0 1460 | decay_mult: 0 1461 | } 1462 | param { 1463 
| lr_mult: 0 1464 | decay_mult: 0 1465 | } 1466 | convolution_param { 1467 | num_output: 256 1468 | kernel_size: 1 1469 | weight_filler { 1470 | type: "xavier" 1471 | std: 0.03 1472 | } 1473 | bias_filler { 1474 | type: "constant" 1475 | value: 0.2 1476 | } 1477 | } 1478 | } 1479 | layer { 1480 | name: "inception_4e/relu_1x1" 1481 | type: "ReLU" 1482 | bottom: "inception_4e/1x1" 1483 | top: "inception_4e/1x1" 1484 | } 1485 | layer { 1486 | name: "inception_4e/3x3_reduce" 1487 | type: "Convolution" 1488 | bottom: "inception_4d/output" 1489 | top: "inception_4e/3x3_reduce" 1490 | param { 1491 | lr_mult: 0 1492 | decay_mult: 0 1493 | } 1494 | param { 1495 | lr_mult: 0 1496 | decay_mult: 0 1497 | } 1498 | convolution_param { 1499 | num_output: 160 1500 | kernel_size: 1 1501 | weight_filler { 1502 | type: "xavier" 1503 | std: 0.09 1504 | } 1505 | bias_filler { 1506 | type: "constant" 1507 | value: 0.2 1508 | } 1509 | } 1510 | } 1511 | layer { 1512 | name: "inception_4e/relu_3x3_reduce" 1513 | type: "ReLU" 1514 | bottom: "inception_4e/3x3_reduce" 1515 | top: "inception_4e/3x3_reduce" 1516 | } 1517 | layer { 1518 | name: "inception_4e/3x3" 1519 | type: "Convolution" 1520 | bottom: "inception_4e/3x3_reduce" 1521 | top: "inception_4e/3x3" 1522 | param { 1523 | lr_mult: 0 1524 | decay_mult: 0 1525 | } 1526 | param { 1527 | lr_mult: 0 1528 | decay_mult: 0 1529 | } 1530 | convolution_param { 1531 | num_output: 320 1532 | pad: 1 1533 | kernel_size: 3 1534 | weight_filler { 1535 | type: "xavier" 1536 | std: 0.03 1537 | } 1538 | bias_filler { 1539 | type: "constant" 1540 | value: 0.2 1541 | } 1542 | } 1543 | } 1544 | layer { 1545 | name: "inception_4e/relu_3x3" 1546 | type: "ReLU" 1547 | bottom: "inception_4e/3x3" 1548 | top: "inception_4e/3x3" 1549 | } 1550 | layer { 1551 | name: "inception_4e/5x5_reduce" 1552 | type: "Convolution" 1553 | bottom: "inception_4d/output" 1554 | top: "inception_4e/5x5_reduce" 1555 | param { 1556 | lr_mult: 0 1557 | decay_mult: 0 1558 | } 1559 | param { 1560 | lr_mult: 0 1561 | decay_mult: 0 1562 | } 1563 | convolution_param { 1564 | num_output: 32 1565 | kernel_size: 1 1566 | weight_filler { 1567 | type: "xavier" 1568 | std: 0.2 1569 | } 1570 | bias_filler { 1571 | type: "constant" 1572 | value: 0.2 1573 | } 1574 | } 1575 | } 1576 | layer { 1577 | name: "inception_4e/relu_5x5_reduce" 1578 | type: "ReLU" 1579 | bottom: "inception_4e/5x5_reduce" 1580 | top: "inception_4e/5x5_reduce" 1581 | } 1582 | layer { 1583 | name: "inception_4e/5x5" 1584 | type: "Convolution" 1585 | bottom: "inception_4e/5x5_reduce" 1586 | top: "inception_4e/5x5" 1587 | param { 1588 | lr_mult: 0 1589 | decay_mult: 0 1590 | } 1591 | param { 1592 | lr_mult: 0 1593 | decay_mult: 0 1594 | } 1595 | convolution_param { 1596 | num_output: 128 1597 | pad: 2 1598 | kernel_size: 5 1599 | weight_filler { 1600 | type: "xavier" 1601 | std: 0.03 1602 | } 1603 | bias_filler { 1604 | type: "constant" 1605 | value: 0.2 1606 | } 1607 | } 1608 | } 1609 | layer { 1610 | name: "inception_4e/relu_5x5" 1611 | type: "ReLU" 1612 | bottom: "inception_4e/5x5" 1613 | top: "inception_4e/5x5" 1614 | } 1615 | layer { 1616 | name: "inception_4e/pool" 1617 | type: "Pooling" 1618 | bottom: "inception_4d/output" 1619 | top: "inception_4e/pool" 1620 | pooling_param { 1621 | pool: MAX 1622 | kernel_size: 3 1623 | stride: 1 1624 | pad: 1 1625 | } 1626 | } 1627 | layer { 1628 | name: "inception_4e/pool_proj" 1629 | type: "Convolution" 1630 | bottom: "inception_4e/pool" 1631 | top: "inception_4e/pool_proj" 1632 | param { 1633 | lr_mult: 0 1634 
| decay_mult: 0 1635 | } 1636 | param { 1637 | lr_mult: 0 1638 | decay_mult: 0 1639 | } 1640 | convolution_param { 1641 | num_output: 128 1642 | kernel_size: 1 1643 | weight_filler { 1644 | type: "xavier" 1645 | std: 0.1 1646 | } 1647 | bias_filler { 1648 | type: "constant" 1649 | value: 0.2 1650 | } 1651 | } 1652 | } 1653 | layer { 1654 | name: "inception_4e/relu_pool_proj" 1655 | type: "ReLU" 1656 | bottom: "inception_4e/pool_proj" 1657 | top: "inception_4e/pool_proj" 1658 | } 1659 | layer { 1660 | name: "inception_4e/output" 1661 | type: "Concat" 1662 | bottom: "inception_4e/1x1" 1663 | bottom: "inception_4e/3x3" 1664 | bottom: "inception_4e/5x5" 1665 | bottom: "inception_4e/pool_proj" 1666 | top: "inception_4e/output" 1667 | } 1668 | layer { 1669 | name: "pool4/3x3_s2" 1670 | type: "Pooling" 1671 | bottom: "inception_4e/output" 1672 | top: "pool4/3x3_s2" 1673 | pooling_param { 1674 | pool: MAX 1675 | kernel_size: 3 1676 | stride: 2 1677 | } 1678 | } 1679 | layer { 1680 | name: "inception_5a/1x1" 1681 | type: "Convolution" 1682 | bottom: "pool4/3x3_s2" 1683 | top: "inception_5a/1x1" 1684 | param { 1685 | lr_mult: 0 1686 | decay_mult: 0 1687 | } 1688 | param { 1689 | lr_mult: 0 1690 | decay_mult: 0 1691 | } 1692 | convolution_param { 1693 | num_output: 256 1694 | kernel_size: 1 1695 | weight_filler { 1696 | type: "xavier" 1697 | std: 0.03 1698 | } 1699 | bias_filler { 1700 | type: "constant" 1701 | value: 0.2 1702 | } 1703 | } 1704 | } 1705 | layer { 1706 | name: "inception_5a/relu_1x1" 1707 | type: "ReLU" 1708 | bottom: "inception_5a/1x1" 1709 | top: "inception_5a/1x1" 1710 | } 1711 | layer { 1712 | name: "inception_5a/3x3_reduce" 1713 | type: "Convolution" 1714 | bottom: "pool4/3x3_s2" 1715 | top: "inception_5a/3x3_reduce" 1716 | param { 1717 | lr_mult: 0 1718 | decay_mult: 0 1719 | } 1720 | param { 1721 | lr_mult: 0 1722 | decay_mult: 0 1723 | } 1724 | convolution_param { 1725 | num_output: 160 1726 | kernel_size: 1 1727 | weight_filler { 1728 | type: "xavier" 1729 | std: 0.09 1730 | } 1731 | bias_filler { 1732 | type: "constant" 1733 | value: 0.2 1734 | } 1735 | } 1736 | } 1737 | layer { 1738 | name: "inception_5a/relu_3x3_reduce" 1739 | type: "ReLU" 1740 | bottom: "inception_5a/3x3_reduce" 1741 | top: "inception_5a/3x3_reduce" 1742 | } 1743 | layer { 1744 | name: "inception_5a/3x3" 1745 | type: "Convolution" 1746 | bottom: "inception_5a/3x3_reduce" 1747 | top: "inception_5a/3x3" 1748 | param { 1749 | lr_mult: 0 1750 | decay_mult: 0 1751 | } 1752 | param { 1753 | lr_mult: 0 1754 | decay_mult: 0 1755 | } 1756 | convolution_param { 1757 | num_output: 320 1758 | pad: 1 1759 | kernel_size: 3 1760 | weight_filler { 1761 | type: "xavier" 1762 | std: 0.03 1763 | } 1764 | bias_filler { 1765 | type: "constant" 1766 | value: 0.2 1767 | } 1768 | } 1769 | } 1770 | layer { 1771 | name: "inception_5a/relu_3x3" 1772 | type: "ReLU" 1773 | bottom: "inception_5a/3x3" 1774 | top: "inception_5a/3x3" 1775 | } 1776 | layer { 1777 | name: "inception_5a/5x5_reduce" 1778 | type: "Convolution" 1779 | bottom: "pool4/3x3_s2" 1780 | top: "inception_5a/5x5_reduce" 1781 | param { 1782 | lr_mult: 0 1783 | decay_mult: 0 1784 | } 1785 | param { 1786 | lr_mult: 0 1787 | decay_mult: 0 1788 | } 1789 | convolution_param { 1790 | num_output: 32 1791 | kernel_size: 1 1792 | weight_filler { 1793 | type: "xavier" 1794 | std: 0.2 1795 | } 1796 | bias_filler { 1797 | type: "constant" 1798 | value: 0.2 1799 | } 1800 | } 1801 | } 1802 | layer { 1803 | name: "inception_5a/relu_5x5_reduce" 1804 | type: "ReLU" 1805 | bottom: 
"inception_5a/5x5_reduce" 1806 | top: "inception_5a/5x5_reduce" 1807 | } 1808 | layer { 1809 | name: "inception_5a/5x5" 1810 | type: "Convolution" 1811 | bottom: "inception_5a/5x5_reduce" 1812 | top: "inception_5a/5x5" 1813 | param { 1814 | lr_mult: 0 1815 | decay_mult: 0 1816 | } 1817 | param { 1818 | lr_mult: 0 1819 | decay_mult: 0 1820 | } 1821 | convolution_param { 1822 | num_output: 128 1823 | pad: 2 1824 | kernel_size: 5 1825 | weight_filler { 1826 | type: "xavier" 1827 | std: 0.03 1828 | } 1829 | bias_filler { 1830 | type: "constant" 1831 | value: 0.2 1832 | } 1833 | } 1834 | } 1835 | layer { 1836 | name: "inception_5a/relu_5x5" 1837 | type: "ReLU" 1838 | bottom: "inception_5a/5x5" 1839 | top: "inception_5a/5x5" 1840 | } 1841 | layer { 1842 | name: "inception_5a/pool" 1843 | type: "Pooling" 1844 | bottom: "pool4/3x3_s2" 1845 | top: "inception_5a/pool" 1846 | pooling_param { 1847 | pool: MAX 1848 | kernel_size: 3 1849 | stride: 1 1850 | pad: 1 1851 | } 1852 | } 1853 | layer { 1854 | name: "inception_5a/pool_proj" 1855 | type: "Convolution" 1856 | bottom: "inception_5a/pool" 1857 | top: "inception_5a/pool_proj" 1858 | param { 1859 | lr_mult: 0 1860 | decay_mult: 0 1861 | } 1862 | param { 1863 | lr_mult: 0 1864 | decay_mult: 0 1865 | } 1866 | convolution_param { 1867 | num_output: 128 1868 | kernel_size: 1 1869 | weight_filler { 1870 | type: "xavier" 1871 | std: 0.1 1872 | } 1873 | bias_filler { 1874 | type: "constant" 1875 | value: 0.2 1876 | } 1877 | } 1878 | } 1879 | layer { 1880 | name: "inception_5a/relu_pool_proj" 1881 | type: "ReLU" 1882 | bottom: "inception_5a/pool_proj" 1883 | top: "inception_5a/pool_proj" 1884 | } 1885 | layer { 1886 | name: "inception_5a/output" 1887 | type: "Concat" 1888 | bottom: "inception_5a/1x1" 1889 | bottom: "inception_5a/3x3" 1890 | bottom: "inception_5a/5x5" 1891 | bottom: "inception_5a/pool_proj" 1892 | top: "inception_5a/output" 1893 | } 1894 | layer { 1895 | name: "inception_5b/1x1" 1896 | type: "Convolution" 1897 | bottom: "inception_5a/output" 1898 | top: "inception_5b/1x1" 1899 | param { 1900 | lr_mult: 0 1901 | decay_mult: 0 1902 | } 1903 | param { 1904 | lr_mult: 0 1905 | decay_mult: 0 1906 | } 1907 | convolution_param { 1908 | num_output: 384 1909 | kernel_size: 1 1910 | weight_filler { 1911 | type: "xavier" 1912 | std: 0.03 1913 | } 1914 | bias_filler { 1915 | type: "constant" 1916 | value: 0.2 1917 | } 1918 | } 1919 | } 1920 | layer { 1921 | name: "inception_5b/relu_1x1" 1922 | type: "ReLU" 1923 | bottom: "inception_5b/1x1" 1924 | top: "inception_5b/1x1" 1925 | } 1926 | layer { 1927 | name: "inception_5b/3x3_reduce" 1928 | type: "Convolution" 1929 | bottom: "inception_5a/output" 1930 | top: "inception_5b/3x3_reduce" 1931 | param { 1932 | lr_mult: 0 1933 | decay_mult: 0 1934 | } 1935 | param { 1936 | lr_mult: 0 1937 | decay_mult: 0 1938 | } 1939 | convolution_param { 1940 | num_output: 192 1941 | kernel_size: 1 1942 | weight_filler { 1943 | type: "xavier" 1944 | std: 0.09 1945 | } 1946 | bias_filler { 1947 | type: "constant" 1948 | value: 0.2 1949 | } 1950 | } 1951 | } 1952 | layer { 1953 | name: "inception_5b/relu_3x3_reduce" 1954 | type: "ReLU" 1955 | bottom: "inception_5b/3x3_reduce" 1956 | top: "inception_5b/3x3_reduce" 1957 | } 1958 | layer { 1959 | name: "inception_5b/3x3" 1960 | type: "Convolution" 1961 | bottom: "inception_5b/3x3_reduce" 1962 | top: "inception_5b/3x3" 1963 | param { 1964 | lr_mult: 0 1965 | decay_mult: 0 1966 | } 1967 | param { 1968 | lr_mult: 0 1969 | decay_mult: 0 1970 | } 1971 | convolution_param { 1972 | 
num_output: 384 1973 | pad: 1 1974 | kernel_size: 3 1975 | weight_filler { 1976 | type: "xavier" 1977 | std: 0.03 1978 | } 1979 | bias_filler { 1980 | type: "constant" 1981 | value: 0.2 1982 | } 1983 | } 1984 | } 1985 | layer { 1986 | name: "inception_5b/relu_3x3" 1987 | type: "ReLU" 1988 | bottom: "inception_5b/3x3" 1989 | top: "inception_5b/3x3" 1990 | } 1991 | layer { 1992 | name: "inception_5b/5x5_reduce" 1993 | type: "Convolution" 1994 | bottom: "inception_5a/output" 1995 | top: "inception_5b/5x5_reduce" 1996 | param { 1997 | lr_mult: 0 1998 | decay_mult: 0 1999 | } 2000 | param { 2001 | lr_mult: 0 2002 | decay_mult: 0 2003 | } 2004 | convolution_param { 2005 | num_output: 48 2006 | kernel_size: 1 2007 | weight_filler { 2008 | type: "xavier" 2009 | std: 0.2 2010 | } 2011 | bias_filler { 2012 | type: "constant" 2013 | value: 0.2 2014 | } 2015 | } 2016 | } 2017 | layer { 2018 | name: "inception_5b/relu_5x5_reduce" 2019 | type: "ReLU" 2020 | bottom: "inception_5b/5x5_reduce" 2021 | top: "inception_5b/5x5_reduce" 2022 | } 2023 | layer { 2024 | name: "inception_5b/5x5" 2025 | type: "Convolution" 2026 | bottom: "inception_5b/5x5_reduce" 2027 | top: "inception_5b/5x5" 2028 | param { 2029 | lr_mult: 0 2030 | decay_mult: 0 2031 | } 2032 | param { 2033 | lr_mult: 0 2034 | decay_mult: 0 2035 | } 2036 | convolution_param { 2037 | num_output: 128 2038 | pad: 2 2039 | kernel_size: 5 2040 | weight_filler { 2041 | type: "xavier" 2042 | std: 0.03 2043 | } 2044 | bias_filler { 2045 | type: "constant" 2046 | value: 0.2 2047 | } 2048 | } 2049 | } 2050 | layer { 2051 | name: "inception_5b/relu_5x5" 2052 | type: "ReLU" 2053 | bottom: "inception_5b/5x5" 2054 | top: "inception_5b/5x5" 2055 | } 2056 | layer { 2057 | name: "inception_5b/pool" 2058 | type: "Pooling" 2059 | bottom: "inception_5a/output" 2060 | top: "inception_5b/pool" 2061 | pooling_param { 2062 | pool: MAX 2063 | kernel_size: 3 2064 | stride: 1 2065 | pad: 1 2066 | } 2067 | } 2068 | layer { 2069 | name: "inception_5b/pool_proj" 2070 | type: "Convolution" 2071 | bottom: "inception_5b/pool" 2072 | top: "inception_5b/pool_proj" 2073 | param { 2074 | lr_mult: 0 2075 | decay_mult: 0 2076 | } 2077 | param { 2078 | lr_mult: 0 2079 | decay_mult: 0 2080 | } 2081 | convolution_param { 2082 | num_output: 128 2083 | kernel_size: 1 2084 | weight_filler { 2085 | type: "xavier" 2086 | std: 0.1 2087 | } 2088 | bias_filler { 2089 | type: "constant" 2090 | value: 0.2 2091 | } 2092 | } 2093 | } 2094 | layer { 2095 | name: "inception_5b/relu_pool_proj" 2096 | type: "ReLU" 2097 | bottom: "inception_5b/pool_proj" 2098 | top: "inception_5b/pool_proj" 2099 | } 2100 | layer { 2101 | name: "inception_5b/output" 2102 | type: "Concat" 2103 | bottom: "inception_5b/1x1" 2104 | bottom: "inception_5b/3x3" 2105 | bottom: "inception_5b/5x5" 2106 | bottom: "inception_5b/pool_proj" 2107 | top: "inception_5b/output" 2108 | } 2109 | layer { 2110 | name: "pool5/7x7_s1" 2111 | type: "Pooling" 2112 | bottom: "inception_5b/output" 2113 | top: "pool5/7x7_s1" 2114 | pooling_param { 2115 | pool: AVE 2116 | kernel_size: 7 2117 | stride: 1 2118 | } 2119 | } 2120 | layer { 2121 | name: "pool5/drop_7x7_s1" 2122 | type: "Dropout" 2123 | bottom: "pool5/7x7_s1" 2124 | top: "pool5/7x7_s1" 2125 | dropout_param { 2126 | dropout_ratio: 0.4 2127 | } 2128 | } 2129 | layer { 2130 | name: "loss3/classifier_indus" 2131 | type: "InnerProduct" 2132 | bottom: "pool5/7x7_s1" 2133 | top: "loss3/classifier_indus" 2134 | param { 2135 | lr_mult: 0 2136 | decay_mult: 0 2137 | } 2138 | param { 2139 | lr_mult: 0 
2140 | decay_mult: 0 2141 | } 2142 | inner_product_param { 2143 | num_output: 3 2144 | weight_filler { 2145 | type: "xavier" 2146 | } 2147 | bias_filler { 2148 | type: "constant" 2149 | value: 0 2150 | } 2151 | } 2152 | } 2153 | layer { 2154 | name: "prob" 2155 | type: "Softmax" 2156 | bottom: "loss3/classifier_indus" 2157 | top: "prob" 2158 | } 2159 | -------------------------------------------------------------------------------- /lib/models/text-notext/weights.caffemodel: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:25ee822afe55a12ddc9f8d3d4d39bb050cfa5b12caff3bcda22c4beb486f0488 3 | size 41272197 4 | -------------------------------------------------------------------------------- /lib/selectivesearch.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | ''' 3 | The MIT License (MIT) 4 | 5 | Copyright (c) 2015-2016 AlpacaDB 6 | Copyright (c) 2016 Oussama ENNAFII 7 | ''' 8 | 9 | import skimage.io 10 | import skimage.feature 11 | import skimage.color 12 | import skimage.transform 13 | import skimage.util 14 | import skimage.segmentation 15 | import numpy 16 | 17 | 18 | # "Selective Search for Object Recognition" by J.R.R. Uijlings et al. 19 | # 20 | # - Modified version with LBP extractor for texture vectorization 21 | 22 | 23 | def _generate_segments(im_orig, scale, sigma, min_size): 24 | """ 25 | segment smallest regions by the algorithm of Felzenszwalb and 26 | Huttenlocher 27 | """ 28 | 29 | # open the image 30 | im_mask = skimage.segmentation.felzenszwalb( 31 | skimage.util.img_as_float(im_orig), scale=scale, sigma=sigma, 32 | min_size=min_size) 33 | 34 | # merge mask channel to the image as a 4th channel 35 | im_orig = numpy.append( 36 | im_orig, numpy.zeros(im_orig.shape[:2])[:, :, numpy.newaxis], axis=2) 37 | im_orig[:, :, 3] = im_mask 38 | 39 | return im_orig 40 | 41 | 42 | def _sim_colour(r1, r2): 43 | """ 44 | calculate the sum of histogram intersection of colour 45 | """ 46 | return sum([min(a, b) for a, b in zip(r1["hist_c"], r2["hist_c"])]) 47 | 48 | 49 | def _sim_texture(r1, r2): 50 | """ 51 | calculate the sum of histogram intersection of texture 52 | """ 53 | return sum([min(a, b) for a, b in zip(r1["hist_t"], r2["hist_t"])]) 54 | 55 | 56 | def _sim_size(r1, r2, imsize): 57 | """ 58 | calculate the size similarity over the image 59 | """ 60 | return 1.0 - (r1["size"] + r2["size"]) / imsize 61 | 62 | 63 | def _sim_fill(r1, r2, imsize): 64 | """ 65 | calculate the fill similarity over the image 66 | """ 67 | bbsize = ( 68 | (max(r1["max_x"], r2["max_x"]) - min(r1["min_x"], r2["min_x"])) 69 | * (max(r1["max_y"], r2["max_y"]) - min(r1["min_y"], r2["min_y"])) 70 | ) 71 | return 1.0 - (bbsize - r1["size"] - r2["size"]) / imsize 72 | 73 | 74 | def _calc_sim(r1, r2, imsize): 75 | return (_sim_colour(r1, r2) + _sim_texture(r1, r2) 76 | + _sim_size(r1, r2, imsize) + _sim_fill(r1, r2, imsize)) 77 | 78 | 79 | def _calc_colour_hist(img): 80 | """ 81 | calculate colour histogram for each region 82 | 83 | the size of the output histogram will be BINS * COLOUR_CHANNELS(3) 84 | 85 | the number of bins is 25, the same as in [uijlings_ijcv2013_draft.pdf] 86 | 87 | extract HSV 88 | """ 89 | 90 | BINS = 25 91 | hist = numpy.array([]) 92 | 93 | for colour_channel in (0, 1, 2): 94 | 95 | # extracting one colour channel 96 | c = img[:, colour_channel] 97 | 98 | # calculate histogram for each colour and join to the result 99 | hist = numpy.concatenate(
100 | [hist] + [numpy.histogram(c, BINS, (0.0, 255.0))[0]]) 101 | 102 | # L1 normalize 103 | hist = hist / len(img) 104 | 105 | return hist 106 | 107 | 108 | def _calc_texture_gradient(img): 109 | """ 110 | calculate texture gradient for entire image 111 | 112 | The original SelectiveSearch algorithm proposed Gaussian derivative 113 | for 8 orientations, but we use LBP instead. 114 | 115 | output will be [height(*)][width(*)] 116 | """ 117 | ret = numpy.zeros((img.shape[0], img.shape[1], img.shape[2])) 118 | 119 | for colour_channel in (0, 1, 2): 120 | ret[:, :, colour_channel] = skimage.feature.local_binary_pattern( 121 | img[:, :, colour_channel], 8, 1.0) 122 | 123 | return ret 124 | 125 | 126 | def _calc_texture_hist(img): 127 | """ 128 | calculate texture histogram for each region 129 | 130 | calculate the histogram of gradient for each colour 131 | the size of output histogram will be 132 | BINS * ORIENTATIONS * COLOUR_CHANNELS(3) 133 | """ 134 | BINS = 10 135 | 136 | hist = numpy.array([]) 137 | 138 | for colour_channel in (0, 1, 2): 139 | 140 | # mask by the colour channel 141 | fd = img[:, colour_channel] 142 | 143 | # calculate histogram for each orientation and concatenate them all 144 | # and join to the result 145 | hist = numpy.concatenate( 146 | [hist] + [numpy.histogram(fd, BINS, (0.0, 1.0))[0]]) 147 | 148 | # L1 Normalize 149 | hist = hist / len(img) 150 | 151 | return hist 152 | 153 | 154 | def _extract_regions(img): 155 | 156 | R = {} 157 | 158 | # get hsv image 159 | hsv = skimage.color.rgb2hsv(img[:, :, :3]) 160 | 161 | # pass 1: count pixel positions 162 | for y, i in enumerate(img): 163 | 164 | for x, (r, g, b, l) in enumerate(i): 165 | 166 | # initialize a new region 167 | if l not in R: 168 | R[l] = { 169 | "min_x": 0xffff, "min_y": 0xffff, 170 | "max_x": 0, "max_y": 0, "labels": [l]} 171 | 172 | # bounding box 173 | if R[l]["min_x"] > x: 174 | R[l]["min_x"] = x 175 | if R[l]["min_y"] > y: 176 | R[l]["min_y"] = y 177 | if R[l]["max_x"] < x: 178 | R[l]["max_x"] = x 179 | if R[l]["max_y"] < y: 180 | R[l]["max_y"] = y 181 | 182 | # pass 2: calculate texture gradient 183 | tex_grad = _calc_texture_gradient(img) 184 | 185 | # pass 3: calculate colour histogram of each region 186 | for k, v in R.items(): 187 | 188 | # colour histogram 189 | masked_pixels = hsv[:, :, :][img[:, :, 3] == k] 190 | R[k]["size"] = len(masked_pixels) 191 | R[k]["hist_c"] = _calc_colour_hist(masked_pixels) 192 | 193 | # texture histogram 194 | R[k]["hist_t"] = _calc_texture_hist(tex_grad[:, :][img[:, :, 3] == k]) 195 | 196 | return R 197 | 198 | 199 | def _extract_neighbours(regions): 200 | 201 | def intersect(a, b): 202 | if (a["min_x"] < b["min_x"] < a["max_x"] 203 | and a["min_y"] < b["min_y"] < a["max_y"]) or ( 204 | a["min_x"] < b["max_x"] < a["max_x"] 205 | and a["min_y"] < b["max_y"] < a["max_y"]) or ( 206 | a["min_x"] < b["min_x"] < a["max_x"] 207 | and a["min_y"] < b["max_y"] < a["max_y"]) or ( 208 | a["min_x"] < b["max_x"] < a["max_x"] 209 | and a["min_y"] < b["min_y"] < a["max_y"]): 210 | return True 211 | return False 212 | 213 | R = list(regions.items()) 214 | 215 | neighbours = [] 216 | for cur, a in enumerate(R[:-1]): 217 | for b in R[int(cur) + 1:]: 218 | if intersect(a[1], b[1]): 219 | neighbours.append((a, b)) 220 | 221 | return neighbours 222 | 223 |
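# Editor's sketch (not part of the upstream module; hypothetical data,
# safe to delete): how the corner-inside test in _extract_neighbours
# decides that two regions are neighbours.
def _demo_extract_neighbours():
    regions = {
        0: {"min_x": 0, "min_y": 0, "max_x": 10, "max_y": 10},
        1: {"min_x": 5, "min_y": 5, "max_x": 15, "max_y": 15},    # overlaps 0
        2: {"min_x": 20, "min_y": 20, "max_x": 30, "max_y": 30},  # disjoint
    }
    # corner (5, 5) of region 1 lies strictly inside region 0 on both
    # axes, so only the (0, 1) pair qualifies
    pairs = {(a[0], b[0]) for a, b in _extract_neighbours(regions)}
    assert pairs == {(0, 1)}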
230 | "max_y": max(r1["max_y"], r2["max_y"]), 231 | "size": new_size, 232 | "hist_c": ( 233 | r1["hist_c"] * r1["size"] + r2["hist_c"] * r2["size"]) / new_size, 234 | "hist_t": ( 235 | r1["hist_t"] * r1["size"] + r2["hist_t"] * r2["size"]) / new_size, 236 | "labels": r1["labels"] + r2["labels"] 237 | } 238 | return rt 239 | 240 | 241 | def selective_search( 242 | im_orig, scale=1.0, sigma=0.8, min_size=50): 243 | '''Selective Search 244 | 245 | Parameters 246 | ---------- 247 | im_orig : ndarray 248 | Input image 249 | scale : int 250 | Free parameter. Higher means larger clusters in felzenszwalb segmentation. Inverse relation with num pixels. 251 | sigma : float 252 | Width of Gaussian kernel for felzenszwalb segmentation. 253 | min_size : int 254 | Minimum component size for felzenszwalb segmentation. 255 | Returns 256 | ------- 257 | img : ndarray 258 | image with region label 259 | region label is stored in the 4th value of each pixel [r,g,b,(region)] 260 | regions : array of dict 261 | [ 262 | { 263 | 'rect': (left, top, right, bottom), 264 | 'labels': [...] 265 | }, 266 | ... 267 | ] 268 | ''' 269 | assert im_orig.shape[2] == 3, "3ch image is expected" 270 | 271 | # load image and get smallest regions 272 | # region label is stored in the 4th value of each pixel [r,g,b,(region)] 273 | img = _generate_segments(im_orig, scale, sigma, min_size) 274 | 275 | if img is None: 276 | return None, {} 277 | 278 | imsize = img.shape[0] * img.shape[1] 279 | R = _extract_regions(img) 280 | 281 | # extract neighbouring information 282 | neighbours = _extract_neighbours(R) 283 | 284 | # calculate initial similarities 285 | S = {} 286 | for (ai, ar), (bi, br) in neighbours: 287 | S[(ai, bi)] = _calc_sim(ar, br, imsize) 288 | 289 | # hierarchal search 290 | while S != {}: 291 | 292 | # get highest similarity 293 | i, j = sorted(list(S.items()), key=lambda tup: tup[1])[-1][0] 294 | 295 | # merge corresponding regions 296 | t = max(R.keys()) + 1.0 297 | R[t] = _merge_regions(R[i], R[j]) 298 | 299 | # mark similarities for regions to be removed 300 | key_to_delete = [] 301 | for k, v in S.items(): 302 | if (i in k) or (j in k): 303 | key_to_delete.append(k) 304 | 305 | # remove old similarities of related regions 306 | for k in key_to_delete: 307 | del S[k] 308 | 309 | # calculate similarity set with the new region 310 | for k in filter(lambda a: a != (i, j), key_to_delete): 311 | n = k[1] if k[0] in (i, j) else k[0] 312 | S[(t, n)] = _calc_sim(R[t], R[n], imsize) 313 | 314 | regions = [] 315 | for k, r in R.items(): 316 | regions.append({ 317 | 'rect': ( 318 | r['min_x'], r['min_y'], 319 | r['max_x'] - r['min_x'], r['max_y'] - r['min_y']), 320 | 'size': r['size'], 321 | 'labels': r['labels'] 322 | }) 323 | 324 | return img, regions 325 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | python-json-logger==0.1.5 2 | -------------------------------------------------------------------------------- /set_env.sh: -------------------------------------------------------------------------------- 1 | # environment variables 2 | 3 | # paths 4 | export CAFFE_PATH=/root/caffe/build/tools/caffe 5 | export TEXT_NOTEXT_MODELS_DIR=./lib/models/text-notext 6 | export JAR_NOJAR_MODELS_DIR=./lib/models/jar-nojar 7 | 8 | # gpu computations switch (1-> GPU Computations and 0-> CPU Computations) 9 | export IS_GPU=1 10 | 11 | # set the logger level (10 -> DEBUG, 20 -> INFO) 12 | export 
LOG_LEVEL=10 13 | 14 | # suppress caffe logs 15 | # export GLOG_minloglevel=2 16 | -------------------------------------------------------------------------------- /stages/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tpsatish95/indus-script-ocr/3bc0c2f92c7e7926dab9efb1af29d92753d20672/stages/__init__.py -------------------------------------------------------------------------------- /stages/region_proposal/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tpsatish95/indus-script-ocr/3bc0c2f92c7e7926dab9efb1af29d92753d20672/stages/region_proposal/__init__.py -------------------------------------------------------------------------------- /stages/region_proposal/extract_seal.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import cv2 4 | import matplotlib.pyplot as plt 5 | import numpy as np 6 | import scipy.misc 7 | from PIL import Image, ImageChops 8 | from scipy import ndimage 9 | 10 | from helpers.temp import TemporaryFile 11 | 12 | 13 | def trim(im): 14 | 15 | bg = Image.new(im.mode, im.size, im.getpixel((0, 0))) 16 | diff = ImageChops.difference(im, bg) 17 | diff = ImageChops.add(diff, diff, 2.0, -100) 18 | bbox = diff.getbbox() 19 | if bbox: 20 | return im.crop(bbox) 21 | 22 | 23 | def auto_canny(image, sigma=0.33): 24 | 25 | # compute the median of the single channel pixel intensities 26 | v = np.median(image) 27 | 28 | # apply automatic Canny edge detection using the computed median 29 | lower = int(max(0, (1.0 - sigma) * v)) 30 | upper = int(min(255, (1.0 + sigma) * v)) 31 | edged = cv2.Canny(image, lower, upper) 32 | 33 | return edged 34 | 35 | 36 | def crop_white(image_path): 37 | 38 | threshold = 250 39 | 40 | while True: 41 | image_sci = scipy.misc.imread(image_path) 42 | 43 | image_g = ndimage.gaussian_filter(image_sci, 3.0) 44 | labeled, _ = ndimage.label(image_g > threshold) 45 | 46 | temp_conv = TemporaryFile(".png") 47 | 48 | plt.imsave(temp_conv.name, labeled) 49 | image_cv = cv2.imread(temp_conv.name) 50 | temp_conv.cleanup() 51 | 52 | gray = cv2.cvtColor(image_cv, cv2.COLOR_BGR2GRAY) 53 | blurred = cv2.GaussianBlur(gray, (7, 7), 0) 54 | auto = auto_canny(blurred) 55 | 56 | _, cnts, _ = cv2.findContours(auto.copy(), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) 57 | screenCnt = sorted(cnts, key=cv2.contourArea, reverse=True)[0] 58 | 59 | x, y, w, h = cv2.boundingRect(screenCnt) 60 | if w * h > (image_sci.shape[0] * image_sci.shape[1]) * 0.60: 61 | temp_crop = TemporaryFile(".tif") 62 | plt.imsave(temp_crop.name, image_sci[y:y + h, x:x + w]) 63 | 64 | image_pil = Image.open(temp_crop.name) 65 | temp_crop.cleanup() 66 | output = trim(image_pil) 67 | if output is not None: 68 | temp_output = TemporaryFile(".tif") 69 | output.save(temp_output.name) 70 | return temp_output 71 | elif threshold == 200: 72 | image_pil = Image.open(image_path) 73 | output = trim(image_pil) 74 | if output is not None: 75 | temp_output = TemporaryFile(".tif") 76 | output.save(temp_output.name) 77 | return temp_output 78 | else: 79 | threshold = 200 80 |
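A hedged aside on auto_canny above: both Canny thresholds are derived from the median intensity of the input, so one sigma setting adapts to differently exposed seal photographs. A quick arithmetic check (the gray value is made up for illustration):

import numpy as np

patch = np.full((10, 10), 120, dtype=np.uint8)  # hypothetical gray patch
v = np.median(patch)
lower = int(max(0, (1.0 - 0.33) * v))           # 80
upper = int(min(255, (1.0 + 0.33) * v))         # 159
assert (lower, upper) == (80, 159)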
-------------------------------------------------------------------------------- /stages/region_proposal/region_grouping.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from helpers import logger 4 | 5 | LOGGER = logger.create_logger(__name__) 6 | 7 | 8 | def mean_rect(r): 9 | return (min([i[0] for i in r]), min([i[1] for i in r]), max([i[0] + i[2] for i in r]) - min([i[0] for i in r]), max([i[1] + i[3] for i in r]) - min([i[1] for i in r])) 10 | 11 | 12 | def extend_rect(r): 13 | return (min([i[0] for i in r]), min([i[1] for i in r]), max([i[0] + i[2] for i in r]) - min([i[0] for i in r]), max([i[3] for i in r])) 14 | 15 | 16 | def merge(candidates, width, height): 17 | merged_candidates = set() 18 | processed = set() 19 | 20 | threshold = int(((width + height) / 2) * (0.14)) 21 | for x, y, w, h in candidates: 22 | if (x, y, w, h) not in processed: 23 | group = set() 24 | group.add((x, y, w, h)) 25 | for x1, y1, w1, h1 in candidates: 26 | if abs(x1 - x) <= threshold and abs(y1 - y) <= threshold and abs(w1 - w) <= threshold and abs(h1 - h) <= threshold: 27 | group.add((x1, y1, w1, h1)) 28 | processed.add((x1, y1, w1, h1)) 29 | merged_candidates.add(mean_rect(group)) 30 | 31 | return merged_candidates 32 | 33 | 34 | def contains_remove(merged_candidates): 35 | refined_merged_candidates = set() 36 | for x, y, w, h in merged_candidates: 37 | is_contained = False 38 | merged_candidates_copy = set(merged_candidates) 39 | merged_candidates_copy.remove((x, y, w, h)) 40 | for x1, y1, w1, h1 in merged_candidates_copy: 41 | if x1 >= x and y1 >= y and x1 + w1 <= x + w and y1 + h1 <= y + h: 42 | is_contained = True 43 | break 44 | 45 | if not is_contained: 46 | refined_merged_candidates.add((x, y, w, h)) 47 | 48 | return refined_merged_candidates 49 | 50 |
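# Editor's sketch (hypothetical boxes, safe to delete): on a 100x100
# image the grouping threshold above is int(((100 + 100) / 2) * 0.14) =
# 14 pixels, so two candidates that differ by only a few pixels in every
# coordinate collapse into their bounding union.
def _demo_merge():
    candidates = {(10, 10, 40, 40), (12, 11, 41, 39)}
    merged = merge(candidates, 100, 100)
    assert merged == {(10, 10, 43, 40)}
    # nothing is nested inside anything else, so contains_remove keeps it
    assert contains_remove(merged) == {(10, 10, 43, 40)}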
51 | def draw_superbox(refined_merged_candidates, old_superboxes=[]): 52 | no_overlap = [] 53 | draw_superbox_candidates = [] 54 | 55 | superboxes = set() 56 | 57 | if not old_superboxes: 58 | draw_superbox_candidates = refined_merged_candidates 59 | else: 60 | draw_superbox_candidates = old_superboxes 61 | 62 | base_list = list(draw_superbox_candidates) 63 | base_set = set(draw_superbox_candidates) 64 | 65 | # (x1,y1) top-left coord, (x2,y2) bottom-right coord, (w,h) size 66 | while base_list: 67 | x1, y1, w1, h1 = base_list[0] 68 | 69 | if len(base_list) == 1: # super box 70 | superboxes.add((x1, y1, w1, h1)) 71 | 72 | base_list.remove((x1, y1, w1, h1)) 73 | 74 | overlap = set() 75 | base_set.remove((x1, y1, w1, h1)) 76 | for x2, y2, w2, h2 in base_set: 77 | a = {'x1': x1, 'y1': y1, 'x2': x1 + w1, 'y2': y1 + h1, 'w': w1, 'h': h1} 78 | b = {'x1': x2, 'y1': y2, 'x2': x2 + w2, 'y2': y2 + h2, 'w': w2, 'h': h2} 79 | 80 | # overlap between A and B 81 | area_a = a['w'] * a['h'] 82 | area_b = b['w'] * b['h'] 83 | area_intersection = np.max([0, np.min([a['x2'], b['x2']]) - np.max([a['x1'], b['x1']])]) * \ 84 | np.max([0, np.min([a['y2'], b['y2']]) - np.max([a['y1'], b['y1']])]) 85 | 86 | # area_union = area_a + area_b - area_intersection 87 | # overlap_ab = float(area_intersection) / float(area_union) 88 | 89 | overlap_a = float(area_intersection) / float(area_a) 90 | overlap_b = float(area_intersection) / float(area_b) 91 | 92 | if overlap_a >= 0.40 or overlap_b >= 0.40: 93 | overlap.add((b['x1'], b['y1'], b['w'], b['h'])) 94 | 95 | if overlap: # overlap 96 | base_set = base_set - overlap 97 | base_list = [bl for bl in base_list if bl not in overlap] 98 | overlap.add((a['x1'], a['y1'], a['w'], a['h'])) 99 | 100 | superboxes.add((min([i[0] for i in overlap]), min([i[1] for i in overlap]), max([i[0] + i[2] for i in overlap]) - 101 | min([i[0] for i in overlap]), max([i[1] + i[3] for i in overlap]) - min([i[1] for i in overlap]))) 102 | 103 | no_overlap.append(False) 104 | else: # no overlap 105 | superboxes.add((x1, y1, w1, h1)) 106 | no_overlap.append(True) 107 | 108 | if all(no_overlap): 109 | return superboxes 110 | else: 111 | superboxes = draw_superbox(refined_merged_candidates, superboxes) 112 | return superboxes 113 | 114 | 115 | def extend_superbox(superboxes, width, height): 116 | extended_superboxes = set() 117 | processed = set() 118 | 119 | threshold = ((width + height) / 2) * (0.06) 120 | for x, y, w, h in superboxes: 121 | if (x, y, w, h) not in processed: 122 | group = set() 123 | 124 | group.add((x, y, w, h)) 125 | for x1, y1, w1, h1 in superboxes: 126 | if abs(y1 - y) <= threshold and abs(h1 - h) <= threshold: 127 | group.add((x1, y1, w1, h1)) 128 | processed.add((x1, y1, w1, h1)) 129 | 130 | extended_superboxes.add(extend_rect(group)) 131 | 132 | return extended_superboxes 133 | 134 | 135 | def group_candidate_regions(candidates, width, height): 136 | merged_candidates = merge(candidates, width, height) 137 | LOGGER.info(merged_candidates) 138 | refined_merged_candidates = contains_remove(merged_candidates) 139 | LOGGER.info(refined_merged_candidates) 140 | superboxes = draw_superbox(refined_merged_candidates) 141 | LOGGER.info(superboxes) 142 | extended_superboxes = extend_superbox(superboxes, width, height) 143 | LOGGER.info(extended_superboxes) 144 | 145 | return extended_superboxes 146 | -------------------------------------------------------------------------------- /stages/region_proposal/region_search.py: -------------------------------------------------------------------------------- 1 | import skimage.io 2 | import skimage.transform 3 | 4 | from helpers import logger 5 | from lib import selectivesearch 6 | 7 | LOGGER = logger.create_logger(__name__) 8 | 9 | 10 | def get_candidate_regions(image, width, height): 11 | LOGGER.info("Extracting the candidate regions ...") 12 | candidates = set() 13 | 14 | stage = 1 15 | for sc in [350, 450, 500]: 16 | for sig in [0.8]: 17 | for mins in [30, 60, 120]: 18 | img = skimage.io.imread(image.name)[:, :, :3] 19 | if not (height == len(img) and width == len(img[0])): 20 | img = skimage.transform.resize(img, (height, width)) 21 | 22 | _, regions = selectivesearch.selective_search( 23 | img, scale=sc, sigma=sig, min_size=mins) 24 | 25 | for r in regions: 26 | # excluding same rectangle (with different segments) 27 | if r['rect'] in candidates: 28 | continue 29 | 30 | # excluding regions smaller than 2000 pixels 31 | if r['size'] < 2000: # TODO: should not be hard-coded; determine from the image size 32 | continue 33 | 34 | # distorted rects 35 | _, _, w, h = r['rect'] 36 | if w / h > 1.2 or h / w > 1.2: 37 | continue 38 | 39 | # rects covering the entire seal image 40 | if w >= (img.shape[0] - 1) * (0.7) and h >= (img.shape[1] - 1) * (0.7): 41 | continue 42 | 43 | candidates.add(r['rect']) 44 | 45 | LOGGER.info("Stage " + str(stage) + " Complete.") 46 | stage += 1 47 | 48 | return candidates 49 |
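A hedged usage sketch for the stage above (the input path is illustrative; in the pipeline the handle comes from extract_seal as a TemporaryFile, and any object with a .name attribute works):

import skimage.io

from stages.region_proposal import region_search

seal = open("seal.jpg")  # stand-in for the pipeline's TemporaryFile
img = skimage.io.imread(seal.name)
height, width = img.shape[:2]

# nine selective-search passes (three scales x three min sizes), pooled
# into one de-duplicated set of (x, y, w, h) candidate rectangles
candidates = region_search.get_candidate_regions(seal, width, height)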
-------------------------------------------------------------------------------- /stages/symbol_classification.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | import skimage.io 5 | import skimage.transform 6 | 7 | import caffe 8 | from helpers import logger 9 | 10 | LOGGER = logger.create_logger(__name__) 11 | 12 | 13 | def get_symbol_images(symbols_dir): 14 | symbols = list() 15 | symbols_list = sorted(os.listdir(symbols_dir.name), key=lambda i: int(os.path.splitext(i)[0])) 16 | for filename in symbols_list: 17 | image_path = os.path.join(symbols_dir.name, filename) 18 | symbols.append([image_path, caffe.io.load_image(image_path, color=False)]) 19 | 20 | return symbols 21 | 22 | 23 | def get_symbol_classifications(symbols): 24 | if os.environ["IS_GPU"] == "1": 25 | caffe.set_device(0) 26 | caffe.set_mode_gpu() 27 | else: 28 | caffe.set_mode_cpu() 29 | 30 | classifier = caffe.Classifier(os.path.join(os.environ["JAR_NOJAR_MODELS_DIR"], "deploy.prototxt"), 31 | os.path.join(os.environ["JAR_NOJAR_MODELS_DIR"], "weights.caffemodel"), 32 | image_dims=[64, 64], 33 | raw_scale=255.0) 34 | 35 | LOGGER.info("Classifying " + str(len(symbols)) + " inputs.") 36 | 37 | predictions = classifier.predict([s[1] for s in symbols]) 38 | 39 | symbol_sequence = list() 40 | classes = np.array([0, 1]) 41 | 42 | for i, prediction in enumerate(predictions): 43 | idx = list((-prediction).argsort()) 44 | prediction = classes[np.array(idx)] 45 | 46 | if prediction[0] == 1: 47 | symbol_sequence.append([symbols[i], "jar"]) 48 | elif prediction[0] == 0: 49 | symbol_sequence.append([symbols[i], "no-jar"]) 50 | 51 | return symbol_sequence 52 | 53 | 54 | def process_symbols(symbols_dir): 55 | symbols = get_symbol_images(symbols_dir) 56 | symbol_sequence = get_symbol_classifications(symbols) 57 | 58 | return symbol_sequence 59 | -------------------------------------------------------------------------------- /stages/symbol_segmentation.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | import skimage.color 5 | import skimage.io 6 | import skimage.morphology 7 | import skimage.transform 8 | from scipy import ndimage 9 | from skimage.filters import gaussian_filter, threshold_otsu 10 | 11 | from helpers.temp import TemporaryFile, TemporaryDirectory 12 | 13 | 14 | def extend_rect(r): 15 | return (min([i[0] for i in r]), min([i[1] for i in r]), max([i[0] + i[2] for i in r]) - min([i[0] for i in r]), max([i[1] + i[3] for i in r]) - min([i[1] for i in r])) 16 | 17 | 18 | def remove_contained_regions(candidates): 19 | refined_regions = set() 20 | for x, y, w, h in candidates: 21 | candidates_complement = set(candidates) 22 | candidates_complement.remove((x, y, w, h)) 23 | is_not_contained = [] 24 | for x1, y1, w1, h1 in candidates_complement: 25 | a = {'x1': x, 'y1': y, 'x2': x + w, 'y2': y + h, 'w': w, 'h': h} 26 | b = {'x1': x1, 'y1': y1, 'x2': x1 + w1, 'y2': y1 + h1, 'w': w1, 'h': h1} 27 | 28 | # overlap between a and b 29 | area_a = a['w'] * a['h'] 30 | area_b = b['w'] * b['h'] 31 | area_intersection = np.max([0, np.min([a['x2'], b['x2']]) - np.max([a['x1'], b['x1']])]) * \ 32 | np.max([0, np.min([a['y2'], b['y2']]) - np.max([a['y1'], b['y1']])]) 33 | 34 | area_union = area_a + area_b - area_intersection 35 | overlap_ab = float(area_intersection) / float(area_union) 36 | 37 | if overlap_ab > 0.0: 38 | if x1 <= x and y1 <= y and x1 + w1 >= x + w and y1 + h1 >= y + h: 39 | is_not_contained.append(False) 40 | else: 41 | is_not_contained.append(True) 42 | else: 43 | is_not_contained.append(True) 44 | 45 | if all(is_not_contained): 46 | refined_regions.add((x, y, w, h)) 47 | 48 | return refined_regions 49 | 50 |
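# Editor's worked example (hypothetical boxes, safe to delete) of the
# intersection/union arithmetic used in remove_contained_regions above
# and draw_superbox below.
def _demo_overlap():
    a = {'x1': 0, 'y1': 0, 'x2': 10, 'y2': 10, 'w': 10, 'h': 10}
    b = {'x1': 5, 'y1': 5, 'x2': 15, 'y2': 15, 'w': 10, 'h': 10}
    area_intersection = np.max([0, np.min([a['x2'], b['x2']]) - np.max([a['x1'], b['x1']])]) * \
        np.max([0, np.min([a['y2'], b['y2']]) - np.max([a['y1'], b['y1']])])
    area_union = a['w'] * a['h'] + b['w'] * b['h'] - area_intersection
    assert area_intersection == 25  # the 5x5 overlapping patch
    assert area_union == 175        # 100 + 100 - 25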
51 | def draw_superbox(refined_regions, old_superboxes=[]): 52 | no_overlap = [] 53 | draw_superbox_candidates = [] 54 | 55 | superboxes = set() 56 | 57 | if not old_superboxes: 58 | draw_superbox_candidates = refined_regions 59 | else: 60 | draw_superbox_candidates = old_superboxes 61 | 62 | base_list = list(draw_superbox_candidates) 63 | base_set = set(draw_superbox_candidates) 64 | 65 | # (x1,y1) top-left coord, (x2,y2) bottom-right coord, (w,h) size 66 | while base_list: 67 | x1, y1, w1, h1 = base_list[0] 68 | 69 | if len(base_list) == 1: # super box 70 | superboxes.add((x1, y1, w1, h1)) 71 | 72 | base_list.remove((x1, y1, w1, h1)) 73 | 74 | overlap = set() 75 | base_set.remove((x1, y1, w1, h1)) 76 | for x2, y2, w2, h2 in base_set: 77 | a = {'x1': x1, 'y1': y1, 'x2': x1 + w1, 'y2': y1 + h1, 'w': w1, 'h': h1} 78 | b = {'x1': x2, 'y1': y2, 'x2': x2 + w2, 'y2': y2 + h2, 'w': w2, 'h': h2} 79 | 80 | # overlap between A and B 81 | area_a = a['w'] * a['h'] 82 | area_b = b['w'] * b['h'] 83 | area_intersection = np.max([0, np.min([a['x2'], b['x2']]) - np.max([a['x1'], b['x1']])]) * \ 84 | np.max([0, np.min([a['y2'], b['y2']]) - np.max([a['y1'], b['y1']])]) 85 | 86 | # area_union = area_a + area_b - area_intersection 87 | # overlap_ab = float(area_intersection) / float(area_union) 88 | 89 | overlap_a = float(area_intersection) / float(area_a) 90 | overlap_b = float(area_intersection) / float(area_b) 91 | 92 | if overlap_a >= 0.15 or overlap_b >= 0.15: 93 | overlap.add((b['x1'], b['y1'], b['w'], b['h'])) 94 | 95 | if overlap: # overlap 96 | base_set = base_set - overlap 97 | base_list = [bl for bl in base_list if bl not in overlap] 98 | overlap.add((a['x1'], a['y1'], a['w'], a['h'])) 99 | 100 | superboxes.add((min([i[0] for i in overlap]), min([i[1] for i in overlap]), max([i[0] + i[2] for i in overlap]) - 101 | min([i[0] for i in overlap]), max([i[1] + i[3] for i in overlap]) - min([i[1] for i in overlap]))) 102 | 103 | no_overlap.append(False) 104 | else: # no overlap 105 | superboxes.add((x1, y1, w1, h1)) 106 | no_overlap.append(True) 107 | 108 | if all(no_overlap): 109 | return superboxes 110 | else: 111 | superboxes = draw_superbox(refined_regions, superboxes) 112 | return superboxes 113 | 114 | 115 | def extend_superbox(superboxes): 116 | extended_superboxes = set() 117 | processed = set() 118 | 119 | for x, y, w, h in superboxes: 120 | if (x, y, w, h) not in processed: 121 | group = set() 122 | 123 | group.add((x, y, w, h)) 124 | for x1, y1, w1, h1 in superboxes: 125 | if x1 >= x and (w1 + x1) <= w + x: 126 | group.add((x1, y1, w1, h1)) 127 | processed.add((x1, y1, w1, h1)) 128 | 129 | extended_superboxes.add(extend_rect(group)) 130 | 131 | return remove_contained_regions(extended_superboxes) 132 | 133 |
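# Editor's sketch (synthetic input, safe to delete): the essence of
# get_candidate_symbol_regions below -- binarise with Otsu's threshold so
# dark "ink" becomes foreground, label the connected components, and turn
# the blob slices into (x, y, w, h) boxes. Plain ndimage.label stands in
# here for the blur-then-label chain used in the real function.
def _demo_blob_boxes():
    gray = np.ones((60, 60))
    gray[20:40, 25:35] = 0.0               # one dark symbol on white
    binary = gray <= threshold_otsu(gray)  # ink pixels become True
    labels, _ = ndimage.label(binary)
    boxes = [(c2.start, c1.start, c2.stop - c2.start, c1.stop - c1.start)
             for c1, c2 in ndimage.find_objects(labels)]
    assert boxes == [(25, 20, 10, 20)]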
134 | def get_candidate_symbol_regions(image, text_regions, updated_width, updated_height): 135 | img = skimage.io.imread(image.name)[:, :, :3] 136 | if not (updated_height == len(img) and updated_width == len(img[0])): 137 | img = skimage.transform.resize(img, (updated_height, updated_width)) 138 | 139 | symbol_regions = dict() 140 | for x, y, w, h in text_regions: 141 | text_region_image = img[y: y + h, x: x + w] 142 | text_region_image_width = len(text_region_image[0]) 143 | text_region_image_height = len(text_region_image) 144 | 145 | text_region_gray_image = skimage.color.rgb2gray(text_region_image) 146 | text_region_binary_image = text_region_gray_image <= threshold_otsu(text_region_gray_image) 147 | 148 | temp = TemporaryFile(".png") 149 | skimage.io.imsave(temp.name, text_region_binary_image) 150 | text_region_binary_image = skimage.io.imread(temp.name) 151 | 152 | text_region_blurred_image = gaussian_filter(text_region_binary_image, sigma=3.5) 153 | text_region_blobs = text_region_blurred_image > text_region_blurred_image.mean() 154 | 155 | text_region_labels = skimage.morphology.label(text_region_blobs, neighbors=4) 156 | 157 | symbol_blobs = ndimage.find_objects(text_region_labels) 158 | candidate_symbol_regions = set() 159 | 160 | for c1, c2 in symbol_blobs: 161 | if (c2.stop - c2.start) * (c1.stop - c1.start) > (text_region_image.shape[0] * text_region_image.shape[1]) * (0.026): 162 | if (c2.stop - c2.start) * (c1.stop - c1.start) < (text_region_image.shape[0] * text_region_image.shape[1]) * (0.90): 163 | candidate_symbol_regions.add( 164 | (c2.start, c1.start, c2.stop - c2.start, c1.stop - c1.start)) 165 | 166 | symbol_regions[str((x, y, w, h))] = dict() 167 | symbol_regions[str((x, y, w, h))]["image"] = text_region_image 168 | symbol_regions[str((x, y, w, h))]["regions"] = candidate_symbol_regions 169 | symbol_regions[str((x, y, w, h))]["width"] = text_region_image_width 170 | symbol_regions[str((x, y, w, h))]["height"] = text_region_image_height 171 | 172 | return symbol_regions 173 | 174 | 175 | def process_candidate_symbol_regions(symbol_regions): 176 | for text_region in symbol_regions: 177 | candidate_symbol_regions = symbol_regions[text_region]["regions"] 178 | refined_regions = remove_contained_regions(candidate_symbol_regions) 179 | superboxes = draw_superbox(refined_regions) 180 | refined_extended_superboxes = extend_superbox(superboxes) 181 | 182 | symbol_regions[text_region]["refined_regions"] = refined_extended_superboxes 183 | 184 | return symbol_regions 185 | 186 | 187 | def get_symbols(image, text_regions, updated_width, updated_height): 188 | symbol_regions = get_candidate_symbol_regions(image, text_regions, updated_width, updated_height) 189 | symbol_regions = process_candidate_symbol_regions(symbol_regions) 190 | 191 | symbols = list() 192 | for text_region in symbol_regions: 193 | for x, y, w, h in symbol_regions[text_region]["refined_regions"]: 194 | symbols.append([(x, y, w, h), symbol_regions[text_region]["image"][y: y + h, x: x + w]]) 195 | 196 | # sort the symbols according to horizontal order 197 | symbols = sorted(symbols, key=lambda x: x[0][0]) 198 | 199 | # save all the symbols in a TemporaryDirectory 200 | symbols_dir = TemporaryDirectory() 201 | 202 | for i, symbol in enumerate(symbols): 203 | skimage.io.imsave(os.path.join(symbols_dir.name, str(i) + ".jpg"), symbol[1]) 204 | 205 | return symbols_dir 206 | -------------------------------------------------------------------------------- /stages/text_region_extraction/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tpsatish95/indus-script-ocr/3bc0c2f92c7e7926dab9efb1af29d92753d20672/stages/text_region_extraction/__init__.py -------------------------------------------------------------------------------- /stages/text_region_extraction/region_classification.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | import skimage.io 5 | import skimage.transform 6 | 7 | import caffe 8 | from helpers.temp import TemporaryFile 9 | from helpers import logger 10 | 11 | LOGGER = logger.create_logger(__name__) 12 | 13 | 14 | def get_region_crops(image, grouped_regions, new_width, new_height): 15 | img = skimage.io.imread(image.name)[:, :, :3] 16 | if not (new_height == len(img) and new_width == len(img[0])): 17 | img = skimage.transform.resize(img, (new_height, new_width)) 18 | 19 | region_coords = list() 20 | region_crops = list() 21 | for x, y, w, h in grouped_regions: 22 | temp = TemporaryFile(".jpg") 23 | skimage.io.imsave(temp.name, img[y: y + h, x: x + w]) 24 | region_crops.append(caffe.io.load_image(temp.name)) 25 | region_coords.append((x, y, w, h)) 26 | 27 | return region_coords, region_crops 28 | 29 |
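# Editor's illustration (hypothetical softmax row, safe to delete) of the
# argsort decoding used in classify_regions below: sorting the negated
# probabilities ranks classes best-first; the code treats class 0 as
# no-text, 1 as text and 2 as a region holding both.
def _demo_argsort_decoding():
    classes = np.array([0, 1, 2])
    prediction = np.array([0.1, 0.7, 0.2])  # hypothetical softmax output
    idx = list((-prediction).argsort())     # [1, 2, 0]
    ranked = classes[np.array(idx)]
    assert ranked[0] == 1                   # top-1 class: text region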
30 | def get_predictions(region_crops): 31 | if os.environ["IS_GPU"] == "1": 32 | caffe.set_device(0) 33 | caffe.set_mode_gpu() 34 | else: 35 | caffe.set_mode_cpu() 36 | 37 | classifier = caffe.Classifier(os.path.join(os.environ["TEXT_NOTEXT_MODELS_DIR"], "deploy.prototxt"), 38 | os.path.join(os.environ["TEXT_NOTEXT_MODELS_DIR"], "weights.caffemodel"), 39 | mean=np.array([104, 117, 123], dtype='f4'), 40 | image_dims=[224, 224], 41 | raw_scale=255.0, 42 | channel_swap=[2, 1, 0]) 43 | 44 | LOGGER.info("Classifying " + str(len(region_crops)) + " inputs.") 45 | 46 | predictions = classifier.predict(region_crops) 47 | 48 | return predictions 49 | 50 | 51 | def classify_regions(region_coords, region_crops): 52 | text_regions = set() 53 | no_text_regions = set() 54 | both_regions = set() 55 | classes = np.array([0, 1, 2]) 56 | 57 | try: 58 | predictions = get_predictions(region_crops) 59 | for i, prediction in enumerate(predictions): 60 | idx = list((-prediction).argsort()) 61 | prediction = classes[np.array(idx)] 62 | 63 | if prediction[0] == 1 or prediction[0] == 2: 64 | text_regions.add(region_coords[i]) 65 | elif prediction[0] == 0: 66 | no_text_regions.add(region_coords[i]) 67 | if prediction[0] == 2: 68 | both_regions.add(region_coords[i]) 69 | except Exception: 70 | LOGGER.exception("Failed to classify regions!") 71 | 72 | return text_regions, no_text_regions, both_regions 73 | 74 | 75 | def process_regions(image, grouped_regions, new_width, new_height): 76 | region_coords, region_crops = \ 77 | get_region_crops(image, grouped_regions, new_width, new_height) 78 | text_regions, no_text_regions, both_regions = \ 79 | classify_regions(region_coords, region_crops) 80 | 81 | return text_regions, no_text_regions, both_regions 82 | -------------------------------------------------------------------------------- /stages/text_region_extraction/text_region_formulation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def extend_text_rect(l): 5 | return (min([i[0] for i in l]), min([i[1] for i in l]), max([i[0] + i[2] for i in l]) - min([i[0] for i in l]), max([i[3] for i in l])) 6 | 7 | 8 | def refine_text_regions(text_regions, width, height): 9 | refined_text_regions = set() 10 | processed = set() 11 | 12 | threshold = ((width + height) / 2) * (0.25) 13 | for x, y, w, h in text_regions: 14 | if (x, y, w, h) not in processed: 15 | group = set() 16 | group.add((x, y, w, h)) 17 | for x1, y1, w1, h1 in text_regions: 18 | if abs(y1 - y) <= threshold and abs(h1 - h) <= threshold: 19 | group.add((x1, y1, w1, h1)) 20 | processed.add((x1, y1, w1, h1)) 21 | refined_text_regions.add(extend_text_rect(group)) 22 | 23 | return refined_text_regions 24 | 25 | 26 | def trim_text_regions(refined_text_regions, no_text_regions, both_regions): 27 | trimmed_text_regions = set() 28 | unwanted_regions = no_text_regions.union(both_regions) 29 | 30 | for x, y, w, h in refined_text_regions: 31 | a = {'x1': x, 'y1': y, 'x2': x + w, 'y2': y + h, 'w': w, 'h': h} 32 | for x1, y1, w1, h1 in unwanted_regions: 33 | b = {'x1': x1, 'y1': y1, 'x2': x1 + w1, 'y2': y1 + h1, 'w': w1, 'h': h1} 34 | 35 | # overlap between a and b 36 | area_a = a['w'] * a['h'] 37 | area_b = b['w'] * b['h'] 38 | area_intersection = np.max([0, np.min([a['x2'], b['x2']]) - np.max([a['x1'], b['x1']])]) * \ 39 | np.max([0, np.min([a['y2'], b['y2']]) - np.max([a['y1'], b['y1']])]) 40 | 41 | area_union = area_a + area_b - area_intersection 42 | overlap_ab = float(area_intersection) / float(area_union) 43 | 44 |
is_overlap = False 45 | ax1, ay1, aw, ah = a['x1'], a['y1'], a['w'], a['h'] 46 | 47 | if overlap_ab > 0.0: 48 | if a['x1'] > b['x1'] and abs(b['x1'] + b['w'] - a['x1']) < a['w'] * 0.20: # b is left to a 49 | ax1 = b['x1'] + b['w'] 50 | is_overlap = True 51 | if a['y1'] < b['y1'] and abs(a['y1'] - b['y1']) > a['h'] * 0.70: # b is bottom to a 52 | ah = a['h'] - (a['y1'] + a['h'] - b['y1']) 53 | is_overlap = True 54 | # if a['y1'] > b['y1']: # b is top to a 55 | # ay1 = b['y1'] + b['h'] 56 | # if a['x1'] < b['x1']: # b is right to a 57 | # aw = a['w'] - (a['x1'] + a['w'] - b['x1']) 58 | # if a['y1'] < b['y1']: # b is bottom to a 59 | # ah = a['h'] - (a['y1'] + a['h'] - b['y1']) 60 | # REPLACE by Cohen Sutherland algo 61 | 62 | a['x1'], a['y1'], a['w'], a['h'] = ax1, ay1, aw, ah 63 | trimmed_text_regions.add((a['x1'], a['y1'], a['w'], a['h'])) 64 | 65 | if is_overlap: 66 | break 67 | 68 | trimmed_text_regions.add((a['x1'], a['y1'], a['w'], a['h'])) 69 | 70 | return trimmed_text_regions 71 | 72 | 73 | def extend_text_regions(refined_text_regions, both_regions): 74 | extended_text_regions = set() 75 | 76 | for x, y, w, h in refined_text_regions: 77 | a = {'x1': x, 'y1': y, 'x2': x + w, 'y2': y + h, 'w': w, 'h': h} 78 | for x1, y1, w1, h1 in both_regions: 79 | b = {'x1': x1, 'y1': y1, 'x2': x1 + w1, 'y2': y1 + h1, 'w': w1, 'h': h1} 80 | 81 | # overlap between a and b 82 | area_a = a['w'] * a['h'] 83 | area_b = b['w'] * b['h'] 84 | area_intersection = np.max([0, np.min([a['x2'], b['x2']]) - np.max([a['x1'], b['x1']])]) * \ 85 | np.max([0, np.min([a['y2'], b['y2']]) - np.max([a['y1'], b['y1']])]) 86 | 87 | area_union = area_a + area_b - area_intersection 88 | overlap_ab = float(area_intersection) / float(area_union) 89 | 90 | is_overlap = False 91 | ax1, ay1, aw, ah = a['x1'], a['y1'], a['w'], a['h'] 92 | if overlap_ab > 0.0: 93 | if a['x1'] > b['x1'] and abs(b['x1'] + b['w'] - a['x1']) < a['w'] * 0.20: # b is left to a 94 | ax1 = b['x1'] 95 | aw = a['x1'] + a['w'] - b['x1'] 96 | is_overlap = True 97 | # if a['y1'] < b['y1'] and abs(a['y1'] - b['y1']) > a['h']*0.70: # b is bottom to a 98 | # ah = a['h'] - (a['y1'] + a['h'] - b['y1']) 99 | # if a['y1'] > b['y1']: # b is top to a 100 | # ay1 = b['y1'] + b['h'] 101 | if a['x1'] < b['x1']: # b is right to a 102 | aw = b['x1'] + b['w'] - a['x1'] 103 | is_overlap = True 104 | # if a['y1'] < b['y1']: # b is bottom to a 105 | # ah = a['h'] - (a['y1'] + a['h'] - b['y1']) 106 | # REPLACE by Cohen Sutherland algo 107 | 108 | a['x1'], a['y1'], a['w'], a['h'] = ax1, ay1, aw, ah 109 | extended_text_regions.add((a['x1'], a['y1'], a['w'], a['h'])) 110 | if is_overlap: 111 | break 112 | extended_text_regions.add((a['x1'], a['y1'], a['w'], a['h'])) 113 | extended_text_regions = extended_text_regions - both_regions # CHANGE this line 114 | 115 | return extended_text_regions 116 | 117 | 118 | def process_regions(text_regions, no_text_regions, both_regions, width, height): 119 | refined_text_regions = refine_text_regions(text_regions, width, height) 120 | trimmed_text_regions = trim_text_regions(refined_text_regions, no_text_regions, both_regions) 121 | extended_text_regions = extend_text_regions(refined_text_regions, both_regions) 122 | 123 | return extended_text_regions 124 | --------------------------------------------------------------------------------
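Taken together, the stages above form the OCR pipeline. The actual wiring lives in app.py; the sketch below is a hedged reconstruction from the stage signatures, and the input path is illustrative:

import skimage.io

from stages.region_proposal import extract_seal, region_grouping, region_search
from stages.text_region_extraction import region_classification, text_region_formulation
from stages import symbol_classification, symbol_segmentation

seal = extract_seal.crop_white("seal_photo.jpg")  # illustrative input path
img = skimage.io.imread(seal.name)
height, width = img.shape[:2]

candidates = region_search.get_candidate_regions(seal, width, height)
grouped = region_grouping.group_candidate_regions(candidates, width, height)
text, no_text, both = region_classification.process_regions(seal, grouped, width, height)
text_regions = text_region_formulation.process_regions(text, no_text, both, width, height)
symbols_dir = symbol_segmentation.get_symbols(seal, text_regions, width, height)
sequence = symbol_classification.process_symbols(symbols_dir)  # [[symbol, "jar"/"no-jar"], ...]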