├── .gitignore ├── IP ├── Bank Cheque │ ├── 1.jpg │ ├── 2.jpg │ ├── 3.jpg │ ├── bank_details_extraction.ipynb │ ├── bit.png │ ├── cheque_details_extraction.py │ ├── diff.png │ ├── intermediat_gradx.png │ ├── output.png │ ├── ref.png │ ├── template_acc.jpg │ └── test.jpg ├── Contour Detection │ ├── ContourImageDetectedForCursive.png │ ├── Inkedlicense_con_LI.jpg │ ├── contour_detect.ipynb │ ├── croppedCursive.png │ ├── cursive.jpg │ ├── cursive_processing.ipynb │ ├── license.jpg │ ├── license2.jpg │ ├── license_con.jpg │ ├── license_econ.jpg │ ├── output_2ndapproach.jpg │ ├── pan_card_contours.jpg │ ├── pan_card_dcontours.jpg │ └── pancard.jpg └── Face Detection │ ├── face_detection.py │ └── haarcascade_frontalface_default.xml ├── LICENSE ├── Preprocessing.ipynb ├── README.md ├── api ├── .vscode │ └── settings.json ├── UPLOAD_FOLDER │ └── .gitkeep ├── cheque_details_extraction.py ├── ctpn │ ├── __init__.py │ ├── demo.py │ ├── demo_pb.py │ ├── generate_pb.py │ ├── text.yml │ └── train_net.py ├── data │ ├── VOCdevkit2007 │ └── ctpn.pb ├── db.py ├── face_matching.py ├── haarcascade_frontalface_default.xml ├── lib │ ├── __init__.py │ ├── datasets │ │ ├── __init__.py │ │ ├── factory.py │ │ ├── imdb.py │ │ └── pascal_voc.py │ ├── fast_rcnn │ │ ├── __init__.py │ │ ├── bbox_transform.py │ │ ├── config.py │ │ ├── nms_wrapper.py │ │ ├── test.py │ │ └── train.py │ ├── networks │ │ ├── VGGnet_test.py │ │ ├── VGGnet_train.py │ │ ├── __init__.py │ │ ├── factory.py │ │ └── network.py │ ├── prepare_training_data │ │ ├── ToVoc.py │ │ └── split_label.py │ ├── roi_data_layer │ │ ├── __init__.py │ │ ├── layer.py │ │ ├── minibatch.py │ │ └── roidb.py │ ├── rpn_msr │ │ ├── __init__.py │ │ ├── anchor_target_layer_tf.py │ │ ├── generate_anchors.py │ │ └── proposal_layer_tf.py │ ├── text_connector │ │ ├── __init__.py │ │ ├── detectors.py │ │ ├── other.py │ │ ├── text_connect_cfg.py │ │ ├── text_proposal_connector.py │ │ ├── text_proposal_connector_oriented.py │ │ └── text_proposal_graph_builder.py │ └── utils │ │ ├── __init__.py │ │ ├── bbox.c │ │ ├── bbox.pyx │ │ ├── blob.py │ │ ├── boxes_grid.py │ │ ├── cython_nms.c │ │ ├── cython_nms.pyx │ │ ├── gpu_nms.c │ │ ├── gpu_nms.cpp │ │ ├── gpu_nms.hpp │ │ ├── gpu_nms.pyx │ │ ├── make.sh │ │ ├── nms_kernel.cu │ │ ├── setup.py │ │ └── timer.py ├── model │ ├── model.h5 │ └── model.json ├── outputs.txt ├── processing.py ├── ref.png ├── server.py └── templates │ ├── aadhar_template.png │ ├── index.html │ ├── license_template.jpg │ ├── pancard_template.jpg │ ├── template_acc.jpg │ └── template_ifsc.png ├── image_utils.py ├── inference.py ├── tile.jpg ├── train.py └── utils ├── dataset.py ├── image_aug.py ├── nn_block.py └── unet.py /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Created by https://www.gitignore.io/api/python,android,reactnative,androidstudio,jupyternotebook,visualstudiocode,jupyternotebooks 3 | # Edit at https://www.gitignore.io/?templates=python,android,reactnative,androidstudio,jupyternotebook,visualstudiocode,jupyternotebooks 4 | **/UPLOAD_FOLDER/*/ 5 | **/ignore/ 6 | 7 | ### Android ### 8 | # Built application files 9 | *.apk 10 | *.ap_ 11 | *.aab 12 | 13 | # Files for the ART/Dalvik VM 14 | *.dex 15 | 16 | # Java class files 17 | *.class 18 | 19 | # Generated files 20 | bin/ 21 | gen/ 22 | out/ 23 | 24 | # Gradle files 25 | .gradle/ 26 | build/ 27 | 28 | # Local configuration file (sdk path, etc) 29 | local.properties 30 | 31 | # Proguard folder generated by Eclipse 32 | proguard/ 33 | 34 | # Log 
Files 35 | *.log 36 | 37 | # Android Studio Navigation editor temp files 38 | .navigation/ 39 | 40 | # Android Studio captures folder 41 | captures/ 42 | 43 | # IntelliJ 44 | *.iml 45 | .idea/workspace.xml 46 | .idea/tasks.xml 47 | .idea/gradle.xml 48 | .idea/assetWizardSettings.xml 49 | .idea/dictionaries 50 | .idea/libraries 51 | .idea/caches 52 | 53 | # Keystore files 54 | # Uncomment the following lines if you do not want to check your keystore files in. 55 | #*.jks 56 | #*.keystore 57 | 58 | # External native build folder generated in Android Studio 2.2 and later 59 | .externalNativeBuild 60 | 61 | # Google Services (e.g. APIs or Firebase) 62 | google-services.json 63 | 64 | # Freeline 65 | freeline.py 66 | freeline/ 67 | freeline_project_description.json 68 | 69 | # fastlane 70 | fastlane/report.xml 71 | fastlane/Preview.html 72 | fastlane/screenshots 73 | fastlane/test_output 74 | fastlane/readme.md 75 | 76 | ### Android Patch ### 77 | gen-external-apklibs 78 | 79 | ### AndroidStudio ### 80 | # Covers files to be ignored for android development using Android Studio. 81 | 82 | # Built application files 83 | 84 | # Files for the ART/Dalvik VM 85 | 86 | # Java class files 87 | 88 | # Generated files 89 | 90 | # Gradle files 91 | .gradle 92 | 93 | # Signing files 94 | .signing/ 95 | 96 | # Local configuration file (sdk path, etc) 97 | 98 | # Proguard folder generated by Eclipse 99 | 100 | # Log Files 101 | 102 | # Android Studio 103 | /*/build/ 104 | /*/local.properties 105 | /*/out 106 | /*/*/build 107 | /*/*/production 108 | *.ipr 109 | *~ 110 | *.swp 111 | 112 | # Android Patch 113 | 114 | # External native build folder generated in Android Studio 2.2 and later 115 | 116 | # NDK 117 | obj/ 118 | 119 | # IntelliJ IDEA 120 | *.iws 121 | /out/ 122 | 123 | # User-specific configurations 124 | .idea/caches/ 125 | .idea/libraries/ 126 | .idea/shelf/ 127 | .idea/.name 128 | .idea/compiler.xml 129 | .idea/copyright/profiles_settings.xml 130 | .idea/encodings.xml 131 | .idea/misc.xml 132 | .idea/modules.xml 133 | .idea/scopes/scope_settings.xml 134 | .idea/vcs.xml 135 | .idea/jsLibraryMappings.xml 136 | .idea/datasources.xml 137 | .idea/dataSources.ids 138 | .idea/sqlDataSources.xml 139 | .idea/dynamic.xml 140 | .idea/uiDesigner.xml 141 | 142 | # OS-specific files 143 | .DS_Store 144 | .DS_Store? 
145 | ._* 146 | .Spotlight-V100 147 | .Trashes 148 | ehthumbs.db 149 | Thumbs.db 150 | 151 | # Legacy Eclipse project files 152 | .classpath 153 | .project 154 | .cproject 155 | .settings/ 156 | 157 | # Mobile Tools for Java (J2ME) 158 | .mtj.tmp/ 159 | 160 | # Package Files # 161 | *.war 162 | *.ear 163 | 164 | # virtual machine crash logs (Reference: http://www.java.com/en/download/help/error_hotspot.xml) 165 | hs_err_pid* 166 | 167 | ## Plugin-specific files: 168 | 169 | # mpeltonen/sbt-idea plugin 170 | .idea_modules/ 171 | 172 | # JIRA plugin 173 | atlassian-ide-plugin.xml 174 | 175 | # Mongo Explorer plugin 176 | .idea/mongoSettings.xml 177 | 178 | # Crashlytics plugin (for Android Studio and IntelliJ) 179 | com_crashlytics_export_strings.xml 180 | crashlytics.properties 181 | crashlytics-build.properties 182 | fabric.properties 183 | 184 | ### AndroidStudio Patch ### 185 | 186 | !/gradle/wrapper/gradle-wrapper.jar 187 | 188 | ### JupyterNotebook ### 189 | .ipynb_checkpoints 190 | */.ipynb_checkpoints/* 191 | 192 | # Remove previous ipynb_checkpoints 193 | # git rm -r .ipynb_checkpoints/ 194 | # 195 | 196 | ### JupyterNotebooks ### 197 | # gitignore template for Jupyter Notebooks 198 | # website: http://jupyter.org/ 199 | 200 | 201 | # Remove previous ipynb_checkpoints 202 | # git rm -r .ipynb_checkpoints/ 203 | # 204 | 205 | ### Python ### 206 | # Byte-compiled / optimized / DLL files 207 | __pycache__/ 208 | *.py[cod] 209 | *$py.class 210 | 211 | # C extensions 212 | *.so 213 | 214 | # Distribution / packaging 215 | .Python 216 | develop-eggs/ 217 | dist/ 218 | downloads/ 219 | eggs/ 220 | .eggs/ 221 | lib64/ 222 | parts/ 223 | sdist/ 224 | var/ 225 | wheels/ 226 | share/python-wheels/ 227 | *.egg-info/ 228 | .installed.cfg 229 | *.egg 230 | MANIFEST 231 | 232 | # PyInstaller 233 | # Usually these files are written by a python script from a template 234 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
235 | *.manifest 236 | *.spec 237 | 238 | # Installer logs 239 | pip-log.txt 240 | pip-delete-this-directory.txt 241 | 242 | # Unit test / coverage reports 243 | htmlcov/ 244 | .tox/ 245 | .nox/ 246 | .coverage 247 | .coverage.* 248 | .cache 249 | nosetests.xml 250 | coverage.xml 251 | *.cover 252 | .hypothesis/ 253 | .pytest_cache/ 254 | 255 | # Translations 256 | *.mo 257 | *.pot 258 | 259 | # Django stuff: 260 | local_settings.py 261 | db.sqlite3 262 | 263 | # Flask stuff: 264 | instance/ 265 | .webassets-cache 266 | 267 | # Scrapy stuff: 268 | .scrapy 269 | 270 | # Sphinx documentation 271 | docs/_build/ 272 | 273 | # PyBuilder 274 | target/ 275 | 276 | # Jupyter Notebook 277 | 278 | # IPython 279 | profile_default/ 280 | ipython_config.py 281 | 282 | # pyenv 283 | .python-version 284 | 285 | # celery beat schedule file 286 | celerybeat-schedule 287 | 288 | # SageMath parsed files 289 | *.sage.py 290 | 291 | # Environments 292 | .env 293 | .venv 294 | env/ 295 | venv/ 296 | ENV/ 297 | env.bak/ 298 | venv.bak/ 299 | 300 | # Spyder project settings 301 | .spyderproject 302 | .spyproject 303 | 304 | # Rope project settings 305 | .ropeproject 306 | 307 | # mkdocs documentation 308 | /site 309 | 310 | # mypy 311 | .mypy_cache/ 312 | .dmypy.json 313 | dmypy.json 314 | 315 | # Pyre type checker 316 | .pyre/ 317 | 318 | ### Python Patch ### 319 | .venv/ 320 | 321 | ### ReactNative ### 322 | # React Native Stack Base 323 | 324 | .expo 325 | __generated__ 326 | 327 | ### ReactNative.Xcode Stack ### 328 | # Xcode 329 | # 330 | # gitignore contributors: remember to update Global/Xcode.gitignore, Objective-C.gitignore & Swift.gitignore 331 | 332 | ## User settings 333 | xcuserdata/ 334 | 335 | ## compatibility with Xcode 8 and earlier (ignoring not required starting Xcode 9) 336 | *.xcscmblueprint 337 | *.xccheckout 338 | 339 | ## compatibility with Xcode 3 and earlier (ignoring not required starting Xcode 4) 340 | DerivedData/ 341 | *.moved-aside 342 | *.pbxuser 343 | !default.pbxuser 344 | *.mode1v3 345 | !default.mode1v3 346 | *.mode2v3 347 | !default.mode2v3 348 | *.perspectivev3 349 | !default.perspectivev3 350 | 351 | ### ReactNative.Linux Stack ### 352 | 353 | # temporary files which can be created if a process still has a handle open of a deleted file 354 | .fuse_hidden* 355 | 356 | # KDE directory preferences 357 | .directory 358 | 359 | # Linux trash folder which might appear on any partition or disk 360 | .Trash-* 361 | 362 | # .nfs files are created when an open file is removed but is still being accessed 363 | .nfs* 364 | 365 | ### ReactNative.Android Stack ### 366 | # Built application files 367 | 368 | # Files for the ART/Dalvik VM 369 | 370 | # Java class files 371 | 372 | # Generated files 373 | 374 | # Gradle files 375 | 376 | # Local configuration file (sdk path, etc) 377 | 378 | # Proguard folder generated by Eclipse 379 | 380 | # Log Files 381 | 382 | # Android Studio Navigation editor temp files 383 | 384 | # Android Studio captures folder 385 | 386 | # IntelliJ 387 | 388 | # Keystore files 389 | # Uncomment the following lines if you do not want to check your keystore files in. 390 | #*.jks 391 | #*.keystore 392 | 393 | # External native build folder generated in Android Studio 2.2 and later 394 | 395 | # Google Services (e.g. 
APIs or Firebase) 396 | 397 | # Freeline 398 | 399 | # fastlane 400 | 401 | ### ReactNative.macOS Stack ### 402 | # General 403 | .AppleDouble 404 | .LSOverride 405 | 406 | # Icon must end with two \r 407 | Icon 408 | 409 | 410 | # Thumbnails 411 | 412 | # Files that might appear in the root of a volume 413 | .DocumentRevisions-V100 414 | .fseventsd 415 | .TemporaryItems 416 | .VolumeIcon.icns 417 | .com.apple.timemachine.donotpresent 418 | 419 | # Directories potentially created on remote AFP share 420 | .AppleDB 421 | .AppleDesktop 422 | Network Trash Folder 423 | Temporary Items 424 | .apdisk 425 | 426 | ### ReactNative.Node Stack ### 427 | # Logs 428 | logs 429 | npm-debug.log* 430 | yarn-debug.log* 431 | yarn-error.log* 432 | 433 | # Runtime data 434 | pids 435 | *.pid 436 | *.seed 437 | *.pid.lock 438 | 439 | # Directory for instrumented libs generated by jscoverage/JSCover 440 | lib-cov 441 | 442 | # Coverage directory used by tools like istanbul 443 | coverage 444 | 445 | # nyc test coverage 446 | .nyc_output 447 | 448 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files) 449 | .grunt 450 | 451 | # Bower dependency directory (https://bower.io/) 452 | bower_components 453 | 454 | # node-waf configuration 455 | .lock-wscript 456 | 457 | # Compiled binary addons (https://nodejs.org/api/addons.html) 458 | build/Release 459 | 460 | # Dependency directories 461 | node_modules/ 462 | jspm_packages/ 463 | 464 | # TypeScript v1 declaration files 465 | typings/ 466 | 467 | # Optional npm cache directory 468 | .npm 469 | 470 | # Optional eslint cache 471 | .eslintcache 472 | 473 | # Optional REPL history 474 | .node_repl_history 475 | 476 | # Output of 'npm pack' 477 | *.tgz 478 | 479 | # Yarn Integrity file 480 | .yarn-integrity 481 | 482 | # dotenv environment variables file 483 | .env.test 484 | 485 | # parcel-bundler cache (https://parceljs.org/) 486 | 487 | # next.js build output 488 | .next 489 | 490 | # nuxt.js build output 491 | .nuxt 492 | 493 | # vuepress build output 494 | .vuepress/dist 495 | 496 | # Serverless directories 497 | .serverless/ 498 | 499 | # FuseBox cache 500 | .fusebox/ 501 | 502 | # DynamoDB Local files 503 | .dynamodb/ 504 | 505 | ### ReactNative.Buck Stack ### 506 | buck-out/ 507 | .buckconfig.local 508 | .buckd/ 509 | .buckversion 510 | .fakebuckversion 511 | 512 | ### ReactNative.Gradle Stack ### 513 | /build/ 514 | 515 | # Ignore Gradle GUI config 516 | gradle-app.setting 517 | 518 | # Avoid ignoring Gradle wrapper jar file (.jar files are usually ignored) 519 | !gradle-wrapper.jar 520 | 521 | # Cache of project 522 | .gradletasknamecache 523 | 524 | # # Work around https://youtrack.jetbrains.com/issue/IDEA-116898 525 | # gradle/wrapper/gradle-wrapper.properties 526 | 527 | ### VisualStudioCode ### 528 | .vscode/* 529 | !.vscode/settings.json 530 | !.vscode/tasks.json 531 | !.vscode/launch.json 532 | !.vscode/extensions.json 533 | 534 | ### VisualStudioCode Patch ### 535 | # Ignore all local history of files 536 | .history 537 | 538 | # End of https://www.gitignore.io/api/python,android,reactnative,androidstudio,jupyternotebook,visualstudiocode,jupyternotebooks -------------------------------------------------------------------------------- /IP/Bank Cheque/1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Praneet9/Docify/a936014750dedf4a6b5a84918bbbf66cd63109de/IP/Bank Cheque/1.jpg 
-------------------------------------------------------------------------------- /IP/Bank Cheque/2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Praneet9/Docify/a936014750dedf4a6b5a84918bbbf66cd63109de/IP/Bank Cheque/2.jpg -------------------------------------------------------------------------------- /IP/Bank Cheque/3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Praneet9/Docify/a936014750dedf4a6b5a84918bbbf66cd63109de/IP/Bank Cheque/3.jpg -------------------------------------------------------------------------------- /IP/Bank Cheque/bit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Praneet9/Docify/a936014750dedf4a6b5a84918bbbf66cd63109de/IP/Bank Cheque/bit.png -------------------------------------------------------------------------------- /IP/Bank Cheque/diff.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Praneet9/Docify/a936014750dedf4a6b5a84918bbbf66cd63109de/IP/Bank Cheque/diff.png -------------------------------------------------------------------------------- /IP/Bank Cheque/intermediat_gradx.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Praneet9/Docify/a936014750dedf4a6b5a84918bbbf66cd63109de/IP/Bank Cheque/intermediat_gradx.png -------------------------------------------------------------------------------- /IP/Bank Cheque/output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Praneet9/Docify/a936014750dedf4a6b5a84918bbbf66cd63109de/IP/Bank Cheque/output.png -------------------------------------------------------------------------------- /IP/Bank Cheque/ref.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Praneet9/Docify/a936014750dedf4a6b5a84918bbbf66cd63109de/IP/Bank Cheque/ref.png -------------------------------------------------------------------------------- /IP/Bank Cheque/template_acc.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Praneet9/Docify/a936014750dedf4a6b5a84918bbbf66cd63109de/IP/Bank Cheque/template_acc.jpg -------------------------------------------------------------------------------- /IP/Bank Cheque/test.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Praneet9/Docify/a936014750dedf4a6b5a84918bbbf66cd63109de/IP/Bank Cheque/test.jpg -------------------------------------------------------------------------------- /IP/Contour Detection/ContourImageDetectedForCursive.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Praneet9/Docify/a936014750dedf4a6b5a84918bbbf66cd63109de/IP/Contour Detection/ContourImageDetectedForCursive.png -------------------------------------------------------------------------------- /IP/Contour Detection/Inkedlicense_con_LI.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Praneet9/Docify/a936014750dedf4a6b5a84918bbbf66cd63109de/IP/Contour Detection/Inkedlicense_con_LI.jpg -------------------------------------------------------------------------------- 
/IP/Contour Detection/contour_detect.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 8, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import cv2\n", 10 | "import numpy as np\n", 11 | "\n", 12 | "image = cv2.imread('cursive.jpg')\n", 13 | "blurred = cv2.pyrMeanShiftFiltering(image, 81, 101)\n", 14 | "gray = cv2.cvtColor(blurred, cv2.COLOR_BGR2GRAY)\n", 15 | "_, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY+cv2.THRESH_OTSU)\n", 16 | "\n", 17 | "_, contours, _ = cv2.findContours(thresh, cv2.RETR_LIST, cv2.CHAIN_APPROX_NONE)\n", 18 | "\n", 19 | "for i in range(len(contours)):\n", 20 | " cv2.drawContours(image, contours, i, (0,255,0), 5)\n", 21 | " cv2.namedWindow('Display', cv2.WINDOW_NORMAL)\n", 22 | " cv2.imshow('Display', image)\n", 23 | " cv2.waitKey(0)\n", 24 | "\n", 25 | "cv2.destroyAllWindows()" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 16, 31 | "metadata": {}, 32 | "outputs": [ 33 | { 34 | "name": "stdout", 35 | "output_type": "stream", 36 | "text": [ 37 | "239.0\n", 38 | "209.5\n", 39 | "35.5\n", 40 | "245.0\n", 41 | "287.5\n", 42 | "2318.5\n", 43 | "21.5\n", 44 | "21.5\n", 45 | "89.0\n", 46 | "42.5\n", 47 | "43.0\n", 48 | "125.0\n", 49 | "866.5\n", 50 | "203.0\n", 51 | "120.5\n", 52 | "182.0\n", 53 | "1288.5\n", 54 | "567.5\n", 55 | "878.0\n", 56 | "977.0\n", 57 | "370.5\n", 58 | "389.5\n", 59 | "60.5\n", 60 | "96.5\n", 61 | "122.5\n", 62 | "27.0\n", 63 | "203.5\n", 64 | "236.0\n", 65 | "203.5\n", 66 | "200.5\n", 67 | "40.0\n", 68 | "2369.0\n", 69 | "2682.5\n", 70 | "104.5\n", 71 | "111.0\n", 72 | "244.5\n", 73 | "246.5\n", 74 | "34.5\n", 75 | "35.5\n", 76 | "192.5\n", 77 | "1354.0\n", 78 | "2424.0\n", 79 | "21.5\n", 80 | "4.0\n", 81 | "1275.5\n", 82 | "140.5\n", 83 | "74.5\n", 84 | "236.0\n", 85 | "238.0\n", 86 | "47.0\n", 87 | "21.0\n", 88 | "45.5\n", 89 | "983.0\n", 90 | "42.0\n", 91 | "21.0\n", 92 | "2982.5\n", 93 | "1710.5\n", 94 | "2018.0\n", 95 | "183141.0\n" 96 | ] 97 | } 98 | ], 99 | "source": [ 100 | "import cv2\n", 101 | "import numpy as np\n", 102 | "\n", 103 | "image = cv2.imread('cursive.jpg')\n", 104 | "image = cv2.resize(image, (image.shape[0]*2, image.shape[1]))\n", 105 | "blurred = cv2.pyrMeanShiftFiltering(image, 31, 71)\n", 106 | "gray = cv2.cvtColor(blurred, cv2.COLOR_BGR2GRAY)\n", 107 | "_, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY+cv2.THRESH_OTSU)\n", 108 | "\n", 109 | "_, contours, _ = cv2.findContours(thresh, cv2.RETR_LIST, cv2.CHAIN_APPROX_NONE)\n", 110 | "\n", 111 | "for c in contours:\n", 112 | " rect = cv2.boundingRect(c)\n", 113 | "# if rect[2] < 5 or rect[3] < 5: continue\n", 114 | " print(cv2.contourArea(c))\n", 115 | " x, y, w, h = rect\n", 116 | " cv2.rectangle(image,(x,y),(x+w,y+h),(0,255,0),2)\n", 117 | "\n", 118 | "cv2.imshow(\"Show\",image)\n", 119 | "cv2.waitKey() \n", 120 | "cv2.destroyAllWindows()" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [] 136 | } 137 | ], 138 | "metadata": { 139 | "kernelspec": { 140 | "display_name": "Python 3", 141 | "language": "python", 142 | "name": "python3" 143 | }, 144 | "language_info": { 145 | "codemirror_mode": { 146 | "name": "ipython", 147 | "version": 3 148 | }, 149 | "file_extension": ".py", 150 | 
"mimetype": "text/x-python", 151 | "name": "python", 152 | "nbconvert_exporter": "python", 153 | "pygments_lexer": "ipython3", 154 | "version": "3.6.6" 155 | } 156 | }, 157 | "nbformat": 4, 158 | "nbformat_minor": 2 159 | } 160 | -------------------------------------------------------------------------------- /IP/Contour Detection/croppedCursive.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Praneet9/Docify/a936014750dedf4a6b5a84918bbbf66cd63109de/IP/Contour Detection/croppedCursive.png -------------------------------------------------------------------------------- /IP/Contour Detection/cursive.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Praneet9/Docify/a936014750dedf4a6b5a84918bbbf66cd63109de/IP/Contour Detection/cursive.jpg -------------------------------------------------------------------------------- /IP/Contour Detection/license.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Praneet9/Docify/a936014750dedf4a6b5a84918bbbf66cd63109de/IP/Contour Detection/license.jpg -------------------------------------------------------------------------------- /IP/Contour Detection/license2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Praneet9/Docify/a936014750dedf4a6b5a84918bbbf66cd63109de/IP/Contour Detection/license2.jpg -------------------------------------------------------------------------------- /IP/Contour Detection/license_con.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Praneet9/Docify/a936014750dedf4a6b5a84918bbbf66cd63109de/IP/Contour Detection/license_con.jpg -------------------------------------------------------------------------------- /IP/Contour Detection/license_econ.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Praneet9/Docify/a936014750dedf4a6b5a84918bbbf66cd63109de/IP/Contour Detection/license_econ.jpg -------------------------------------------------------------------------------- /IP/Contour Detection/output_2ndapproach.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Praneet9/Docify/a936014750dedf4a6b5a84918bbbf66cd63109de/IP/Contour Detection/output_2ndapproach.jpg -------------------------------------------------------------------------------- /IP/Contour Detection/pan_card_contours.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Praneet9/Docify/a936014750dedf4a6b5a84918bbbf66cd63109de/IP/Contour Detection/pan_card_contours.jpg -------------------------------------------------------------------------------- /IP/Contour Detection/pan_card_dcontours.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Praneet9/Docify/a936014750dedf4a6b5a84918bbbf66cd63109de/IP/Contour Detection/pan_card_dcontours.jpg -------------------------------------------------------------------------------- /IP/Contour Detection/pancard.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Praneet9/Docify/a936014750dedf4a6b5a84918bbbf66cd63109de/IP/Contour Detection/pancard.jpg 
-------------------------------------------------------------------------------- /IP/Face Detection/face_detection.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | 4 | def get_photo(image): 5 | ''' 6 | Image Should be 1920 x 1080 pixels 7 | ''' 8 | scale_factor = 1.1 9 | min_neighbors = 3 10 | min_size = (250, 250) 11 | flags = cv2.CASCADE_SCALE_IMAGE 12 | 13 | face_cascade = cv2.CascadeClassifier('haarcascade_frontalface_default.xml') 14 | image = cv2.imread(image) 15 | gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) 16 | 17 | faces = face_cascade.detectMultiScale(image, scaleFactor = scale_factor, minNeighbors = min_neighbors, 18 | minSize = min_size, flags = flags) 19 | x, y, w, h = faces[0] 20 | face = image[y-50:y+h+40, x-10:x+w+10] 21 | 22 | return face 23 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Praneet 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Docify 2 | Deep Learning based Flask api to extract details from Indian ID cards like Aadhar Card, PAN Card and Driving Licence. 
3 | 4 | #### Tech 5 | Docify uses a number of open source projects to work properly: 6 | 7 | * [Tesseract](https://github.com/tesseract-ocr/tesseract) - Tesseract Open Source OCR Engine 8 | * [Text-Detection-CTPN](https://github.com/eragonruan/text-detection-ctpn/tree/master) - Text detection mainly based on ctpn model in tensorflow 9 | * [Python3.6](https://www.python.org) - duh 10 | 11 | # Installation 12 | #### Install Linux Dependencies 13 | ```sh 14 | $ sudo apt install cmake 15 | $ sudo apt install tesseract-ocr 16 | $ sudo apt install mongodb 17 | $ sudo apt install libsm6 libxext6 18 | $ sudo apt install supervisor 19 | $ sudo systemctl start mongodb 20 | ``` 21 | #### Download Tesseract Models [ENG+HIN+MAR] 22 | ```sh 23 | https://github.com/tesseract-ocr/tessdata_best 24 | https://github.com/BigPino67/Tesseract-MICR-OCR 25 | ``` 26 | 27 | #### Install Python Dependencies 28 | ```sh 29 | $ pip3 install opencv-python easydict flask face_recognition gunicorn tensorflow keras pytesseract dlib imutils opencv-contrib-python pymongo PyYAML scikit-image scikit-learn 30 | ``` 31 | #### Start Python API 32 | ```sh 33 | python3 server.py 34 | ``` 35 | -------------------------------------------------------------------------------- /api/.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.pythonPath": "/home/praneet/envs/deeplearning/bin/python" 3 | } -------------------------------------------------------------------------------- /api/UPLOAD_FOLDER/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Praneet9/Docify/a936014750dedf4a6b5a84918bbbf66cd63109de/api/UPLOAD_FOLDER/.gitkeep -------------------------------------------------------------------------------- /api/cheque_details_extraction.py: -------------------------------------------------------------------------------- 1 | ## All necessary imports ## 2 | import cv2 3 | import re 4 | import imutils 5 | import numpy as np 6 | import pytesseract as pyt 7 | from imutils import contours 8 | from skimage.segmentation import clear_border 9 | 10 | 11 | ## New MICR Method ## 12 | def get_micrcode(image_name): 13 | try: 14 | image = cv2.imread(image_name, 0) 15 | image = cv2.resize(image, (1920,1080)) 16 | 17 | (h,w,) = image.shape[:2] 18 | delta = int(h - (h*0.17)) 19 | bottom = image[delta:h, 0:w] 20 | 21 | thresh = cv2.threshold(bottom, 100, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1] 22 | 23 | text = pyt.image_to_string(thresh, lang='mcr', config='--oem 1 --psm 3') 24 | 25 | a, b, c, d = text.split()[:4] 26 | 27 | if len(b) > 10: 28 | b = b[0:9] 29 | b += 'a' 30 | return a + ' ' + b + ' ' + c + ' ' + d 31 | except: 32 | return 'MICR Not Found' 33 | ## New MICR End ## 34 | 35 | #### IFSC ##### 36 | def get_ifsc(image_path): 37 | 38 | def replace(text): 39 | # Remove some noise present in the text 40 | chars = "`*_{}[]()>#+-.!$:;?"
41 | for c in chars: 42 | text = text.replace(c, '') 43 | return text 44 | 45 | # Read image 46 | image = cv2.imread(image_path) 47 | image = cv2.resize(image, (1920,1080)) 48 | lab = cv2.cvtColor(image, cv2.COLOR_BGR2LAB) 49 | luminance, a, b = cv2.split(lab) 50 | 51 | hist,bins = np.histogram(luminance,256,[0,256]) 52 | 53 | mean = int((np.argmax(hist) + np.argmin(hist)) / 2) 54 | 55 | luminance[luminance > mean] = 255 56 | luminance[luminance <= mean] = 0 57 | 58 | # Forward it to ocr to get all the text present in image 59 | text = pyt.image_to_string(luminance, config=('--oem 1 --psm 3')) 60 | 61 | # Find IFSC in text and find the IFSC Code using regex 62 | ifsc = text.find('IFSC') 63 | # Select the range where the real IFSC Code will be present 64 | text = text[ifsc: ifsc + 30] 65 | 66 | text = replace(text) 67 | try: 68 | text = re.findall(r'[A-Z0-9]{11}', text)[0] 69 | except: 70 | return 0 71 | return text 72 | 73 | def get_ifsc2(image_path): 74 | 75 | def replace(text): 76 | # Remove some noise present in the text 77 | chars = "`*_{}[]()>#+-.!$:;?" 78 | for c in chars: 79 | text = text.replace(c, '') 80 | return text 81 | 82 | # Read image 83 | image = cv2.imread(image_path) 84 | image = cv2.resize(image, (1920,1080)) 85 | gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) 86 | lab = cv2.cvtColor(image, cv2.COLOR_BGR2LAB) 87 | luminance, a, b = cv2.split(lab) 88 | 89 | hist,bins = np.histogram(luminance,256,[0,256]) 90 | 91 | mean = int((np.argmax(hist) + np.argmin(hist)) / 2) 92 | 93 | luminance[luminance > mean] = 255 94 | luminance[luminance <= mean] = 0 95 | 96 | # Read template 97 | template = cv2.imread('templates/template_ifsc.png') 98 | template_gray = cv2.cvtColor(template, cv2.COLOR_BGR2GRAY) 99 | template_thresh = cv2.threshold(template_gray, 150, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1] 100 | 101 | diff = cv2.subtract(luminance, template_thresh) 102 | diff = cv2.bitwise_and(diff, gray_image) 103 | # Forward it to ocr to get all the text present in image 104 | text = pyt.image_to_string(diff, config=('--oem 1 --psm 3')) 105 | 106 | # Find IFSC in text and find the IFSC Code using regex 107 | 108 | # Select the range where the real IFSC Code will be present 109 | text = replace(text) 110 | try: 111 | text = re.findall(r'[A-Z0-9]{11}', text)[0] 112 | except: 113 | return 0 114 | return text 115 | 116 | def get_ifsc3(image): 117 | 118 | def replace(text): 119 | return text.replace('?', '7') 120 | 121 | img = cv2.imread(image) 122 | text = pyt.image_to_string(img, config=('--oem 1 --psm 3')) 123 | 124 | ifsc = text.find('IFSC') 125 | new_text = text[ifsc : ifsc + 30] 126 | new_text = replace(new_text) 127 | 128 | try: 129 | code = re.findall(r'[A-Z0-9]{11}', new_text)[0] 130 | except: 131 | return 0 132 | return code 133 | 134 | def ensemble_ifsc_output(cheque_img): 135 | ifsc1 = get_ifsc(cheque_img) 136 | ifsc2 = get_ifsc2(cheque_img) 137 | ifsc3 = get_ifsc3(cheque_img) 138 | ifsc = [ifsc1, ifsc2, ifsc3] 139 | 140 | if ifsc1 == 0 and ifsc2 == 0 and ifsc3 == 0: 141 | return 'IFSC Not Found' 142 | else: 143 | for code in ifsc: 144 | if code != 0: 145 | return code 146 | return 'IFSC Not Found' 147 | 148 | #### IFSC END ##### 149 | 150 | 151 | #### Account No #### 152 | def get_acc(image_path): 153 | # Read image 154 | image = cv2.imread(image_path) 155 | image = cv2.resize(image, (1920,1080)) 156 | lab = cv2.cvtColor(image, cv2.COLOR_BGR2LAB) 157 | luminance, a, b = cv2.split(lab) 158 | 159 | hist,bins = np.histogram(luminance,256,[0,256]) 160 | 161 | mean = 
int((np.argmax(hist) + np.argmin(hist)) / 2) 162 | 163 | luminance[luminance > mean] = 255 164 | luminance[luminance <= mean] = 0 165 | 166 | # Read template 167 | template = cv2.imread('templates/template_acc.jpg', 0) 168 | 169 | thresh = cv2.threshold(template, 150, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1] 170 | 171 | # Set difference 172 | diff = cv2.subtract(luminance, template) 173 | 174 | text = pyt.image_to_string(diff, config=('--oem 1 --psm 3')) 175 | 176 | if '-' in list(text): 177 | 178 | text = text.replace('-', '') 179 | 180 | try: 181 | acc_no = re.findall(r'[0-9]{9,18}',text)[0] 182 | except: 183 | text = pyt.image_to_string(luminance, config=('--oem 1 --psm 3')) 184 | if '-' in list(text): 185 | 186 | text = text.replace('-', '') 187 | try: 188 | acc_no = re.findall(r'[0-9]{9,18}',text)[0] 189 | except: 190 | return 0 191 | return acc_no 192 | 193 | def get_acc2(cheque_img): 194 | img = cv2.imread(cheque_img) 195 | 196 | text = pyt.image_to_string(img, config=('--oem 1 --psm 3')) 197 | 198 | if '-' in list(text): 199 | text = text.replace('-', '') 200 | try: 201 | text = re.findall(r'[0-9]{9,18}', text)[0] 202 | except: 203 | return 0 204 | return text 205 | 206 | 207 | def ensemble_acc_output(cheque_img): 208 | acc1 = get_acc(cheque_img) 209 | acc2 = get_acc2(cheque_img) 210 | acc = [acc1, acc2] 211 | 212 | 213 | if acc1 == 0 and acc2 == 0: 214 | return 'Account Number Not Found' 215 | else: 216 | for no in acc: 217 | if no != 0: 218 | return no 219 | return 'Account Number Not Found' 220 | #### Account No END #### -------------------------------------------------------------------------------- /api/ctpn/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /api/ctpn/demo.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import cv2 4 | import glob 5 | import os 6 | import shutil 7 | import sys 8 | 9 | import numpy as np 10 | import tensorflow as tf 11 | 12 | sys.path.append(os.getcwd()) 13 | from lib.networks.factory import get_network 14 | from lib.fast_rcnn.config import cfg, cfg_from_file 15 | from lib.fast_rcnn.test import test_ctpn 16 | from lib.utils.timer import Timer 17 | from lib.text_connector.detectors import TextDetector 18 | from lib.text_connector.text_connect_cfg import Config as TextLineCfg 19 | 20 | 21 | def resize_im(im, scale, max_scale=None): 22 | f = float(scale) / min(im.shape[0], im.shape[1]) 23 | if max_scale != None and f * max(im.shape[0], im.shape[1]) > max_scale: 24 | f = float(max_scale) / max(im.shape[0], im.shape[1]) 25 | return cv2.resize(im, None, None, fx=f, fy=f, interpolation=cv2.INTER_LINEAR), f 26 | 27 | 28 | def draw_boxes(img, image_name, boxes, scale): 29 | base_name = image_name.split('/')[-1] 30 | with open('data/results/' + 'res_{}.txt'.format(base_name.split('.')[0]), 'w') as f: 31 | for box in boxes: 32 | if np.linalg.norm(box[0] - box[1]) < 5 or np.linalg.norm(box[3] - box[0]) < 5: 33 | continue 34 | if box[8] >= 0.9: 35 | color = (0, 255, 0) 36 | elif box[8] >= 0.8: 37 | color = (255, 0, 0) 38 | cv2.line(img, (int(box[0]), int(box[1])), (int(box[2]), int(box[3])), color, 2) 39 | cv2.line(img, (int(box[0]), int(box[1])), (int(box[4]), int(box[5])), color, 2) 40 | cv2.line(img, (int(box[6]), int(box[7])), (int(box[2]), int(box[3])), color, 2) 41 | cv2.line(img, (int(box[4]), int(box[5])), (int(box[6]), 
int(box[7])), color, 2) 42 | 43 | min_x = min(int(box[0] / scale), int(box[2] / scale), int(box[4] / scale), int(box[6] / scale)) 44 | min_y = min(int(box[1] / scale), int(box[3] / scale), int(box[5] / scale), int(box[7] / scale)) 45 | max_x = max(int(box[0] / scale), int(box[2] / scale), int(box[4] / scale), int(box[6] / scale)) 46 | max_y = max(int(box[1] / scale), int(box[3] / scale), int(box[5] / scale), int(box[7] / scale)) 47 | 48 | line = ','.join([str(min_x), str(min_y), str(max_x), str(max_y)]) + '\r\n' 49 | f.write(line) 50 | 51 | img = cv2.resize(img, None, None, fx=1.0 / scale, fy=1.0 / scale, interpolation=cv2.INTER_LINEAR) 52 | cv2.imwrite(os.path.join("data/results", base_name), img) 53 | 54 | 55 | def ctpn(sess, net, image_name): 56 | timer = Timer() 57 | timer.tic() 58 | 59 | img = cv2.imread(image_name) 60 | img, scale = resize_im(img, scale=TextLineCfg.SCALE, max_scale=TextLineCfg.MAX_SCALE) 61 | scores, boxes = test_ctpn(sess, net, img) 62 | 63 | textdetector = TextDetector() 64 | boxes = textdetector.detect(boxes, scores[:, np.newaxis], img.shape[:2]) 65 | draw_boxes(img, image_name, boxes, scale) 66 | timer.toc() 67 | print(('Detection took {:.3f}s for ' 68 | '{:d} object proposals').format(timer.total_time, boxes.shape[0])) 69 | 70 | 71 | if __name__ == '__main__': 72 | if os.path.exists("data/results/"): 73 | shutil.rmtree("data/results/") 74 | os.makedirs("data/results/") 75 | 76 | cfg_from_file('ctpn/text.yml') 77 | 78 | # init session 79 | config = tf.ConfigProto(allow_soft_placement=True) 80 | sess = tf.Session(config=config) 81 | # load network 82 | net = get_network("VGGnet_test") 83 | # load model 84 | print(('Loading network {:s}... '.format("VGGnet_test")), end=' ') 85 | saver = tf.train.Saver() 86 | 87 | try: 88 | ckpt = tf.train.get_checkpoint_state(cfg.TEST.checkpoints_path) 89 | print('Restoring from {}...'.format(ckpt.model_checkpoint_path), end=' ') 90 | saver.restore(sess, ckpt.model_checkpoint_path) 91 | print('done') 92 | except: 93 | raise 'Check your pretrained {:s}'.format(ckpt.model_checkpoint_path) 94 | 95 | im = 128 * np.ones((300, 300, 3), dtype=np.uint8) 96 | for i in range(2): 97 | _, _ = test_ctpn(sess, net, im) 98 | 99 | im_names = glob.glob(os.path.join(cfg.DATA_DIR, 'demo', '*.png')) + \ 100 | glob.glob(os.path.join(cfg.DATA_DIR, 'demo', '*.jpg')) 101 | 102 | for im_name in im_names: 103 | print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~') 104 | print(('Demo for {:s}'.format(im_name))) 105 | ctpn(sess, net, im_name) 106 | -------------------------------------------------------------------------------- /api/ctpn/demo_pb.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import glob 4 | import os 5 | import shutil 6 | import sys 7 | 8 | import cv2 9 | import numpy as np 10 | import tensorflow as tf 11 | 12 | sys.path.append(os.getcwd()) 13 | from lib.fast_rcnn.config import cfg, cfg_from_file 14 | from lib.fast_rcnn.test import _get_blobs 15 | from lib.text_connector.detectors import TextDetector 16 | from lib.text_connector.text_connect_cfg import Config as TextLineCfg 17 | from lib.rpn_msr.proposal_layer_tf import proposal_layer 18 | 19 | 20 | cfg_from_file('ctpn/text.yml') 21 | 22 | # init session 23 | config = tf.ConfigProto(allow_soft_placement=True) 24 | sess = tf.Session(config=config) 25 | with open('data/ctpn.pb', 'rb') as f: 26 | graph_def = tf.GraphDef() 27 | graph_def.ParseFromString(f.read()) 28 | sess.graph.as_default() 29 | 
tf.import_graph_def(graph_def, name='') 30 | sess.run(tf.global_variables_initializer()) 31 | 32 | input_img = sess.graph.get_tensor_by_name('Placeholder:0') 33 | output_cls_prob = sess.graph.get_tensor_by_name('Reshape_2:0') 34 | output_box_pred = sess.graph.get_tensor_by_name('rpn_bbox_pred/Reshape_1:0') 35 | 36 | textdetector = TextDetector() 37 | 38 | 39 | def resize_im(im, scale, max_scale=None): 40 | f = float(scale) / min(im.shape[0], im.shape[1]) 41 | if max_scale != None and f * max(im.shape[0], im.shape[1]) > max_scale: 42 | f = float(max_scale) / max(im.shape[0], im.shape[1]) 43 | return cv2.resize(im, None, None, fx=f, fy=f, interpolation=cv2.INTER_LINEAR), f 44 | 45 | 46 | def draw_boxes(img, image_name, boxes, scale): 47 | # base_name = image_name.split('/')[-1] 48 | all_boxes = [] 49 | for box in boxes: 50 | if np.linalg.norm(box[0] - box[1]) < 5 or np.linalg.norm(box[3] - box[0]) < 5: 51 | continue 52 | # if box[8] >= 0.9: 53 | # color = (0, 255, 0) 54 | # elif box[8] >= 0.8: 55 | # color = (255, 0, 0) 56 | # cv2.line(img, (int(box[0]), int(box[1])), (int(box[2]), int(box[3])), color, 2) 57 | # cv2.line(img, (int(box[0]), int(box[1])), (int(box[4]), int(box[5])), color, 2) 58 | # cv2.line(img, (int(box[6]), int(box[7])), (int(box[2]), int(box[3])), color, 2) 59 | # cv2.line(img, (int(box[4]), int(box[5])), (int(box[6]), int(box[7])), color, 2) 60 | 61 | min_x = min(int(box[0] / scale), int(box[2] / scale), int(box[4] / scale), int(box[6] / scale)) 62 | min_y = min(int(box[1] / scale), int(box[3] / scale), int(box[5] / scale), int(box[7] / scale)) 63 | max_x = max(int(box[0] / scale), int(box[2] / scale), int(box[4] / scale), int(box[6] / scale)) 64 | max_y = max(int(box[1] / scale), int(box[3] / scale), int(box[5] / scale), int(box[7] / scale)) 65 | 66 | all_boxes.append([min_x, min_y, max_x, max_y]) 67 | 68 | return all_boxes 69 | 70 | def get_coords(image_name): 71 | 72 | img = cv2.imread(image_name) 73 | img, scale = resize_im(img, scale=TextLineCfg.SCALE, max_scale=TextLineCfg.MAX_SCALE) 74 | blobs, im_scales = _get_blobs(img, None) 75 | if cfg.TEST.HAS_RPN: 76 | im_blob = blobs['data'] 77 | blobs['im_info'] = np.array( 78 | [[im_blob.shape[1], im_blob.shape[2], im_scales[0]]], 79 | dtype=np.float32) 80 | cls_prob, box_pred = sess.run([output_cls_prob, output_box_pred], feed_dict={input_img: blobs['data']}) 81 | rois, _ = proposal_layer(cls_prob, box_pred, blobs['im_info'], 'TEST', anchor_scales=cfg.ANCHOR_SCALES) 82 | 83 | scores = rois[:, 0] 84 | boxes = rois[:, 1:5] / im_scales[0] 85 | # textdetector = TextDetector() 86 | boxes = textdetector.detect(boxes, scores[:, np.newaxis], img.shape[:2]) 87 | all_coords = draw_boxes(img, image_name, boxes, scale) 88 | return all_coords -------------------------------------------------------------------------------- /api/ctpn/generate_pb.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import os 4 | import sys 5 | 6 | import tensorflow as tf 7 | from tensorflow.python.framework.graph_util import convert_variables_to_constants 8 | 9 | sys.path.append(os.getcwd()) 10 | from lib.networks.factory import get_network 11 | from lib.fast_rcnn.config import cfg, cfg_from_file 12 | 13 | if __name__ == "__main__": 14 | cfg_from_file('ctpn/text.yml') 15 | 16 | config = tf.ConfigProto(allow_soft_placement=True) 17 | sess = tf.Session(config=config) 18 | net = get_network("VGGnet_test") 19 | print(('Loading network {:s}... 
'.format("VGGnet_test")), end=' ') 20 | saver = tf.train.Saver() 21 | try: 22 | ckpt = tf.train.get_checkpoint_state(cfg.TEST.checkpoints_path) 23 | print('Restoring from {}...'.format(ckpt.model_checkpoint_path), end=' ') 24 | saver.restore(sess, ckpt.model_checkpoint_path) 25 | print('done') 26 | except: 27 | raise 'Check your pretrained {:s}'.format(ckpt.model_checkpoint_path) 28 | print(' done.') 29 | 30 | print('all nodes are:\n') 31 | graph = tf.get_default_graph() 32 | input_graph_def = graph.as_graph_def() 33 | node_names = [node.name for node in input_graph_def.node] 34 | for x in node_names: 35 | print(x) 36 | output_node_names = 'Reshape_2,rpn_bbox_pred/Reshape_1' 37 | output_graph_def = convert_variables_to_constants(sess, input_graph_def, output_node_names.split(',')) 38 | output_graph = 'data/ctpn.pb' 39 | with tf.gfile.GFile(output_graph, 'wb') as f: 40 | f.write(output_graph_def.SerializeToString()) 41 | sess.close() 42 | -------------------------------------------------------------------------------- /api/ctpn/text.yml: -------------------------------------------------------------------------------- 1 | EXP_DIR: ctpn_end2end 2 | LOG_DIR: ctpn 3 | IS_MULTISCALE: False 4 | NET_NAME: VGGnet 5 | ANCHOR_SCALES: [16] 6 | NCLASSES: 2 7 | USE_GPU_NMS: True 8 | TRAIN: 9 | restore: 0 10 | max_steps: 50000 11 | SOLVER: Adam 12 | OHEM: False 13 | RPN_BATCHSIZE: 300 14 | BATCH_SIZE: 300 15 | LOG_IMAGE_ITERS: 100 16 | DISPLAY: 10 17 | SNAPSHOT_ITERS: 1000 18 | HAS_RPN: True 19 | LEARNING_RATE: 0.00001 20 | MOMENTUM: 0.9 21 | GAMMA: 0.1 22 | STEPSIZE: 30000 23 | IMS_PER_BATCH: 1 24 | BBOX_NORMALIZE_TARGETS_PRECOMPUTED: True 25 | RPN_POSITIVE_OVERLAP: 0.7 26 | PROPOSAL_METHOD: gt 27 | BG_THRESH_LO: 0.0 28 | PRECLUDE_HARD_SAMPLES: True 29 | BBOX_INSIDE_WEIGHTS: [0, 1, 0, 1] 30 | RPN_BBOX_INSIDE_WEIGHTS: [0, 1, 0, 1] 31 | RPN_POSITIVE_WEIGHT: -1.0 32 | FG_FRACTION: 0.3 33 | WEIGHT_DECAY: 0.0005 34 | TEST: 35 | HAS_RPN: True 36 | DETECT_MODE: H 37 | checkpoints_path: checkpoints/ 38 | # checkpoints_path: output/ctpn_end2end/voc_2007_trainval 39 | -------------------------------------------------------------------------------- /api/ctpn/train_net.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | import pprint 3 | import sys 4 | 5 | sys.path.append(os.getcwd()) 6 | from lib.fast_rcnn.train import get_training_roidb, train_net 7 | from lib.fast_rcnn.config import cfg_from_file, get_output_dir, get_log_dir 8 | from lib.datasets.factory import get_imdb 9 | from lib.networks.factory import get_network 10 | from lib.fast_rcnn.config import cfg 11 | 12 | if __name__ == '__main__': 13 | cfg_from_file('ctpn/text.yml') 14 | print('Using config:') 15 | pprint.pprint(cfg) 16 | imdb = get_imdb('voc_2007_trainval') 17 | print('Loaded dataset `{:s}` for training'.format(imdb.name)) 18 | roidb = get_training_roidb(imdb) 19 | 20 | output_dir = get_output_dir(imdb, None) 21 | log_dir = get_log_dir(imdb) 22 | print('Output will be saved to `{:s}`'.format(output_dir)) 23 | print('Logs will be saved to `{:s}`'.format(log_dir)) 24 | 25 | device_name = '/gpu:0' 26 | print(device_name) 27 | 28 | network = get_network('VGGnet_train') 29 | 30 | train_net(network, imdb, roidb, 31 | output_dir=output_dir, 32 | log_dir=log_dir, 33 | pretrained_model='data/pretrain/VGG_imagenet.npy', 34 | max_iters=int(cfg.TRAIN.max_steps), 35 | restore=bool(int(cfg.TRAIN.restore))) 36 | -------------------------------------------------------------------------------- 
/api/data/VOCdevkit2007: -------------------------------------------------------------------------------- 1 | /media/D/code/OCR/CTPN_LSTM/data/VOCdevkit -------------------------------------------------------------------------------- /api/data/ctpn.pb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Praneet9/Docify/a936014750dedf4a6b5a84918bbbf66cd63109de/api/data/ctpn.pb -------------------------------------------------------------------------------- /api/db.py: -------------------------------------------------------------------------------- 1 | from pymongo import MongoClient 2 | 3 | def getConnection(): 4 | client = MongoClient('localhost:27017') 5 | return client 6 | 7 | def getDB(client): 8 | db = client.DocScanner 9 | return db 10 | 11 | def getCollection(collection_name, db): 12 | collection = db[collection_name] 13 | return collection 14 | 15 | def closeConnection(client): 16 | client.close() 17 | 18 | ##Database Helper functions 19 | def insert_data(collection, args_dict): 20 | client = getConnection() 21 | db = getDB(client) 22 | collection_name = getCollection(collection, db) 23 | ''' 24 | db_name -> string i.e name of the db 25 | args_dict -> a dictionary of entries in db 26 | ''' 27 | collection_name.insert_one(args_dict) 28 | 29 | closeConnection(client) 30 | 31 | def read_data(collection): 32 | client = getConnection() 33 | db = getDB(client) 34 | collection_name = getCollection(collection, db) 35 | ''' 36 | returns a cursor of objects 37 | which can be iterated and printed 38 | ''' 39 | cols = collection_name.find({}) 40 | closeConnection(client) 41 | return cols 42 | 43 | #Update in data base 44 | def update_data(collection, idno, updation): 45 | client = getConnection() 46 | db = getDB(client) 47 | collection_name = getCollection(collection, db) 48 | ''' 49 | db_name -> string 50 | idno -> id number of database entry in dict 51 | ''' 52 | collection_name.update_one(idno, updation) 53 | closeConnection(client) 54 | 55 | def delete_row(collection, idno): 56 | client = getConnection() 57 | db = getDB(client) 58 | collection_name = getCollection(collection, db) 59 | ''' 60 | Deletes the complete row 61 | idno must be a dict {idno:'anything'} 62 | ''' 63 | collection_name.delete_many(idno) 64 | closeConnection(client) -------------------------------------------------------------------------------- /api/face_matching.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import face_recognition 3 | import numpy as np 4 | 5 | def detect_faces(image_path): 6 | faceDetector = cv2.CascadeClassifier('haarcascade_frontalface_default.xml') 7 | image = cv2.imread(image_path) 8 | faces = faceDetector.detectMultiScale(image, scaleFactor=1.1, minNeighbors = 3, minSize = (250,250), flags = cv2.CASCADE_SCALE_IMAGE) 9 | 10 | try: 11 | x,y,w,h = faces[0] 12 | except: 13 | return 'No Face Found' 14 | face = image[y-50:y+h+40, x-10:x+w+10] 15 | return face 16 | 17 | def match_faces(id_card_image, ref_image): 18 | id_card = detect_faces(id_card_image) 19 | ref = detect_faces(ref_image) 20 | try: 21 | ref = cv2.resize(ref, (id_card.shape[1], id_card.shape[0])) 22 | 23 | id_card_encodings = face_recognition.face_encodings(id_card)[0] 24 | ref_encodings = face_recognition.face_encodings(ref)[0] 25 | 26 | result = face_recognition.compare_faces([id_card_encodings], ref_encodings)[0] 27 | percent = face_recognition.face_distance([id_card_encodings], ref_encodings)[0] 28 | percent = (1 - 
percent) * 100.00 29 | 30 | return result, percent 31 | except: 32 | return False, 0 -------------------------------------------------------------------------------- /api/lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Praneet9/Docify/a936014750dedf4a6b5a84918bbbf66cd63109de/api/lib/__init__.py -------------------------------------------------------------------------------- /api/lib/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .imdb import imdb 2 | from .pascal_voc import pascal_voc 3 | from . import factory 4 | 5 | -------------------------------------------------------------------------------- /api/lib/datasets/factory.py: -------------------------------------------------------------------------------- 1 | from .pascal_voc import pascal_voc 2 | __sets = {} 3 | def _selective_search_IJCV_top_k(split, year, top_k): 4 | imdb = pascal_voc(split, year) 5 | imdb.roidb_handler = imdb.selective_search_IJCV_roidb 6 | imdb.config['top_k'] = top_k 7 | return imdb 8 | # Set up voc__ using selective search "fast" mode 9 | for year in ['2007', '2012', '0712']: 10 | for split in ['train', 'val', 'trainval', 'test']: 11 | name = 'voc_{}_{}'.format(year, split) 12 | __sets[name] = (lambda split=split, year=year: 13 | pascal_voc(split, year)) 14 | 15 | def get_imdb(name): 16 | """Get an imdb (image database) by name.""" 17 | if name not in __sets: 18 | print((list_imdbs())) 19 | raise KeyError('Unknown dataset: {}'.format(name)) 20 | return __sets[name]() 21 | 22 | def list_imdbs(): 23 | """List all registered imdbs.""" 24 | return list(__sets.keys()) 25 | -------------------------------------------------------------------------------- /api/lib/datasets/imdb.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path as osp 3 | import PIL 4 | import numpy as np 5 | import scipy.sparse 6 | from lib.utils.bbox import bbox_overlaps 7 | from lib.fast_rcnn.config import cfg 8 | 9 | class imdb(object): 10 | 11 | def __init__(self, name): 12 | self._name = name 13 | self._num_classes = 0 14 | self._classes = [] 15 | self._image_index = [] 16 | self._obj_proposer = 'selective_search' 17 | self._roidb = None 18 | print(self.default_roidb) 19 | self._roidb_handler = self.default_roidb 20 | # Use this dict for storing dataset specific config options 21 | self.config = {} 22 | 23 | @property 24 | def name(self): 25 | return self._name 26 | 27 | @property 28 | def num_classes(self): 29 | return len(self._classes) 30 | 31 | @property 32 | def classes(self): 33 | return self._classes 34 | 35 | @property 36 | def image_index(self): 37 | return self._image_index 38 | 39 | @property 40 | def roidb_handler(self): 41 | return self._roidb_handler 42 | 43 | @roidb_handler.setter 44 | def roidb_handler(self, val): 45 | self._roidb_handler = val 46 | 47 | def set_proposal_method(self, method): 48 | method = eval('self.' 
+ method + '_roidb') 49 | self.roidb_handler = method 50 | 51 | @property 52 | def roidb(self): 53 | # A roidb is a list of dictionaries, each with the following keys: 54 | # boxes 55 | # gt_overlaps 56 | # gt_classes 57 | # flipped 58 | if self._roidb is not None: 59 | return self._roidb 60 | self._roidb = self.roidb_handler() 61 | return self._roidb 62 | 63 | @property 64 | def cache_path(self): 65 | cache_path = osp.abspath(osp.join(cfg.DATA_DIR, 'cache')) 66 | if not os.path.exists(cache_path): 67 | os.makedirs(cache_path) 68 | return cache_path 69 | 70 | @property 71 | def num_images(self): 72 | return len(self.image_index) 73 | 74 | def image_path_at(self, i): 75 | raise NotImplementedError 76 | 77 | def default_roidb(self): 78 | raise NotImplementedError 79 | 80 | def _get_widths(self): 81 | return [PIL.Image.open(self.image_path_at(i)).size[0] 82 | for i in range(self.num_images)] 83 | 84 | def append_flipped_images(self): 85 | num_images = self.num_images 86 | widths = self._get_widths() 87 | for i in range(num_images): 88 | boxes = self.roidb[i]['boxes'].copy() 89 | oldx1 = boxes[:, 0].copy() 90 | oldx2 = boxes[:, 2].copy() 91 | boxes[:, 0] = widths[i] - oldx2 - 1 92 | boxes[:, 2] = widths[i] - oldx1 - 1 93 | for b in range(len(boxes)): 94 | if boxes[b][2]< boxes[b][0]: 95 | boxes[b][0] = 0 96 | assert (boxes[:, 2] >= boxes[:, 0]).all() 97 | entry = {'boxes' : boxes, 98 | 'gt_overlaps' : self.roidb[i]['gt_overlaps'], 99 | 'gt_classes' : self.roidb[i]['gt_classes'], 100 | 'flipped' : True} 101 | 102 | if 'gt_ishard' in self.roidb[i] and 'dontcare_areas' in self.roidb[i]: 103 | entry['gt_ishard'] = self.roidb[i]['gt_ishard'].copy() 104 | dontcare_areas = self.roidb[i]['dontcare_areas'].copy() 105 | oldx1 = dontcare_areas[:, 0].copy() 106 | oldx2 = dontcare_areas[:, 2].copy() 107 | dontcare_areas[:, 0] = widths[i] - oldx2 - 1 108 | dontcare_areas[:, 2] = widths[i] - oldx1 - 1 109 | entry['dontcare_areas'] = dontcare_areas 110 | 111 | self.roidb.append(entry) 112 | 113 | self._image_index = self._image_index * 2 114 | 115 | 116 | def create_roidb_from_box_list(self, box_list, gt_roidb): 117 | assert len(box_list) == self.num_images, \ 118 | 'Number of boxes must match number of ground-truth images' 119 | roidb = [] 120 | for i in range(self.num_images): 121 | boxes = box_list[i] 122 | num_boxes = boxes.shape[0] 123 | overlaps = np.zeros((num_boxes, self.num_classes), dtype=np.float32) 124 | 125 | if gt_roidb is not None and gt_roidb[i]['boxes'].size > 0: 126 | gt_boxes = gt_roidb[i]['boxes'] 127 | gt_classes = gt_roidb[i]['gt_classes'] 128 | gt_overlaps = bbox_overlaps(boxes.astype(np.float), 129 | gt_boxes.astype(np.float)) 130 | argmaxes = gt_overlaps.argmax(axis=1) 131 | maxes = gt_overlaps.max(axis=1) 132 | I = np.where(maxes > 0)[0] 133 | overlaps[I, gt_classes[argmaxes[I]]] = maxes[I] 134 | 135 | overlaps = scipy.sparse.csr_matrix(overlaps) 136 | roidb.append({ 137 | 'boxes' : boxes, 138 | 'gt_classes' : np.zeros((num_boxes,), dtype=np.int32), 139 | 'gt_overlaps' : overlaps, 140 | 'flipped' : False, 141 | 'seg_areas' : np.zeros((num_boxes,), dtype=np.float32), 142 | }) 143 | return roidb 144 | 145 | @staticmethod 146 | def merge_roidbs(a, b): 147 | assert len(a) == len(b) 148 | for i in range(len(a)): 149 | a[i]['boxes'] = np.vstack((a[i]['boxes'], b[i]['boxes'])) 150 | a[i]['gt_classes'] = np.hstack((a[i]['gt_classes'], 151 | b[i]['gt_classes'])) 152 | a[i]['gt_overlaps'] = scipy.sparse.vstack([a[i]['gt_overlaps'], 153 | b[i]['gt_overlaps']]) 154 | a[i]['seg_areas'] = 
np.hstack((a[i]['seg_areas'], 155 | b[i]['seg_areas'])) 156 | return a 157 | 158 | -------------------------------------------------------------------------------- /api/lib/datasets/pascal_voc.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import os 3 | import numpy as np 4 | import scipy.sparse 5 | try: 6 | import cPickle as pickle 7 | except: 8 | import pickle 9 | import uuid 10 | import xml.etree.ElementTree as ET 11 | from .imdb import imdb 12 | from lib.fast_rcnn.config import cfg 13 | 14 | class pascal_voc(imdb): 15 | def __init__(self, image_set, year, devkit_path=None): 16 | imdb.__init__(self, 'voc_' + year + '_' + image_set) 17 | self._year = year 18 | self._image_set = image_set 19 | self._devkit_path = self._get_default_path() if devkit_path is None \ 20 | else devkit_path 21 | self._data_path = os.path.join(self._devkit_path, 'VOC' + self._year) 22 | self._classes = ('__background__', # always index 0 23 | 'text') 24 | 25 | self._class_to_ind = dict(list(zip(self.classes, list(range(self.num_classes))))) 26 | self._image_ext = '.jpg' 27 | self._image_index = self._load_image_set_index() 28 | # Default to roidb handler 29 | #self._roidb_handler = self.selective_search_roidb 30 | self._roidb_handler = self.gt_roidb 31 | self._salt = str(uuid.uuid4()) 32 | self._comp_id = 'comp4' 33 | 34 | # PASCAL specific config options 35 | self.config = {'cleanup' : True, 36 | 'use_salt' : True, 37 | 'use_diff' : False, 38 | 'matlab_eval' : False, 39 | 'rpn_file' : None, 40 | 'min_size' : 2} 41 | 42 | assert os.path.exists(self._devkit_path), \ 43 | 'VOCdevkit path does not exist: {}'.format(self._devkit_path) 44 | assert os.path.exists(self._data_path), \ 45 | 'Path does not exist: {}'.format(self._data_path) 46 | 47 | def image_path_at(self, i): 48 | """ 49 | Return the absolute path to image i in the image sequence. 50 | """ 51 | return self.image_path_from_index(self._image_index[i]) 52 | 53 | def image_path_from_index(self, index): 54 | """ 55 | Construct an image path from the image's "index" identifier. 56 | """ 57 | image_path = os.path.join(self._data_path, 'JPEGImages', 58 | index + self._image_ext) 59 | assert os.path.exists(image_path), \ 60 | 'Path does not exist: {}'.format(image_path) 61 | return image_path 62 | 63 | def _load_image_set_index(self): 64 | """ 65 | Load the indexes listed in this dataset's image set file. 66 | """ 67 | # Example path to image set file: 68 | # self._devkit_path + /VOCdevkit2007/VOC2007/ImageSets/Main/val.txt 69 | image_set_file = os.path.join(self._data_path, 'ImageSets', 'Main', 70 | self._image_set + '.txt') 71 | assert os.path.exists(image_set_file), \ 72 | 'Path does not exist: {}'.format(image_set_file) 73 | with open(image_set_file) as f: 74 | image_index = [x.strip() for x in f.readlines()] 75 | return image_index 76 | 77 | def _get_default_path(self): 78 | """ 79 | Return the default path where PASCAL VOC is expected to be installed. 80 | """ 81 | return os.path.join(cfg.DATA_DIR, 'VOCdevkit' + self._year) 82 | 83 | def gt_roidb(self): 84 | """ 85 | Return the database of ground-truth regions of interest. 86 | 87 | This function loads/saves from/to a cache file to speed up future calls. 
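    If the cache file exists, the pickled roidb is returned as-is; otherwise the
    VOC XML annotations are parsed via _load_pascal_annotation() for every image
    index and the result is written back to the cache.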
88 | """ 89 | cache_file = os.path.join(self.cache_path, self.name + '_gt_roidb.pkl') 90 | if os.path.exists(cache_file): 91 | with open(cache_file, 'rb') as fid: 92 | roidb = pickle.load(fid) 93 | print('{} gt roidb loaded from {}'.format(self.name, cache_file)) 94 | return roidb 95 | 96 | gt_roidb = [self._load_pascal_annotation(index) 97 | for index in self.image_index] 98 | with open(cache_file, 'wb') as fid: 99 | pickle.dump(gt_roidb, fid, pickle.HIGHEST_PROTOCOL) 100 | print('wrote gt roidb to {}'.format(cache_file)) 101 | 102 | return gt_roidb 103 | 104 | def rpn_roidb(self): 105 | if int(self._year) == 2007 or self._image_set != 'test': 106 | gt_roidb = self.gt_roidb() 107 | rpn_roidb = self._load_rpn_roidb(gt_roidb) 108 | roidb = imdb.merge_roidbs(gt_roidb, rpn_roidb) 109 | else: 110 | roidb = self._load_rpn_roidb(None) 111 | 112 | return roidb 113 | 114 | def _load_rpn_roidb(self, gt_roidb): 115 | filename = self.config['rpn_file'] 116 | print('loading {}'.format(filename)) 117 | assert os.path.exists(filename), \ 118 | 'rpn data not found at: {}'.format(filename) 119 | with open(filename, 'rb') as f: 120 | box_list = pickle.load(f) 121 | return self.create_roidb_from_box_list(box_list, gt_roidb) 122 | 123 | 124 | def _load_pascal_annotation(self, index): 125 | """ 126 | Load image and bounding boxes info from XML file in the PASCAL VOC 127 | format. 128 | """ 129 | filename = os.path.join(self._data_path, 'Annotations', index + '.xml') 130 | tree = ET.parse(filename) 131 | objs = tree.findall('object') 132 | num_objs = len(objs) 133 | 134 | boxes = np.zeros((num_objs, 4), dtype=np.uint16) 135 | gt_classes = np.zeros((num_objs), dtype=np.int32) 136 | overlaps = np.zeros((num_objs, self.num_classes), dtype=np.float32) 137 | # "Seg" area for pascal is just the box area 138 | seg_areas = np.zeros((num_objs), dtype=np.float32) 139 | ishards = np.zeros((num_objs), dtype=np.int32) 140 | 141 | # Load object bounding boxes into a data frame. 
142 | for ix, obj in enumerate(objs): 143 | bbox = obj.find('bndbox') 144 | # Make pixel indexes 0-based 145 | x1 = float(bbox.find('xmin').text) 146 | y1 = float(bbox.find('ymin').text) 147 | x2 = float(bbox.find('xmax').text) 148 | y2 = float(bbox.find('ymax').text) 149 | diffc = obj.find('difficult') 150 | difficult = 0 if diffc == None else int(diffc.text) 151 | ishards[ix] = difficult 152 | 153 | cls = self._class_to_ind[obj.find('name').text.lower().strip()] 154 | boxes[ix, :] = [x1, y1, x2, y2] 155 | gt_classes[ix] = cls 156 | overlaps[ix, cls] = 1.0 157 | seg_areas[ix] = (x2 - x1 + 1) * (y2 - y1 + 1) 158 | 159 | overlaps = scipy.sparse.csr_matrix(overlaps) 160 | 161 | return {'boxes' : boxes, 162 | 'gt_classes': gt_classes, 163 | 'gt_ishard': ishards, 164 | 'gt_overlaps' : overlaps, 165 | 'flipped' : False, 166 | 'seg_areas' : seg_areas} 167 | 168 | def _get_comp_id(self): 169 | comp_id = (self._comp_id + '_' + self._salt if self.config['use_salt'] 170 | else self._comp_id) 171 | return comp_id 172 | 173 | def _get_voc_results_file_template(self): 174 | filename = self._get_comp_id() + '_det_' + self._image_set + '_{:s}.txt' 175 | filedir = os.path.join(self._devkit_path, 'results', 'VOC' + self._year, 'Main') 176 | if not os.path.exists(filedir): 177 | os.makedirs(filedir) 178 | path = os.path.join(filedir, filename) 179 | return path 180 | 181 | def _write_voc_results_file(self, all_boxes): 182 | for cls_ind, cls in enumerate(self.classes): 183 | if cls == '__background__': 184 | continue 185 | print('Writing {} VOC results file'.format(cls)) 186 | filename = self._get_voc_results_file_template().format(cls) 187 | with open(filename, 'wt') as f: 188 | for im_ind, index in enumerate(self.image_index): 189 | dets = all_boxes[cls_ind][im_ind] 190 | if dets == []: 191 | continue 192 | # the VOCdevkit expects 1-based indices 193 | for k in range(dets.shape[0]): 194 | f.write('{:s} {:.3f} {:.1f} {:.1f} {:.1f} {:.1f}\n'. 195 | format(index, dets[k, -1], 196 | dets[k, 0] + 1, dets[k, 1] + 1, 197 | dets[k, 2] + 1, dets[k, 3] + 1)) 198 | 199 | 200 | if __name__ == '__main__': 201 | d = pascal_voc('trainval', '2007') 202 | res = d.roidb 203 | from IPython import embed; embed() 204 | -------------------------------------------------------------------------------- /api/lib/fast_rcnn/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Praneet9/Docify/a936014750dedf4a6b5a84918bbbf66cd63109de/api/lib/fast_rcnn/__init__.py -------------------------------------------------------------------------------- /api/lib/fast_rcnn/bbox_transform.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def bbox_transform(ex_rois, gt_rois): 4 | """ 5 | computes the distance from ground-truth boxes to the given boxes, normed by their size 6 | :param ex_rois: n * 4 numpy array, given boxes 7 | :param gt_rois: n * 4 numpy array, ground-truth boxes 8 | :return: deltas: n * 4 numpy array, ground-truth boxes 9 | """ 10 | ex_widths = ex_rois[:, 2] - ex_rois[:, 0] + 1.0 11 | ex_heights = ex_rois[:, 3] - ex_rois[:, 1] + 1.0 12 | ex_ctr_x = ex_rois[:, 0] + 0.5 * ex_widths 13 | ex_ctr_y = ex_rois[:, 1] + 0.5 * ex_heights 14 | 15 | assert np.min(ex_widths) > 0.1 and np.min(ex_heights) > 0.1, \ 16 | 'Invalid boxes found: {} {}'. 
\ 17 | format(ex_rois[np.argmin(ex_widths), :], ex_rois[np.argmin(ex_heights), :]) 18 | 19 | gt_widths = gt_rois[:, 2] - gt_rois[:, 0] + 1.0 20 | gt_heights = gt_rois[:, 3] - gt_rois[:, 1] + 1.0 21 | gt_ctr_x = gt_rois[:, 0] + 0.5 * gt_widths 22 | gt_ctr_y = gt_rois[:, 1] + 0.5 * gt_heights 23 | 24 | # warnings.catch_warnings() 25 | # warnings.filterwarnings('error') 26 | targets_dx = (gt_ctr_x - ex_ctr_x) / ex_widths 27 | targets_dy = (gt_ctr_y - ex_ctr_y) / ex_heights 28 | targets_dw = np.log(gt_widths / ex_widths) 29 | targets_dh = np.log(gt_heights / ex_heights) 30 | 31 | targets = np.vstack( 32 | (targets_dx, targets_dy, targets_dw, targets_dh)).transpose() 33 | 34 | return targets 35 | 36 | def bbox_transform_inv(boxes, deltas): 37 | 38 | boxes = boxes.astype(deltas.dtype, copy=False) 39 | 40 | widths = boxes[:, 2] - boxes[:, 0] + 1.0 41 | heights = boxes[:, 3] - boxes[:, 1] + 1.0 42 | ctr_x = boxes[:, 0] + 0.5 * widths 43 | ctr_y = boxes[:, 1] + 0.5 * heights 44 | 45 | dx = deltas[:, 0::4] 46 | dy = deltas[:, 1::4] 47 | dw = deltas[:, 2::4] 48 | dh = deltas[:, 3::4] 49 | 50 | pred_ctr_x = ctr_x[:, np.newaxis] 51 | pred_ctr_y = dy * heights[:, np.newaxis] + ctr_y[:, np.newaxis] 52 | pred_w = widths[:, np.newaxis] 53 | pred_h = np.exp(dh) * heights[:, np.newaxis] 54 | 55 | pred_boxes = np.zeros(deltas.shape, dtype=deltas.dtype) 56 | # x1 57 | pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w 58 | # y1 59 | pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h 60 | # x2 61 | pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w 62 | # y2 63 | pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h 64 | 65 | return pred_boxes 66 | 67 | def clip_boxes(boxes, im_shape): 68 | """ 69 | Clip boxes to image boundaries. 70 | """ 71 | 72 | # x1 >= 0 73 | boxes[:, 0::4] = np.maximum(np.minimum(boxes[:, 0::4], im_shape[1] - 1), 0) 74 | # y1 >= 0 75 | boxes[:, 1::4] = np.maximum(np.minimum(boxes[:, 1::4], im_shape[0] - 1), 0) 76 | # x2 < im_shape[1] 77 | boxes[:, 2::4] = np.maximum(np.minimum(boxes[:, 2::4], im_shape[1] - 1), 0) 78 | # y2 < im_shape[0] 79 | boxes[:, 3::4] = np.maximum(np.minimum(boxes[:, 3::4], im_shape[0] - 1), 0) 80 | return boxes 81 | -------------------------------------------------------------------------------- /api/lib/fast_rcnn/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path as osp 3 | import numpy as np 4 | from time import strftime, localtime 5 | from easydict import EasyDict as edict 6 | 7 | __C = edict() 8 | cfg = __C 9 | 10 | # Default GPU device id 11 | __C.GPU_ID = 0 12 | 13 | # Training options 14 | __C.IS_RPN = True 15 | __C.ANCHOR_SCALES = [16] 16 | __C.NCLASSES = 2 17 | __C.USE_GPU_NMS = True 18 | # multiscale training and testing 19 | __C.IS_MULTISCALE = False 20 | __C.IS_EXTRAPOLATING = True 21 | 22 | __C.REGION_PROPOSAL = 'RPN' 23 | 24 | __C.NET_NAME = 'VGGnet' 25 | __C.SUBCLS_NAME = 'voxel_exemplars' 26 | 27 | __C.TRAIN = edict() 28 | # Adam, Momentum, RMS 29 | __C.TRAIN.restore = 0 30 | __C.TRAIN.max_steps = 100000 31 | __C.TRAIN.SOLVER = 'Momentum' 32 | # learning rate 33 | __C.TRAIN.WEIGHT_DECAY = 0.0005 34 | __C.TRAIN.LEARNING_RATE = 0.001 35 | __C.TRAIN.MOMENTUM = 0.9 36 | __C.TRAIN.GAMMA = 0.1 37 | __C.TRAIN.STEPSIZE = 50000 38 | __C.TRAIN.DISPLAY = 10 39 | __C.TRAIN.LOG_IMAGE_ITERS = 100 40 | __C.TRAIN.OHEM = False 41 | __C.TRAIN.RANDOM_DOWNSAMPLE = False 42 | 43 | # Scales to compute real features 44 | __C.TRAIN.SCALES_BASE = (0.25, 0.5, 1.0, 2.0, 3.0) 45 | __C.TRAIN.KERNEL_SIZE = 5 46 | 
__C.TRAIN.ASPECTS= (1,) 47 | __C.TRAIN.SCALES = (600,) 48 | 49 | # Max pixel size of the longest side of a scaled input image 50 | __C.TRAIN.MAX_SIZE = 1000 51 | 52 | # Images to use per minibatch 53 | __C.TRAIN.IMS_PER_BATCH = 2 54 | 55 | # Minibatch size (number of regions of interest [ROIs]) 56 | __C.TRAIN.BATCH_SIZE = 128 57 | 58 | # Fraction of minibatch that is labeled foreground (i.e. class > 0) 59 | __C.TRAIN.FG_FRACTION = 0.25 60 | 61 | # Overlap threshold for a ROI to be considered foreground (if >= FG_THRESH) 62 | __C.TRAIN.FG_THRESH = 0.5 63 | 64 | # Overlap threshold for a ROI to be considered background (class = 0 if 65 | # overlap in [LO, HI)) 66 | __C.TRAIN.BG_THRESH_HI = 0.5 67 | __C.TRAIN.BG_THRESH_LO = 0.1 68 | 69 | # Use horizontally-flipped images during training? 70 | __C.TRAIN.USE_FLIPPED = True 71 | 72 | # Train bounding-box regressors 73 | __C.TRAIN.BBOX_REG = True 74 | 75 | # Overlap required between a ROI and ground-truth box in order for that ROI to 76 | # be used as a bounding-box regression training example 77 | __C.TRAIN.BBOX_THRESH = 0.5 78 | 79 | # Iterations between snapshots 80 | __C.TRAIN.SNAPSHOT_ITERS = 5000 81 | 82 | # solver.prototxt specifies the snapshot path prefix, this adds an optional 83 | # infix to yield the path: [_]_iters_XYZ.caffemodel 84 | __C.TRAIN.SNAPSHOT_PREFIX = 'VGGnet_fast_rcnn' 85 | __C.TRAIN.SNAPSHOT_INFIX = '' 86 | 87 | # Use a prefetch thread in roi_data_layer.layer 88 | # So far I haven't found this useful; likely more engineering work is required 89 | __C.TRAIN.USE_PREFETCH = False 90 | 91 | # Normalize the targets (subtract empirical mean, divide by empirical stddev) 92 | __C.TRAIN.BBOX_NORMALIZE_TARGETS = True 93 | # Deprecated (inside weights) 94 | # used for assigning weights for each coords (x1, y1, w, h) 95 | __C.TRAIN.BBOX_INSIDE_WEIGHTS = (1.0, 1.0, 1.0, 1.0) 96 | # Normalize the targets using "precomputed" (or made up) means and stdevs 97 | # (BBOX_NORMALIZE_TARGETS must also be True) 98 | __C.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED = True 99 | __C.TRAIN.BBOX_NORMALIZE_MEANS = (0.0, 0.0, 0.0, 0.0) 100 | __C.TRAIN.BBOX_NORMALIZE_STDS = (0.1, 0.1, 0.2, 0.2) 101 | # faster rcnn dont use pre-generated rois by selective search 102 | # __C.TRAIN.BBOX_NORMALIZE_STDS = (1, 1, 1, 1) 103 | 104 | # Train using these proposals 105 | __C.TRAIN.PROPOSAL_METHOD = 'selective_search' 106 | 107 | # Make minibatches from images that have similar aspect ratios (i.e. both 108 | # tall and thin or both short and wide) in order to avoid wasting computation 109 | # on zero-padding. 
110 | __C.TRAIN.ASPECT_GROUPING = True 111 | # preclude rois intersected with dontcare areas above the value 112 | __C.TRAIN.DONTCARE_AREA_INTERSECTION_HI = 0.5 113 | __C.TRAIN.PRECLUDE_HARD_SAMPLES = True 114 | # Use RPN to detect objects 115 | __C.TRAIN.HAS_RPN = True 116 | # IOU >= thresh: positive example 117 | __C.TRAIN.RPN_POSITIVE_OVERLAP = 0.7 118 | # IOU < thresh: negative example 119 | __C.TRAIN.RPN_NEGATIVE_OVERLAP = 0.3 120 | # If an anchor statisfied by positive and negative conditions set to negative 121 | __C.TRAIN.RPN_CLOBBER_POSITIVES = False 122 | # Max number of foreground examples 123 | __C.TRAIN.RPN_FG_FRACTION = 0.5 124 | # Total number of examples 125 | __C.TRAIN.RPN_BATCHSIZE = 256 126 | # NMS threshold used on RPN proposals 127 | __C.TRAIN.RPN_NMS_THRESH = 0.7 128 | # Number of top scoring boxes to keep before apply NMS to RPN proposals 129 | __C.TRAIN.RPN_PRE_NMS_TOP_N = 12000 130 | # Number of top scoring boxes to keep after applying NMS to RPN proposals 131 | __C.TRAIN.RPN_POST_NMS_TOP_N = 2000 132 | # Proposal height and width both need to be greater than RPN_MIN_SIZE (at orig image scale) 133 | __C.TRAIN.RPN_MIN_SIZE = 8 134 | # Deprecated (outside weights) 135 | __C.TRAIN.RPN_BBOX_INSIDE_WEIGHTS = (1.0, 1.0, 1.0, 1.0) 136 | # Give the positive RPN examples weight of p * 1 / {num positives} 137 | # and give negatives a weight of (1 - p) 138 | # Set to -1.0 to use uniform example weighting 139 | __C.TRAIN.RPN_POSITIVE_WEIGHT = -1.0 140 | # __C.TRAIN.RPN_POSITIVE_WEIGHT = 0.5 141 | 142 | 143 | # 144 | # Testing options 145 | # 146 | 147 | __C.TEST = edict() 148 | __C.TEST.checkpoints_path = "checkpoints/" 149 | __C.TEST.DETECT_MODE = "H"#H/O for horizontal/oriented mode 150 | # Scales to use during testing (can list multiple scales) 151 | # Each scale is the pixel size of an image's shortest side 152 | __C.TEST.SCALES = (600,) 153 | 154 | # Max pixel size of the longest side of a scaled input image 155 | __C.TEST.MAX_SIZE = 1000 156 | 157 | # Overlap threshold used for non-maximum suppression (suppress boxes with 158 | # IoU >= this threshold) 159 | __C.TEST.NMS = 0.3 160 | 161 | # Experimental: treat the (K+1) units in the cls_score layer as linear 162 | # predictors (trained, eg, with one-vs-rest SVMs). 163 | __C.TEST.SVM = False 164 | 165 | # Test using bounding-box regressors 166 | __C.TEST.BBOX_REG = True 167 | 168 | # Propose boxes 169 | __C.TEST.HAS_RPN = True 170 | 171 | # Test using these proposals 172 | __C.TEST.PROPOSAL_METHOD = 'selective_search' 173 | 174 | ## NMS threshold used on RPN proposals 175 | __C.TEST.RPN_NMS_THRESH = 0.7 176 | ## Number of top scoring boxes to keep before apply NMS to RPN proposals 177 | #__C.TEST.RPN_PRE_NMS_TOP_N = 6000 178 | __C.TEST.RPN_PRE_NMS_TOP_N = 12000 179 | ## Number of top scoring boxes to keep after applying NMS to RPN proposals 180 | __C.TEST.RPN_POST_NMS_TOP_N = 1000 181 | #__C.TEST.RPN_POST_NMS_TOP_N = 2000 182 | # Proposal height and width both need to be greater than RPN_MIN_SIZE (at orig image scale) 183 | __C.TEST.RPN_MIN_SIZE = 8 184 | 185 | 186 | # 187 | # MISC 188 | # 189 | 190 | # The mapping from image coordinates to feature map coordinates might cause 191 | # some boxes that are distinct in image space to become identical in feature 192 | # coordinates. If DEDUP_BOXES > 0, then DEDUP_BOXES is used as the scale factor 193 | # for identifying duplicate boxes. 194 | # 1/16 is correct for {Alex,Caffe}Net, VGG_CNN_M_1024, and VGG16 195 | __C.DEDUP_BOXES = 1./16. 
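# Example (sketch): overrides are merged on top of these defaults at run time.
#   from lib.fast_rcnn.config import cfg, cfg_from_file
#   cfg_from_file('overrides.yml')   # hypothetical YAML, e.g. TEST: {DETECT_MODE: 'O'}
#   print(cfg.TEST.DETECT_MODE)      # 'O' after the merge, 'H' by default
# cfg_from_file() and cfg_from_list() are defined at the bottom of this file.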
196 | 197 | # Pixel mean values (BGR order) as a (1, 1, 3) array 198 | # We use the same pixel mean for all networks even though it's not exactly what 199 | # they were trained with 200 | __C.PIXEL_MEANS = np.array([[[102.9801, 115.9465, 122.7717]]]) 201 | 202 | # For reproducibility 203 | #__C.RNG_SEED = 3 204 | __C.RNG_SEED = 3 205 | 206 | # A small number that's used many times 207 | __C.EPS = 1e-14 208 | 209 | # Root directory of project 210 | __C.ROOT_DIR = osp.abspath(osp.join(osp.dirname(__file__), '..', '..')) 211 | 212 | # Data directory 213 | __C.DATA_DIR = osp.abspath(osp.join(__C.ROOT_DIR, 'data')) 214 | 215 | # Model directory 216 | __C.MODELS_DIR = osp.abspath(osp.join(__C.ROOT_DIR, 'models', 'pascal_voc')) 217 | 218 | # Name (or path to) the matlab executable 219 | __C.MATLAB = 'matlab' 220 | 221 | # Place outputs under an experiments directory 222 | __C.EXP_DIR = 'default' 223 | __C.LOG_DIR = 'default' 224 | 225 | # Use GPU implementation of non-maximum suppression 226 | __C.USE_GPU_NMS = True 227 | 228 | 229 | 230 | def get_output_dir(imdb, weights_filename): 231 | """Return the directory where experimental artifacts are placed. 232 | If the directory does not exist, it is created. 233 | 234 | A canonical path is built using the name from an imdb and a network 235 | (if not None). 236 | """ 237 | outdir = osp.abspath(osp.join(__C.ROOT_DIR, 'output', __C.EXP_DIR, imdb.name)) 238 | if weights_filename is not None: 239 | outdir = osp.join(outdir, weights_filename) 240 | if not os.path.exists(outdir): 241 | os.makedirs(outdir) 242 | return outdir 243 | 244 | def get_log_dir(imdb): 245 | """Return the directory where experimental artifacts are placed. 246 | If the directory does not exist, it is created. 247 | A canonical path is built using the name from an imdb and a network 248 | (if not None). 249 | """ 250 | log_dir = osp.abspath(\ 251 | osp.join(__C.ROOT_DIR, 'logs', __C.LOG_DIR, imdb.name, strftime("%Y-%m-%d-%H-%M-%S", localtime()))) 252 | if not os.path.exists(log_dir): 253 | os.makedirs(log_dir) 254 | return log_dir 255 | 256 | def _merge_a_into_b(a, b): 257 | """Merge config dictionary a into config dictionary b, clobbering the 258 | options in b whenever they are also specified in a. 259 | """ 260 | if type(a) is not edict: 261 | return 262 | 263 | for k, v in a.items(): 264 | # a must specify keys that are in b 265 | if k not in b: 266 | raise KeyError('{} is not a valid config key'.format(k)) 267 | 268 | # the types must match, too 269 | old_type = type(b[k]) 270 | if old_type is not type(v): 271 | if isinstance(b[k], np.ndarray): 272 | v = np.array(v, dtype=b[k].dtype) 273 | else: 274 | raise ValueError(('Type mismatch ({} vs. 
{}) ' 275 | 'for config key: {}').format(type(b[k]), 276 | type(v), k)) 277 | 278 | # recursively merge dicts 279 | if type(v) is edict: 280 | try: 281 | _merge_a_into_b(a[k], b[k]) 282 | except: 283 | print(('Error under config key: {}'.format(k))) 284 | raise 285 | else: 286 | b[k] = v 287 | 288 | def cfg_from_file(filename): 289 | """Load a config file and merge it into the default options.""" 290 | import yaml 291 | with open(filename, 'r') as f: 292 | yaml_cfg = edict(yaml.load(f)) 293 | 294 | _merge_a_into_b(yaml_cfg, __C) 295 | 296 | def cfg_from_list(cfg_list): 297 | """Set config keys via list (e.g., from command line).""" 298 | from ast import literal_eval 299 | assert len(cfg_list) % 2 == 0 300 | for k, v in zip(cfg_list[0::2], cfg_list[1::2]): 301 | key_list = k.split('.') 302 | d = __C 303 | for subkey in key_list[:-1]: 304 | assert subkey in d 305 | d = d[subkey] 306 | subkey = key_list[-1] 307 | assert subkey in d 308 | try: 309 | value = literal_eval(v) 310 | except: 311 | # handle the case when v is a string literal 312 | value = v 313 | assert type(value) == type(d[subkey]), \ 314 | 'type {} does not match original type {}'.format( 315 | type(value), type(d[subkey])) 316 | d[subkey] = value 317 | -------------------------------------------------------------------------------- /api/lib/fast_rcnn/nms_wrapper.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from .config import cfg 3 | pure_python_nms = False 4 | try: 5 | from lib.utils.gpu_nms import gpu_nms 6 | from ..utils.cython_nms import nms as cython_nms 7 | except ImportError: 8 | pure_python_nms = True 9 | 10 | 11 | def nms(dets, thresh): 12 | if dets.shape[0] == 0: 13 | return [] 14 | if pure_python_nms: 15 | # print("Fall back to pure python nms") 16 | return py_cpu_nms(dets, thresh) 17 | if cfg.USE_GPU_NMS: 18 | return gpu_nms(dets, thresh, device_id=cfg.GPU_ID) 19 | else: 20 | return cython_nms(dets, thresh) 21 | 22 | 23 | def py_cpu_nms(dets, thresh): 24 | x1 = dets[:, 0] 25 | y1 = dets[:, 1] 26 | x2 = dets[:, 2] 27 | y2 = dets[:, 3] 28 | scores = dets[:, 4] 29 | 30 | areas = (x2 - x1 + 1) * (y2 - y1 + 1) 31 | order = scores.argsort()[::-1] 32 | 33 | keep = [] 34 | while order.size > 0: 35 | i = order[0] 36 | keep.append(i) 37 | xx1 = np.maximum(x1[i], x1[order[1:]]) 38 | yy1 = np.maximum(y1[i], y1[order[1:]]) 39 | xx2 = np.minimum(x2[i], x2[order[1:]]) 40 | yy2 = np.minimum(y2[i], y2[order[1:]]) 41 | w = np.maximum(0.0, xx2 - xx1 + 1) 42 | h = np.maximum(0.0, yy2 - yy1 + 1) 43 | inter = w * h 44 | ovr = inter / (areas[i] + areas[order[1:]] - inter) 45 | inds = np.where(ovr <= thresh)[0] 46 | order = order[inds + 1] 47 | return keep 48 | -------------------------------------------------------------------------------- /api/lib/fast_rcnn/test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import cv2 3 | from .config import cfg 4 | from lib.utils.blob import im_list_to_blob 5 | 6 | 7 | def _get_image_blob(im): 8 | im_orig = im.astype(np.float32, copy=True) 9 | im_orig -= cfg.PIXEL_MEANS 10 | 11 | im_shape = im_orig.shape 12 | im_size_min = np.min(im_shape[0:2]) 13 | im_size_max = np.max(im_shape[0:2]) 14 | 15 | processed_ims = [] 16 | im_scale_factors = [] 17 | 18 | for target_size in cfg.TEST.SCALES: 19 | im_scale = float(target_size) / float(im_size_min) 20 | # Prevent the biggest axis from being more than MAX_SIZE 21 | if np.round(im_scale * im_size_max) > cfg.TEST.MAX_SIZE: 22 | 
im_scale = float(cfg.TEST.MAX_SIZE) / float(im_size_max) 23 | im = cv2.resize(im_orig, None, None, fx=im_scale, fy=im_scale, 24 | interpolation=cv2.INTER_LINEAR) 25 | im_scale_factors.append(im_scale) 26 | processed_ims.append(im) 27 | 28 | # Create a blob to hold the input images 29 | blob = im_list_to_blob(processed_ims) 30 | 31 | return blob, np.array(im_scale_factors) 32 | 33 | 34 | def _get_blobs(im, rois): 35 | blobs = {'data' : None, 'rois' : None} 36 | blobs['data'], im_scale_factors = _get_image_blob(im) 37 | return blobs, im_scale_factors 38 | 39 | 40 | def test_ctpn(sess, net, im, boxes=None): 41 | blobs, im_scales = _get_blobs(im, boxes) 42 | if cfg.TEST.HAS_RPN: 43 | im_blob = blobs['data'] 44 | blobs['im_info'] = np.array( 45 | [[im_blob.shape[1], im_blob.shape[2], im_scales[0]]], 46 | dtype=np.float32) 47 | # forward pass 48 | if cfg.TEST.HAS_RPN: 49 | feed_dict = {net.data: blobs['data'], net.im_info: blobs['im_info'], net.keep_prob: 1.0} 50 | 51 | rois = sess.run([net.get_output('rois')[0]],feed_dict=feed_dict) 52 | rois=rois[0] 53 | 54 | scores = rois[:, 0] 55 | if cfg.TEST.HAS_RPN: 56 | assert len(im_scales) == 1, "Only single-image batch implemented" 57 | boxes = rois[:, 1:5] / im_scales[0] 58 | return scores,boxes 59 | -------------------------------------------------------------------------------- /api/lib/fast_rcnn/train.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import numpy as np 3 | import os 4 | import tensorflow as tf 5 | from lib.roi_data_layer.layer import RoIDataLayer 6 | from lib.utils.timer import Timer 7 | from lib.roi_data_layer import roidb as rdl_roidb 8 | from lib.fast_rcnn.config import cfg 9 | 10 | _DEBUG = False 11 | 12 | class SolverWrapper(object): 13 | def __init__(self, sess, network, imdb, roidb, output_dir, logdir, pretrained_model=None): 14 | """Initialize the SolverWrapper.""" 15 | self.net = network 16 | self.imdb = imdb 17 | self.roidb = roidb 18 | self.output_dir = output_dir 19 | self.pretrained_model = pretrained_model 20 | 21 | print('Computing bounding-box regression targets...') 22 | if cfg.TRAIN.BBOX_REG: 23 | self.bbox_means, self.bbox_stds = rdl_roidb.add_bbox_regression_targets(roidb) 24 | print('done') 25 | 26 | # For checkpoint 27 | self.saver = tf.train.Saver(max_to_keep=100,write_version=tf.train.SaverDef.V2) 28 | self.writer = tf.summary.FileWriter(logdir=logdir, 29 | graph=tf.get_default_graph(), 30 | flush_secs=5) 31 | 32 | def snapshot(self, sess, iter): 33 | net = self.net 34 | if cfg.TRAIN.BBOX_REG and 'bbox_pred' in net.layers and cfg.TRAIN.BBOX_NORMALIZE_TARGETS: 35 | # save original values 36 | with tf.variable_scope('bbox_pred', reuse=True): 37 | weights = tf.get_variable("weights") 38 | biases = tf.get_variable("biases") 39 | 40 | orig_0 = weights.eval() 41 | orig_1 = biases.eval() 42 | 43 | # scale and shift with bbox reg unnormalization; then save snapshot 44 | weights_shape = weights.get_shape().as_list() 45 | sess.run(weights.assign(orig_0 * np.tile(self.bbox_stds, (weights_shape[0],1)))) 46 | sess.run(biases.assign(orig_1 * self.bbox_stds + self.bbox_means)) 47 | 48 | if not os.path.exists(self.output_dir): 49 | os.makedirs(self.output_dir) 50 | 51 | infix = ('_' + cfg.TRAIN.SNAPSHOT_INFIX 52 | if cfg.TRAIN.SNAPSHOT_INFIX != '' else '') 53 | filename = (cfg.TRAIN.SNAPSHOT_PREFIX + infix + 54 | '_iter_{:d}'.format(iter+1) + '.ckpt') 55 | filename = os.path.join(self.output_dir, filename) 56 | 57 | self.saver.save(sess, 
filename) 58 | print('Wrote snapshot to: {:s}'.format(filename)) 59 | 60 | if cfg.TRAIN.BBOX_REG and 'bbox_pred' in net.layers: 61 | # restore net to original state 62 | sess.run(weights.assign(orig_0)) 63 | sess.run(biases.assign(orig_1)) 64 | 65 | def build_image_summary(self): 66 | # A simple graph for write image summary 67 | 68 | log_image_data = tf.placeholder(tf.uint8, [None, None, 3]) 69 | log_image_name = tf.placeholder(tf.string) 70 | # import tensorflow.python.ops.gen_logging_ops as logging_ops 71 | from tensorflow.python.ops import gen_logging_ops 72 | from tensorflow.python.framework import ops as _ops 73 | log_image = gen_logging_ops._image_summary(log_image_name, tf.expand_dims(log_image_data, 0), max_images=1) 74 | _ops.add_to_collection(_ops.GraphKeys.SUMMARIES, log_image) 75 | # log_image = tf.summary.image(log_image_name, tf.expand_dims(log_image_data, 0), max_outputs=1) 76 | return log_image, log_image_data, log_image_name 77 | 78 | 79 | def train_model(self, sess, max_iters, restore=False): 80 | """Network training loop.""" 81 | data_layer = get_data_layer(self.roidb, self.imdb.num_classes) 82 | total_loss,model_loss, rpn_cross_entropy, rpn_loss_box=self.net.build_loss(ohem=cfg.TRAIN.OHEM) 83 | # scalar summary 84 | tf.summary.scalar('rpn_reg_loss', rpn_loss_box) 85 | tf.summary.scalar('rpn_cls_loss', rpn_cross_entropy) 86 | tf.summary.scalar('model_loss', model_loss) 87 | tf.summary.scalar('total_loss',total_loss) 88 | summary_op = tf.summary.merge_all() 89 | 90 | log_image, log_image_data, log_image_name =\ 91 | self.build_image_summary() 92 | 93 | # optimizer 94 | lr = tf.Variable(cfg.TRAIN.LEARNING_RATE, trainable=False) 95 | if cfg.TRAIN.SOLVER == 'Adam': 96 | opt = tf.train.AdamOptimizer(cfg.TRAIN.LEARNING_RATE) 97 | elif cfg.TRAIN.SOLVER == 'RMS': 98 | opt = tf.train.RMSPropOptimizer(cfg.TRAIN.LEARNING_RATE) 99 | else: 100 | # lr = tf.Variable(0.0, trainable=False) 101 | momentum = cfg.TRAIN.MOMENTUM 102 | opt = tf.train.MomentumOptimizer(lr, momentum) 103 | 104 | global_step = tf.Variable(0, trainable=False) 105 | with_clip = True 106 | if with_clip: 107 | tvars = tf.trainable_variables() 108 | grads, norm = tf.clip_by_global_norm(tf.gradients(total_loss, tvars), 10.0) 109 | train_op = opt.apply_gradients(list(zip(grads, tvars)), global_step=global_step) 110 | else: 111 | train_op = opt.minimize(total_loss, global_step=global_step) 112 | 113 | # intialize variables 114 | sess.run(tf.global_variables_initializer()) 115 | restore_iter = 0 116 | 117 | # load vgg16 118 | if self.pretrained_model is not None and not restore: 119 | try: 120 | print(('Loading pretrained model ' 121 | 'weights from {:s}').format(self.pretrained_model)) 122 | self.net.load(self.pretrained_model, sess, True) 123 | except: 124 | raise Exception('Check your pretrained model {:s}'.format(self.pretrained_model)) 125 | 126 | # resuming a trainer 127 | if restore: 128 | try: 129 | ckpt = tf.train.get_checkpoint_state(self.output_dir) 130 | print('Restoring from {}...'.format(ckpt.model_checkpoint_path), end=' ') 131 | self.saver.restore(sess, ckpt.model_checkpoint_path) 132 | stem = os.path.splitext(os.path.basename(ckpt.model_checkpoint_path))[0] 133 | restore_iter = int(stem.split('_')[-1]) 134 | sess.run(global_step.assign(restore_iter)) 135 | print('done') 136 | except: 137 | raise 'Check your pretrained {:s}'.format(ckpt.model_checkpoint_path) 138 | 139 | last_snapshot_iter = -1 140 | timer = Timer() 141 | for iter in range(restore_iter, max_iters): 142 | timer.tic() 143 | # learning rate 
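            # step decay: every cfg.TRAIN.STEPSIZE iterations (50000 by default)
            # the learning rate is multiplied by cfg.TRAIN.GAMMA (0.1 by default)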
144 | if iter != 0 and iter % cfg.TRAIN.STEPSIZE == 0: 145 | sess.run(tf.assign(lr, lr.eval() * cfg.TRAIN.GAMMA)) 146 | print(lr) 147 | 148 | # get one batch 149 | blobs = data_layer.forward() 150 | 151 | feed_dict={ 152 | self.net.data: blobs['data'], 153 | self.net.im_info: blobs['im_info'], 154 | self.net.keep_prob: 0.5, 155 | self.net.gt_boxes: blobs['gt_boxes'], 156 | self.net.gt_ishard: blobs['gt_ishard'], 157 | self.net.dontcare_areas: blobs['dontcare_areas'] 158 | } 159 | res_fetches=[] 160 | fetch_list = [total_loss,model_loss, rpn_cross_entropy, rpn_loss_box, 161 | summary_op, 162 | train_op] + res_fetches 163 | 164 | total_loss_val,model_loss_val, rpn_loss_cls_val, rpn_loss_box_val, \ 165 | summary_str, _ = sess.run(fetches=fetch_list, feed_dict=feed_dict) 166 | 167 | self.writer.add_summary(summary=summary_str, global_step=global_step.eval()) 168 | 169 | _diff_time = timer.toc(average=False) 170 | 171 | 172 | if (iter) % (cfg.TRAIN.DISPLAY) == 0: 173 | print('iter: %d / %d, total loss: %.4f, model loss: %.4f, rpn_loss_cls: %.4f, rpn_loss_box: %.4f, lr: %f'%\ 174 | (iter, max_iters, total_loss_val,model_loss_val,rpn_loss_cls_val,rpn_loss_box_val,lr.eval())) 175 | print('speed: {:.3f}s / iter'.format(_diff_time)) 176 | 177 | if (iter+1) % cfg.TRAIN.SNAPSHOT_ITERS == 0: 178 | last_snapshot_iter = iter 179 | self.snapshot(sess, iter) 180 | 181 | if last_snapshot_iter != iter: 182 | self.snapshot(sess, iter) 183 | 184 | def get_training_roidb(imdb): 185 | """Returns a roidb (Region of Interest database) for use in training.""" 186 | if cfg.TRAIN.USE_FLIPPED: 187 | print('Appending horizontally-flipped training examples...') 188 | imdb.append_flipped_images() 189 | print('done') 190 | 191 | print('Preparing training data...') 192 | if cfg.TRAIN.HAS_RPN: 193 | rdl_roidb.prepare_roidb(imdb) 194 | else: 195 | rdl_roidb.prepare_roidb(imdb) 196 | print('done') 197 | 198 | return imdb.roidb 199 | 200 | 201 | def get_data_layer(roidb, num_classes): 202 | """return a data layer.""" 203 | if cfg.TRAIN.HAS_RPN: 204 | if cfg.IS_MULTISCALE: 205 | # obsolete 206 | # layer = GtDataLayer(roidb) 207 | raise "Calling caffe modules..." 
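            # NOTE: raising a plain string is a TypeError in Python 3; this obsolete
            # multiscale branch is effectively unreachable with the default
            # cfg.IS_MULTISCALE = False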
208 | else: 209 | layer = RoIDataLayer(roidb, num_classes) 210 | else: 211 | layer = RoIDataLayer(roidb, num_classes) 212 | 213 | return layer 214 | 215 | 216 | 217 | def train_net(network, imdb, roidb, output_dir, log_dir, pretrained_model=None, max_iters=40000, restore=False): 218 | """Train a Fast R-CNN network.""" 219 | 220 | config = tf.ConfigProto(allow_soft_placement=True) 221 | config.gpu_options.allocator_type = 'BFC' 222 | config.gpu_options.per_process_gpu_memory_fraction = 0.75 223 | with tf.Session(config=config) as sess: 224 | sw = SolverWrapper(sess, network, imdb, roidb, output_dir, logdir= log_dir, pretrained_model=pretrained_model) 225 | print('Solving...') 226 | sw.train_model(sess, max_iters, restore=restore) 227 | print('done solving') 228 | -------------------------------------------------------------------------------- /api/lib/networks/VGGnet_test.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from .network import Network 3 | from lib.fast_rcnn.config import cfg 4 | 5 | 6 | class VGGnet_test(Network): 7 | def __init__(self, trainable=True): 8 | self.inputs = [] 9 | self.data = tf.placeholder(tf.float32, shape=[None, None, None, 3]) 10 | self.im_info = tf.placeholder(tf.float32, shape=[None, 3]) 11 | self.keep_prob = tf.placeholder(tf.float32) 12 | self.layers = dict({'data': self.data, 'im_info': self.im_info}) 13 | self.trainable = trainable 14 | self.setup() 15 | 16 | def setup(self): 17 | anchor_scales = cfg.ANCHOR_SCALES 18 | _feat_stride = [16, ] 19 | 20 | (self.feed('data') 21 | .conv(3, 3, 64, 1, 1, name='conv1_1') 22 | .conv(3, 3, 64, 1, 1, name='conv1_2') 23 | .max_pool(2, 2, 2, 2, padding='VALID', name='pool1') 24 | .conv(3, 3, 128, 1, 1, name='conv2_1') 25 | .conv(3, 3, 128, 1, 1, name='conv2_2') 26 | .max_pool(2, 2, 2, 2, padding='VALID', name='pool2') 27 | .conv(3, 3, 256, 1, 1, name='conv3_1') 28 | .conv(3, 3, 256, 1, 1, name='conv3_2') 29 | .conv(3, 3, 256, 1, 1, name='conv3_3') 30 | .max_pool(2, 2, 2, 2, padding='VALID', name='pool3') 31 | .conv(3, 3, 512, 1, 1, name='conv4_1') 32 | .conv(3, 3, 512, 1, 1, name='conv4_2') 33 | .conv(3, 3, 512, 1, 1, name='conv4_3') 34 | .max_pool(2, 2, 2, 2, padding='VALID', name='pool4') 35 | .conv(3, 3, 512, 1, 1, name='conv5_1') 36 | .conv(3, 3, 512, 1, 1, name='conv5_2') 37 | .conv(3, 3, 512, 1, 1, name='conv5_3')) 38 | 39 | (self.feed('conv5_3').conv(3, 3, 512, 1, 1, name='rpn_conv/3x3')) 40 | 41 | (self.feed('rpn_conv/3x3').Bilstm(512, 128, 512, name='lstm_o')) 42 | (self.feed('lstm_o').lstm_fc(512, len(anchor_scales) * 10 * 4, name='rpn_bbox_pred')) 43 | (self.feed('lstm_o').lstm_fc(512, len(anchor_scales) * 10 * 2, name='rpn_cls_score')) 44 | 45 | # shape is (1, H, W, Ax2) -> (1, H, WxA, 2) 46 | (self.feed('rpn_cls_score') 47 | .spatial_reshape_layer(2, name='rpn_cls_score_reshape') 48 | .spatial_softmax(name='rpn_cls_prob')) 49 | 50 | # shape is (1, H, WxA, 2) -> (1, H, W, Ax2) 51 | (self.feed('rpn_cls_prob') 52 | .spatial_reshape_layer(len(anchor_scales) * 10 * 2, name='rpn_cls_prob_reshape')) 53 | 54 | (self.feed('rpn_cls_prob_reshape', 'rpn_bbox_pred', 'im_info') 55 | .proposal_layer(_feat_stride, anchor_scales, 'TEST', name='rois')) 56 | -------------------------------------------------------------------------------- /api/lib/networks/VGGnet_train.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import tensorflow as tf 3 | from .network import Network 4 | from 
lib.fast_rcnn.config import cfg 5 | 6 | class VGGnet_train(Network): 7 | def __init__(self, trainable=True): 8 | self.inputs = [] 9 | self.data = tf.placeholder(tf.float32, shape=[None, None, None, 3], name='data') 10 | self.im_info = tf.placeholder(tf.float32, shape=[None, 3], name='im_info') 11 | self.gt_boxes = tf.placeholder(tf.float32, shape=[None, 5], name='gt_boxes') 12 | self.gt_ishard = tf.placeholder(tf.int32, shape=[None], name='gt_ishard') 13 | self.dontcare_areas = tf.placeholder(tf.float32, shape=[None, 4], name='dontcare_areas') 14 | self.keep_prob = tf.placeholder(tf.float32) 15 | self.layers = dict({'data':self.data, 'im_info':self.im_info, 'gt_boxes':self.gt_boxes,\ 16 | 'gt_ishard': self.gt_ishard, 'dontcare_areas': self.dontcare_areas}) 17 | self.trainable = trainable 18 | self.setup() 19 | 20 | def setup(self): 21 | 22 | # n_classes = 21 23 | n_classes = cfg.NCLASSES 24 | # anchor_scales = [8, 16, 32] 25 | anchor_scales = cfg.ANCHOR_SCALES 26 | _feat_stride = [16, ] 27 | 28 | (self.feed('data') 29 | .conv(3, 3, 64, 1, 1, name='conv1_1') 30 | .conv(3, 3, 64, 1, 1, name='conv1_2') 31 | .max_pool(2, 2, 2, 2, padding='VALID', name='pool1') 32 | .conv(3, 3, 128, 1, 1, name='conv2_1') 33 | .conv(3, 3, 128, 1, 1, name='conv2_2') 34 | .max_pool(2, 2, 2, 2, padding='VALID', name='pool2') 35 | .conv(3, 3, 256, 1, 1, name='conv3_1') 36 | .conv(3, 3, 256, 1, 1, name='conv3_2') 37 | .conv(3, 3, 256, 1, 1, name='conv3_3') 38 | .max_pool(2, 2, 2, 2, padding='VALID', name='pool3') 39 | .conv(3, 3, 512, 1, 1, name='conv4_1') 40 | .conv(3, 3, 512, 1, 1, name='conv4_2') 41 | .conv(3, 3, 512, 1, 1, name='conv4_3') 42 | .max_pool(2, 2, 2, 2, padding='VALID', name='pool4') 43 | .conv(3, 3, 512, 1, 1, name='conv5_1') 44 | .conv(3, 3, 512, 1, 1, name='conv5_2') 45 | .conv(3, 3, 512, 1, 1, name='conv5_3')) 46 | #========= RPN ============ 47 | (self.feed('conv5_3') 48 | .conv(3,3,512,1,1,name='rpn_conv/3x3')) 49 | 50 | (self.feed('rpn_conv/3x3').Bilstm(512,128,512,name='lstm_o')) 51 | (self.feed('lstm_o').lstm_fc(512,len(anchor_scales) * 10 * 4, name='rpn_bbox_pred')) 52 | (self.feed('lstm_o').lstm_fc(512,len(anchor_scales) * 10 * 2,name='rpn_cls_score')) 53 | 54 | # generating training labels on the fly 55 | # output: rpn_labels(HxWxA, 2) rpn_bbox_targets(HxWxA, 4) rpn_bbox_inside_weights rpn_bbox_outside_weights 56 | # 给每个anchor上标签,并计算真值(也是delta的形式),以及内部权重和外部权重 57 | (self.feed('rpn_cls_score', 'gt_boxes', 'gt_ishard', 'dontcare_areas', 'im_info') 58 | .anchor_target_layer(_feat_stride, anchor_scales, name = 'rpn-data' )) 59 | 60 | # shape is (1, H, W, Ax2) -> (1, H, WxA, 2) 61 | # 给之前得到的score进行softmax,得到0-1之间的得分 62 | (self.feed('rpn_cls_score') 63 | .spatial_reshape_layer(2, name = 'rpn_cls_score_reshape') 64 | .spatial_softmax(name='rpn_cls_prob')) 65 | -------------------------------------------------------------------------------- /api/lib/networks/__init__.py: -------------------------------------------------------------------------------- 1 | from .VGGnet_train import VGGnet_train 2 | from .VGGnet_test import VGGnet_test 3 | from . 
import factory 4 | -------------------------------------------------------------------------------- /api/lib/networks/factory.py: -------------------------------------------------------------------------------- 1 | from .VGGnet_test import VGGnet_test 2 | from .VGGnet_train import VGGnet_train 3 | 4 | def get_network(name): 5 | """Get a network by name.""" 6 | if name.split('_')[0] == 'VGGnet': 7 | if name.split('_')[1] == 'test': 8 | return VGGnet_test() 9 | elif name.split('_')[1] == 'train': 10 | return VGGnet_train() 11 | else: 12 | raise KeyError('Unknown dataset: {}'.format(name)) 13 | else: 14 | raise KeyError('Unknown dataset: {}'.format(name)) 15 | -------------------------------------------------------------------------------- /api/lib/prepare_training_data/ToVoc.py: -------------------------------------------------------------------------------- 1 | from xml.dom.minidom import Document 2 | import cv2 3 | import os 4 | import glob 5 | import shutil 6 | import numpy as np 7 | 8 | def generate_xml(name, lines, img_size, class_sets, doncateothers=True): 9 | doc = Document() 10 | 11 | def append_xml_node_attr(child, parent=None, text=None): 12 | ele = doc.createElement(child) 13 | if not text is None: 14 | text_node = doc.createTextNode(text) 15 | ele.appendChild(text_node) 16 | parent = doc if parent is None else parent 17 | parent.appendChild(ele) 18 | return ele 19 | 20 | img_name = name + '.jpg' 21 | # create header 22 | annotation = append_xml_node_attr('annotation') 23 | append_xml_node_attr('folder', parent=annotation, text='text') 24 | append_xml_node_attr('filename', parent=annotation, text=img_name) 25 | source = append_xml_node_attr('source', parent=annotation) 26 | append_xml_node_attr('database', parent=source, text='coco_text_database') 27 | append_xml_node_attr('annotation', parent=source, text='text') 28 | append_xml_node_attr('image', parent=source, text='text') 29 | append_xml_node_attr('flickrid', parent=source, text='000000') 30 | owner = append_xml_node_attr('owner', parent=annotation) 31 | append_xml_node_attr('name', parent=owner, text='ms') 32 | size = append_xml_node_attr('size', annotation) 33 | append_xml_node_attr('width', size, str(img_size[1])) 34 | append_xml_node_attr('height', size, str(img_size[0])) 35 | append_xml_node_attr('depth', size, str(img_size[2])) 36 | append_xml_node_attr('segmented', parent=annotation, text='0') 37 | 38 | # create objects 39 | objs = [] 40 | for line in lines: 41 | splitted_line = line.strip().lower().split() 42 | cls = splitted_line[0].lower() 43 | if not doncateothers and cls not in class_sets: 44 | continue 45 | cls = 'dontcare' if cls not in class_sets else cls 46 | if cls == 'dontcare': 47 | continue 48 | obj = append_xml_node_attr('object', parent=annotation) 49 | occlusion = int(0) 50 | x1, y1, x2, y2 = int(float(splitted_line[1]) + 1), int(float(splitted_line[2]) + 1), \ 51 | int(float(splitted_line[3]) + 1), int(float(splitted_line[4]) + 1) 52 | truncation = float(0) 53 | difficult = 1 if _is_hard(cls, truncation, occlusion, x1, y1, x2, y2) else 0 54 | truncted = 0 if truncation < 0.5 else 1 55 | 56 | append_xml_node_attr('name', parent=obj, text=cls) 57 | append_xml_node_attr('pose', parent=obj, text='none') 58 | append_xml_node_attr('truncated', parent=obj, text=str(truncted)) 59 | append_xml_node_attr('difficult', parent=obj, text=str(int(difficult))) 60 | bb = append_xml_node_attr('bndbox', parent=obj) 61 | append_xml_node_attr('xmin', parent=bb, text=str(x1)) 62 | append_xml_node_attr('ymin', parent=bb, 
text=str(y1)) 63 | append_xml_node_attr('xmax', parent=bb, text=str(x2)) 64 | append_xml_node_attr('ymax', parent=bb, text=str(y2)) 65 | 66 | o = {'class': cls, 'box': np.asarray([x1, y1, x2, y2], dtype=float), \ 67 | 'truncation': truncation, 'difficult': difficult, 'occlusion': occlusion} 68 | objs.append(o) 69 | 70 | return doc, objs 71 | 72 | 73 | def _is_hard(cls, truncation, occlusion, x1, y1, x2, y2): 74 | hard = False 75 | if y2 - y1 < 25 and occlusion >= 2: 76 | hard = True 77 | return hard 78 | if occlusion >= 3: 79 | hard = True 80 | return hard 81 | if truncation > 0.8: 82 | hard = True 83 | return hard 84 | return hard 85 | 86 | 87 | def build_voc_dirs(outdir): 88 | mkdir = lambda dir: os.makedirs(dir) if not os.path.exists(dir) else None 89 | mkdir(outdir) 90 | mkdir(os.path.join(outdir, 'Annotations')) 91 | mkdir(os.path.join(outdir, 'ImageSets')) 92 | mkdir(os.path.join(outdir, 'ImageSets', 'Layout')) 93 | mkdir(os.path.join(outdir, 'ImageSets', 'Main')) 94 | mkdir(os.path.join(outdir, 'ImageSets', 'Segmentation')) 95 | mkdir(os.path.join(outdir, 'JPEGImages')) 96 | mkdir(os.path.join(outdir, 'SegmentationClass')) 97 | mkdir(os.path.join(outdir, 'SegmentationObject')) 98 | return os.path.join(outdir, 'Annotations'), os.path.join(outdir, 'JPEGImages'), os.path.join(outdir, 'ImageSets', 99 | 'Main') 100 | 101 | 102 | if __name__ == '__main__': 103 | _outdir = 'TEXTVOC/VOC2007' 104 | _draw = bool(0) 105 | _dest_label_dir, _dest_img_dir, _dest_set_dir = build_voc_dirs(_outdir) 106 | _doncateothers = bool(1) 107 | for dset in ['train']: 108 | _labeldir = 'label_tmp' 109 | _imagedir = 're_image' 110 | class_sets = ('text', 'dontcare') 111 | class_sets_dict = dict((k, i) for i, k in enumerate(class_sets)) 112 | allclasses = {} 113 | fs = [open(os.path.join(_dest_set_dir, cls + '_' + dset + '.txt'), 'w') for cls in class_sets] 114 | ftrain = open(os.path.join(_dest_set_dir, dset + '.txt'), 'w') 115 | 116 | files = glob.glob(os.path.join(_labeldir, '*.txt')) 117 | files.sort() 118 | for file in files: 119 | path, basename = os.path.split(file) 120 | stem, ext = os.path.splitext(basename) 121 | with open(file, 'r') as f: 122 | lines = f.readlines() 123 | img_file = os.path.join(_imagedir, stem + '.jpg') 124 | 125 | print(img_file) 126 | img = cv2.imread(img_file) 127 | img_size = img.shape 128 | 129 | doc, objs = generate_xml(stem, lines, img_size, class_sets=class_sets, doncateothers=_doncateothers) 130 | 131 | cv2.imwrite(os.path.join(_dest_img_dir, stem + '.jpg'), img) 132 | xmlfile = os.path.join(_dest_label_dir, stem + '.xml') 133 | with open(xmlfile, 'w') as f: 134 | f.write(doc.toprettyxml(indent=' ')) 135 | 136 | ftrain.writelines(stem + '\n') 137 | 138 | cls_in_image = set([o['class'] for o in objs]) 139 | 140 | for obj in objs: 141 | cls = obj['class'] 142 | allclasses[cls] = 0 \ 143 | if not cls in list(allclasses.keys()) else allclasses[cls] + 1 144 | 145 | for cls in cls_in_image: 146 | if cls in class_sets: 147 | fs[class_sets_dict[cls]].writelines(stem + ' 1\n') 148 | for cls in class_sets: 149 | if cls not in cls_in_image: 150 | fs[class_sets_dict[cls]].writelines(stem + ' -1\n') 151 | 152 | 153 | (f.close() for f in fs) 154 | ftrain.close() 155 | 156 | print('~~~~~~~~~~~~~~~~~~~') 157 | print(allclasses) 158 | print('~~~~~~~~~~~~~~~~~~~') 159 | shutil.copyfile(os.path.join(_dest_set_dir, 'train.txt'), os.path.join(_dest_set_dir, 'val.txt')) 160 | shutil.copyfile(os.path.join(_dest_set_dir, 'train.txt'), os.path.join(_dest_set_dir, 'trainval.txt')) 161 | for cls in 
class_sets: 162 | shutil.copyfile(os.path.join(_dest_set_dir, cls + '_train.txt'), 163 | os.path.join(_dest_set_dir, cls + '_trainval.txt')) 164 | shutil.copyfile(os.path.join(_dest_set_dir, cls + '_train.txt'), 165 | os.path.join(_dest_set_dir, cls + '_val.txt')) 166 | -------------------------------------------------------------------------------- /api/lib/prepare_training_data/split_label.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import math 4 | import cv2 as cv 5 | 6 | path = '/media/D/code/OCR/text-detection-ctpn/data/mlt_english+chinese/image' 7 | gt_path = '/media/D/code/OCR/text-detection-ctpn/data/mlt_english+chinese/label' 8 | out_path = 're_image' 9 | if not os.path.exists(out_path): 10 | os.makedirs(out_path) 11 | files = os.listdir(path) 12 | files.sort() 13 | #files=files[:100] 14 | for file in files: 15 | _, basename = os.path.split(file) 16 | if basename.lower().split('.')[-1] not in ['jpg', 'png']: 17 | continue 18 | stem, ext = os.path.splitext(basename) 19 | gt_file = os.path.join(gt_path, 'gt_' + stem + '.txt') 20 | img_path = os.path.join(path, file) 21 | print(img_path) 22 | img = cv.imread(img_path) 23 | img_size = img.shape 24 | im_size_min = np.min(img_size[0:2]) 25 | im_size_max = np.max(img_size[0:2]) 26 | 27 | im_scale = float(600) / float(im_size_min) 28 | if np.round(im_scale * im_size_max) > 1200: 29 | im_scale = float(1200) / float(im_size_max) 30 | re_im = cv.resize(img, None, None, fx=im_scale, fy=im_scale, interpolation=cv.INTER_LINEAR) 31 | re_size = re_im.shape 32 | cv.imwrite(os.path.join(out_path, stem) + '.jpg', re_im) 33 | 34 | with open(gt_file, 'r') as f: 35 | lines = f.readlines() 36 | for line in lines: 37 | splitted_line = line.strip().lower().split(',') 38 | pt_x = np.zeros((4, 1)) 39 | pt_y = np.zeros((4, 1)) 40 | pt_x[0, 0] = int(float(splitted_line[0]) / img_size[1] * re_size[1]) 41 | pt_y[0, 0] = int(float(splitted_line[1]) / img_size[0] * re_size[0]) 42 | pt_x[1, 0] = int(float(splitted_line[2]) / img_size[1] * re_size[1]) 43 | pt_y[1, 0] = int(float(splitted_line[3]) / img_size[0] * re_size[0]) 44 | pt_x[2, 0] = int(float(splitted_line[4]) / img_size[1] * re_size[1]) 45 | pt_y[2, 0] = int(float(splitted_line[5]) / img_size[0] * re_size[0]) 46 | pt_x[3, 0] = int(float(splitted_line[6]) / img_size[1] * re_size[1]) 47 | pt_y[3, 0] = int(float(splitted_line[7]) / img_size[0] * re_size[0]) 48 | 49 | ind_x = np.argsort(pt_x, axis=0) 50 | pt_x = pt_x[ind_x] 51 | pt_y = pt_y[ind_x] 52 | 53 | if pt_y[0] < pt_y[1]: 54 | pt1 = (pt_x[0], pt_y[0]) 55 | pt3 = (pt_x[1], pt_y[1]) 56 | else: 57 | pt1 = (pt_x[1], pt_y[1]) 58 | pt3 = (pt_x[0], pt_y[0]) 59 | 60 | if pt_y[2] < pt_y[3]: 61 | pt2 = (pt_x[2], pt_y[2]) 62 | pt4 = (pt_x[3], pt_y[3]) 63 | else: 64 | pt2 = (pt_x[3], pt_y[3]) 65 | pt4 = (pt_x[2], pt_y[2]) 66 | 67 | xmin = int(min(pt1[0], pt2[0])) 68 | ymin = int(min(pt1[1], pt2[1])) 69 | xmax = int(max(pt2[0], pt4[0])) 70 | ymax = int(max(pt3[1], pt4[1])) 71 | 72 | if xmin < 0: 73 | xmin = 0 74 | if xmax > re_size[1] - 1: 75 | xmax = re_size[1] - 1 76 | if ymin < 0: 77 | ymin = 0 78 | if ymax > re_size[0] - 1: 79 | ymax = re_size[0] - 1 80 | 81 | width = xmax - xmin 82 | height = ymax - ymin 83 | 84 | # reimplement 85 | step = 16.0 86 | x_left = [] 87 | x_right = [] 88 | x_left.append(xmin) 89 | x_left_start = int(math.ceil(xmin / 16.0) * 16.0) 90 | if x_left_start == xmin: 91 | x_left_start = xmin + 16 92 | for i in np.arange(x_left_start, xmax, 16): 93 | 
x_left.append(i) 94 | x_left = np.array(x_left) 95 | 96 | x_right.append(x_left_start - 1) 97 | for i in range(1, len(x_left) - 1): 98 | x_right.append(x_left[i] + 15) 99 | x_right.append(xmax) 100 | x_right = np.array(x_right) 101 | 102 | idx = np.where(x_left == x_right) 103 | x_left = np.delete(x_left, idx, axis=0) 104 | x_right = np.delete(x_right, idx, axis=0) 105 | 106 | if not os.path.exists('label_tmp'): 107 | os.makedirs('label_tmp') 108 | with open(os.path.join('label_tmp', stem) + '.txt', 'a') as f: 109 | for i in range(len(x_left)): 110 | f.writelines("text\t") 111 | f.writelines(str(int(x_left[i]))) 112 | f.writelines("\t") 113 | f.writelines(str(int(ymin))) 114 | f.writelines("\t") 115 | f.writelines(str(int(x_right[i]))) 116 | f.writelines("\t") 117 | f.writelines(str(int(ymax))) 118 | f.writelines("\n") 119 | -------------------------------------------------------------------------------- /api/lib/roi_data_layer/__init__.py: -------------------------------------------------------------------------------- 1 | from . import roidb -------------------------------------------------------------------------------- /api/lib/roi_data_layer/layer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from lib.fast_rcnn.config import cfg 3 | from lib.roi_data_layer.minibatch import get_minibatch 4 | 5 | class RoIDataLayer(object): 6 | """Fast R-CNN data layer used for training.""" 7 | 8 | def __init__(self, roidb, num_classes): 9 | """Set the roidb to be used by this layer during training.""" 10 | self._roidb = roidb 11 | self._num_classes = num_classes 12 | self._shuffle_roidb_inds() 13 | 14 | def _shuffle_roidb_inds(self): 15 | """Randomly permute the training roidb.""" 16 | self._perm = np.random.permutation(np.arange(len(self._roidb))) 17 | self._cur = 0 18 | 19 | def _get_next_minibatch_inds(self): 20 | """Return the roidb indices for the next minibatch.""" 21 | 22 | if cfg.TRAIN.HAS_RPN: 23 | if self._cur + cfg.TRAIN.IMS_PER_BATCH >= len(self._roidb): 24 | self._shuffle_roidb_inds() 25 | 26 | db_inds = self._perm[self._cur:self._cur + cfg.TRAIN.IMS_PER_BATCH] 27 | self._cur += cfg.TRAIN.IMS_PER_BATCH 28 | else: 29 | # sample images 30 | db_inds = np.zeros((cfg.TRAIN.IMS_PER_BATCH), dtype=np.int32) 31 | i = 0 32 | while (i < cfg.TRAIN.IMS_PER_BATCH): 33 | ind = self._perm[self._cur] 34 | num_objs = self._roidb[ind]['boxes'].shape[0] 35 | if num_objs != 0: 36 | db_inds[i] = ind 37 | i += 1 38 | 39 | self._cur += 1 40 | if self._cur >= len(self._roidb): 41 | self._shuffle_roidb_inds() 42 | 43 | return db_inds 44 | 45 | def _get_next_minibatch(self): 46 | """Return the blobs to be used for the next minibatch. 47 | 48 | If cfg.TRAIN.USE_PREFETCH is True, then blobs will be computed in a 49 | separate process and made available through self._blob_queue. 
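        (Prefetching is disabled by default since cfg.TRAIN.USE_PREFETCH is False;
        only the synchronous path below is implemented.)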
50 | """ 51 | db_inds = self._get_next_minibatch_inds() 52 | minibatch_db = [self._roidb[i] for i in db_inds] 53 | return get_minibatch(minibatch_db, self._num_classes) 54 | 55 | def forward(self): 56 | """Get blobs and copy them into this layer's top blob vector.""" 57 | blobs = self._get_next_minibatch() 58 | return blobs 59 | -------------------------------------------------------------------------------- /api/lib/roi_data_layer/minibatch.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import numpy.random as npr 3 | import cv2 4 | import os 5 | from lib.fast_rcnn.config import cfg 6 | from lib.utils.blob import prep_im_for_blob, im_list_to_blob 7 | 8 | def get_minibatch(roidb, num_classes): 9 | """Given a roidb, construct a minibatch sampled from it.""" 10 | num_images = len(roidb) 11 | # Sample random scales to use for each image in this batch 12 | random_scale_inds = npr.randint(0, high=len(cfg.TRAIN.SCALES), 13 | size=num_images) 14 | assert(cfg.TRAIN.BATCH_SIZE % num_images == 0), \ 15 | 'num_images ({}) must divide BATCH_SIZE ({})'. \ 16 | format(num_images, cfg.TRAIN.BATCH_SIZE) 17 | rois_per_image = cfg.TRAIN.BATCH_SIZE / num_images 18 | fg_rois_per_image = np.round(cfg.TRAIN.FG_FRACTION * rois_per_image) 19 | 20 | # Get the input image blob, formatted for caffe 21 | im_blob, im_scales = _get_image_blob(roidb, random_scale_inds) 22 | 23 | blobs = {'data': im_blob} 24 | 25 | if cfg.TRAIN.HAS_RPN: 26 | assert len(im_scales) == 1, "Single batch only" 27 | assert len(roidb) == 1, "Single batch only" 28 | # gt boxes: (x1, y1, x2, y2, cls) 29 | gt_inds = np.where(roidb[0]['gt_classes'] != 0)[0] 30 | gt_boxes = np.empty((len(gt_inds), 5), dtype=np.float32) 31 | gt_boxes[:, 0:4] = roidb[0]['boxes'][gt_inds, :] * im_scales[0] 32 | gt_boxes[:, 4] = roidb[0]['gt_classes'][gt_inds] 33 | blobs['gt_boxes'] = gt_boxes 34 | blobs['gt_ishard'] = roidb[0]['gt_ishard'][gt_inds] \ 35 | if 'gt_ishard' in roidb[0] else np.zeros(gt_inds.size, dtype=int) 36 | # blobs['gt_ishard'] = roidb[0]['gt_ishard'][gt_inds] 37 | blobs['dontcare_areas'] = roidb[0]['dontcare_areas'] * im_scales[0] \ 38 | if 'dontcare_areas' in roidb[0] else np.zeros([0, 4], dtype=float) 39 | blobs['im_info'] = np.array( 40 | [[im_blob.shape[1], im_blob.shape[2], im_scales[0]]], 41 | dtype=np.float32) 42 | blobs['im_name'] = os.path.basename(roidb[0]['image']) 43 | 44 | else: # not using RPN 45 | # Now, build the region of interest and label blobs 46 | rois_blob = np.zeros((0, 5), dtype=np.float32) 47 | labels_blob = np.zeros((0), dtype=np.float32) 48 | bbox_targets_blob = np.zeros((0, 4 * num_classes), dtype=np.float32) 49 | bbox_inside_blob = np.zeros(bbox_targets_blob.shape, dtype=np.float32) 50 | # all_overlaps = [] 51 | for im_i in range(num_images): 52 | labels, overlaps, im_rois, bbox_targets, bbox_inside_weights \ 53 | = _sample_rois(roidb[im_i], fg_rois_per_image, rois_per_image, 54 | num_classes) 55 | 56 | # Add to RoIs blob 57 | rois = _project_im_rois(im_rois, im_scales[im_i]) 58 | batch_ind = im_i * np.ones((rois.shape[0], 1)) 59 | rois_blob_this_image = np.hstack((batch_ind, rois)) 60 | rois_blob = np.vstack((rois_blob, rois_blob_this_image)) 61 | 62 | # Add to labels, bbox targets, and bbox loss blobs 63 | labels_blob = np.hstack((labels_blob, labels)) 64 | bbox_targets_blob = np.vstack((bbox_targets_blob, bbox_targets)) 65 | bbox_inside_blob = np.vstack((bbox_inside_blob, bbox_inside_weights)) 66 | # all_overlaps = np.hstack((all_overlaps, overlaps)) 67 | 
68 | # For debug visualizations 69 | # _vis_minibatch(im_blob, rois_blob, labels_blob, all_overlaps) 70 | 71 | blobs['rois'] = rois_blob 72 | blobs['labels'] = labels_blob 73 | 74 | if cfg.TRAIN.BBOX_REG: 75 | blobs['bbox_targets'] = bbox_targets_blob 76 | blobs['bbox_inside_weights'] = bbox_inside_blob 77 | blobs['bbox_outside_weights'] = \ 78 | np.array(bbox_inside_blob > 0).astype(np.float32) 79 | 80 | return blobs 81 | 82 | def _sample_rois(roidb, fg_rois_per_image, rois_per_image, num_classes): 83 | """Generate a random sample of RoIs comprising foreground and background 84 | examples. 85 | """ 86 | # label = class RoI has max overlap with 87 | labels = roidb['max_classes'] 88 | overlaps = roidb['max_overlaps'] 89 | rois = roidb['boxes'] 90 | 91 | # Select foreground RoIs as those with >= FG_THRESH overlap 92 | fg_inds = np.where(overlaps >= cfg.TRAIN.FG_THRESH)[0] 93 | # Guard against the case when an image has fewer than fg_rois_per_image 94 | # foreground RoIs 95 | fg_rois_per_this_image = np.minimum(fg_rois_per_image, fg_inds.size) 96 | # Sample foreground regions without replacement 97 | if fg_inds.size > 0: 98 | fg_inds = npr.choice( 99 | fg_inds, size=fg_rois_per_this_image, replace=False) 100 | 101 | # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI) 102 | bg_inds = np.where((overlaps < cfg.TRAIN.BG_THRESH_HI) & 103 | (overlaps >= cfg.TRAIN.BG_THRESH_LO))[0] 104 | # Compute number of background RoIs to take from this image (guarding 105 | # against there being fewer than desired) 106 | bg_rois_per_this_image = rois_per_image - fg_rois_per_this_image 107 | bg_rois_per_this_image = np.minimum(bg_rois_per_this_image, 108 | bg_inds.size) 109 | # Sample foreground regions without replacement 110 | if bg_inds.size > 0: 111 | bg_inds = npr.choice( 112 | bg_inds, size=bg_rois_per_this_image, replace=False) 113 | 114 | # The indices that we're selecting (both fg and bg) 115 | keep_inds = np.append(fg_inds, bg_inds) 116 | # Select sampled values from various arrays: 117 | labels = labels[keep_inds] 118 | # Clamp labels for the background RoIs to 0 119 | labels[fg_rois_per_this_image:] = 0 120 | overlaps = overlaps[keep_inds] 121 | rois = rois[keep_inds] 122 | 123 | bbox_targets, bbox_inside_weights = _get_bbox_regression_labels( 124 | roidb['bbox_targets'][keep_inds, :], num_classes) 125 | 126 | return labels, overlaps, rois, bbox_targets, bbox_inside_weights 127 | 128 | def _get_image_blob(roidb, scale_inds): 129 | """Builds an input blob from the images in the roidb at the specified 130 | scales. 131 | """ 132 | num_images = len(roidb) 133 | processed_ims = [] 134 | im_scales = [] 135 | for i in range(num_images): 136 | im = cv2.imread(roidb[i]['image']) 137 | if roidb[i]['flipped']: 138 | im = im[:, ::-1, :] 139 | target_size = cfg.TRAIN.SCALES[scale_inds[i]] 140 | im, im_scale = prep_im_for_blob(im, cfg.PIXEL_MEANS, target_size, 141 | cfg.TRAIN.MAX_SIZE) 142 | im_scales.append(im_scale) 143 | processed_ims.append(im) 144 | 145 | # Create a blob to hold the input images 146 | blob = im_list_to_blob(processed_ims) 147 | 148 | return blob, im_scales 149 | 150 | def _project_im_rois(im_rois, im_scale_factor): 151 | """Project image RoIs into the rescaled training image.""" 152 | rois = im_rois * im_scale_factor 153 | return rois 154 | 155 | def _get_bbox_regression_labels(bbox_target_data, num_classes): 156 | """Bounding-box regression targets are stored in a compact form in the 157 | roidb. 
158 | 159 | This function expands those targets into the 4-of-4*K representation used 160 | by the network (i.e. only one class has non-zero targets). The loss weights 161 | are similarly expanded. 162 | 163 | Returns: 164 | bbox_target_data (ndarray): N x 4K blob of regression targets 165 | bbox_inside_weights (ndarray): N x 4K blob of loss weights 166 | """ 167 | clss = bbox_target_data[:, 0] 168 | bbox_targets = np.zeros((clss.size, 4 * num_classes), dtype=np.float32) 169 | bbox_inside_weights = np.zeros(bbox_targets.shape, dtype=np.float32) 170 | inds = np.where(clss > 0)[0] 171 | for ind in inds: 172 | cls = clss[ind] 173 | start = 4 * cls 174 | end = start + 4 175 | bbox_targets[ind, start:end] = bbox_target_data[ind, 1:] 176 | bbox_inside_weights[ind, start:end] = cfg.TRAIN.BBOX_INSIDE_WEIGHTS 177 | return bbox_targets, bbox_inside_weights 178 | 179 | def _vis_minibatch(im_blob, rois_blob, labels_blob, overlaps): 180 | """Visualize a mini-batch for debugging.""" 181 | import matplotlib.pyplot as plt 182 | for i in range(rois_blob.shape[0]): 183 | rois = rois_blob[i, :] 184 | im_ind = rois[0] 185 | roi = rois[1:] 186 | im = im_blob[im_ind, :, :, :].transpose((1, 2, 0)).copy() 187 | im += cfg.PIXEL_MEANS 188 | im = im[:, :, (2, 1, 0)] 189 | im = im.astype(np.uint8) 190 | cls = labels_blob[i] 191 | plt.imshow(im) 192 | print('class: ', cls, ' overlap: ', overlaps[i]) 193 | plt.gca().add_patch( 194 | plt.Rectangle((roi[0], roi[1]), roi[2] - roi[0], 195 | roi[3] - roi[1], fill=False, 196 | edgecolor='r', linewidth=3) 197 | ) 198 | plt.show() 199 | -------------------------------------------------------------------------------- /api/lib/roi_data_layer/roidb.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import PIL 3 | from lib.fast_rcnn.config import cfg 4 | from lib.fast_rcnn.bbox_transform import bbox_transform 5 | from lib.utils.bbox import bbox_overlaps 6 | 7 | def prepare_roidb(imdb): 8 | """Enrich the imdb's roidb by adding some derived quantities that 9 | are useful for training. This function precomputes the maximum 10 | overlap, taken over ground-truth boxes, between each ROI and 11 | each ground-truth box. The class with maximum overlap is also 12 | recorded. 13 | """ 14 | sizes = [PIL.Image.open(imdb.image_path_at(i)).size 15 | for i in range(imdb.num_images)] 16 | roidb = imdb.roidb 17 | for i in range(len(imdb.image_index)): 18 | roidb[i]['image'] = imdb.image_path_at(i) 19 | roidb[i]['width'] = sizes[i][0] 20 | roidb[i]['height'] = sizes[i][1] 21 | # need gt_overlaps as a dense array for argmax 22 | gt_overlaps = roidb[i]['gt_overlaps'].toarray() 23 | # max overlap with gt over classes (columns) 24 | max_overlaps = gt_overlaps.max(axis=1) 25 | # gt class that had the max overlap 26 | max_classes = gt_overlaps.argmax(axis=1) 27 | roidb[i]['max_classes'] = max_classes 28 | roidb[i]['max_overlaps'] = max_overlaps 29 | # sanity checks 30 | # max overlap of 0 => class should be zero (background) 31 | zero_inds = np.where(max_overlaps == 0)[0] 32 | assert all(max_classes[zero_inds] == 0) 33 | # max overlap > 0 => class should not be zero (must be a fg class) 34 | nonzero_inds = np.where(max_overlaps > 0)[0] 35 | assert all(max_classes[nonzero_inds] != 0) 36 | 37 | def add_bbox_regression_targets(roidb): 38 | """ 39 | Add information needed to train bounding-box regressors. 40 | For each roi find the corresponding gt box, and compute the distance. 
41 | then normalize the distance into Gaussian by minus mean and divided by std 42 | """ 43 | assert len(roidb) > 0 44 | assert 'max_classes' in roidb[0], 'Did you call prepare_roidb first?' 45 | 46 | num_images = len(roidb) 47 | # Infer number of classes from the number of columns in gt_overlaps 48 | num_classes = roidb[0]['gt_overlaps'].shape[1] 49 | for im_i in range(num_images): 50 | rois = roidb[im_i]['boxes'] 51 | max_overlaps = roidb[im_i]['max_overlaps'] 52 | max_classes = roidb[im_i]['max_classes'] 53 | roidb[im_i]['bbox_targets'] = \ 54 | _compute_targets(rois, max_overlaps, max_classes) 55 | 56 | if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED: 57 | # Use fixed / precomputed "means" and "stds" instead of empirical values 58 | means = np.tile( 59 | np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS), (num_classes, 1)) 60 | stds = np.tile( 61 | np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS), (num_classes, 1)) 62 | else: 63 | # Compute values needed for means and stds 64 | # var(x) = E(x^2) - E(x)^2 65 | class_counts = np.zeros((num_classes, 1)) + cfg.EPS 66 | sums = np.zeros((num_classes, 4)) 67 | squared_sums = np.zeros((num_classes, 4)) 68 | for im_i in range(num_images): 69 | targets = roidb[im_i]['bbox_targets'] 70 | for cls in range(1, num_classes): 71 | cls_inds = np.where(targets[:, 0] == cls)[0] 72 | if cls_inds.size > 0: 73 | class_counts[cls] += cls_inds.size 74 | sums[cls, :] += targets[cls_inds, 1:].sum(axis=0) 75 | squared_sums[cls, :] += \ 76 | (targets[cls_inds, 1:] ** 2).sum(axis=0) 77 | 78 | means = sums / class_counts 79 | stds = np.sqrt(squared_sums / class_counts - means ** 2) 80 | # too small number will cause nan error 81 | assert np.min(stds) < 0.01, \ 82 | 'Boxes std is too small, std:{}'.format(stds) 83 | 84 | print('bbox target means:') 85 | print(means) 86 | print(means[1:, :].mean(axis=0)) # ignore bg class 87 | print('bbox target stdevs:') 88 | print(stds) 89 | print(stds[1:, :].mean(axis=0)) # ignore bg class 90 | 91 | # Normalize targets 92 | if cfg.TRAIN.BBOX_NORMALIZE_TARGETS: 93 | print("Normalizing targets") 94 | for im_i in range(num_images): 95 | targets = roidb[im_i]['bbox_targets'] 96 | for cls in range(1, num_classes): 97 | cls_inds = np.where(targets[:, 0] == cls)[0] 98 | roidb[im_i]['bbox_targets'][cls_inds, 1:] -= means[cls, :] 99 | roidb[im_i]['bbox_targets'][cls_inds, 1:] /= stds[cls, :] 100 | else: 101 | print("NOT normalizing targets") 102 | 103 | # These values will be needed for making predictions 104 | # (the predicts will need to be unnormalized and uncentered) 105 | return means.ravel(), stds.ravel() 106 | 107 | def _compute_targets(rois, overlaps, labels): 108 | """ 109 | Compute bounding-box regression targets for an image. 110 | for each roi find the corresponding gt_box, then compute the distance. 
111 | """ 112 | # Indices of ground-truth ROIs 113 | gt_inds = np.where(overlaps == 1)[0] 114 | if len(gt_inds) == 0: 115 | # Bail if the image has no ground-truth ROIs 116 | return np.zeros((rois.shape[0], 5), dtype=np.float32) 117 | # Indices of examples for which we try to make predictions 118 | ex_inds = np.where(overlaps >= cfg.TRAIN.BBOX_THRESH)[0] 119 | 120 | # Get IoU overlap between each ex ROI and gt ROI 121 | ex_gt_overlaps = bbox_overlaps( 122 | np.ascontiguousarray(rois[ex_inds, :], dtype=np.float), 123 | np.ascontiguousarray(rois[gt_inds, :], dtype=np.float)) 124 | 125 | # Find which gt ROI each ex ROI has max overlap with: 126 | # this will be the ex ROI's gt target 127 | gt_assignment = ex_gt_overlaps.argmax(axis=1) 128 | gt_rois = rois[gt_inds[gt_assignment], :] 129 | ex_rois = rois[ex_inds, :] 130 | 131 | targets = np.zeros((rois.shape[0], 5), dtype=np.float32) 132 | targets[ex_inds, 0] = labels[ex_inds] 133 | targets[ex_inds, 1:] = bbox_transform(ex_rois, gt_rois) 134 | return targets 135 | -------------------------------------------------------------------------------- /api/lib/rpn_msr/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Praneet9/Docify/a936014750dedf4a6b5a84918bbbf66cd63109de/api/lib/rpn_msr/__init__.py -------------------------------------------------------------------------------- /api/lib/rpn_msr/generate_anchors.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def generate_basic_anchors(sizes, base_size=16): 4 | base_anchor = np.array([0, 0, base_size - 1, base_size - 1], np.int32) 5 | anchors = np.zeros((len(sizes), 4), np.int32) 6 | index = 0 7 | for h, w in sizes: 8 | anchors[index] = scale_anchor(base_anchor, h, w) 9 | index += 1 10 | return anchors 11 | 12 | 13 | def scale_anchor(anchor, h, w): 14 | x_ctr = (anchor[0] + anchor[2]) * 0.5 15 | y_ctr = (anchor[1] + anchor[3]) * 0.5 16 | scaled_anchor = anchor.copy() 17 | scaled_anchor[0] = x_ctr - w / 2 # xmin 18 | scaled_anchor[2] = x_ctr + w / 2 # xmax 19 | scaled_anchor[1] = y_ctr - h / 2 # ymin 20 | scaled_anchor[3] = y_ctr + h / 2 # ymax 21 | return scaled_anchor 22 | 23 | 24 | def generate_anchors(base_size=16, ratios=[0.5, 1, 2], 25 | scales=2**np.arange(3, 6)): 26 | heights = [11, 16, 23, 33, 48, 68, 97, 139, 198, 283] 27 | widths = [16] 28 | sizes = [] 29 | for h in heights: 30 | for w in widths: 31 | sizes.append((h, w)) 32 | return generate_basic_anchors(sizes) 33 | 34 | if __name__ == '__main__': 35 | import time 36 | t = time.time() 37 | a = generate_anchors() 38 | print(time.time() - t) 39 | print(a) 40 | from IPython import embed; embed() 41 | -------------------------------------------------------------------------------- /api/lib/rpn_msr/proposal_layer_tf.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import numpy as np 3 | from .generate_anchors import generate_anchors 4 | from lib.fast_rcnn.config import cfg 5 | from lib.fast_rcnn.bbox_transform import bbox_transform_inv, clip_boxes 6 | from lib.fast_rcnn.nms_wrapper import nms 7 | 8 | 9 | DEBUG = False 10 | """ 11 | Outputs object detection proposals by applying estimated bounding-box 12 | transformations to a set of regular boxes (called "anchors"). 
13 | """ 14 | def proposal_layer(rpn_cls_prob_reshape, rpn_bbox_pred, im_info, cfg_key, _feat_stride = [16,], anchor_scales = [16,]): 15 | """ 16 | Parameters 17 | ---------- 18 | rpn_cls_prob_reshape: (1 , H , W , Ax2) outputs of RPN, prob of bg or fg 19 | NOTICE: the old version is ordered by (1, H, W, 2, A) !!!! 20 | rpn_bbox_pred: (1 , H , W , Ax4), rgs boxes output of RPN 21 | im_info: a list of [image_height, image_width, scale_ratios] 22 | cfg_key: 'TRAIN' or 'TEST' 23 | _feat_stride: the downsampling ratio of feature map to the original input image 24 | anchor_scales: the scales to the basic_anchor (basic anchor is [16, 16]) 25 | ---------- 26 | Returns 27 | ---------- 28 | rpn_rois : (1 x H x W x A, 5) e.g. [0, x1, y1, x2, y2] 29 | 30 | # Algorithm: 31 | # 32 | # for each (H, W) location i 33 | # generate A anchor boxes centered on cell i 34 | # apply predicted bbox deltas at cell i to each of the A anchors 35 | # clip predicted boxes to image 36 | # remove predicted boxes with either height or width < threshold 37 | # sort all (proposal, score) pairs by score from highest to lowest 38 | # take top pre_nms_topN proposals before NMS 39 | # apply NMS with threshold 0.7 to remaining proposals 40 | # take after_nms_topN proposals after NMS 41 | # return the top proposals (-> RoIs top, scores top) 42 | #layer_params = yaml.load(self.param_str_) 43 | 44 | """ 45 | # cfg_key=cfg_key.decode('ascii') 46 | _anchors = generate_anchors(scales=np.array(anchor_scales))#生成基本的9个anchor 47 | _num_anchors = _anchors.shape[0]#9个anchor 48 | 49 | im_info = im_info[0]#原始图像的高宽、缩放尺度 50 | 51 | assert rpn_cls_prob_reshape.shape[0] == 1, \ 52 | 'Only single item batches are supported' 53 | 54 | pre_nms_topN = cfg[cfg_key].RPN_PRE_NMS_TOP_N#12000,在做nms之前,最多保留的候选box数目 55 | post_nms_topN = cfg[cfg_key].RPN_POST_NMS_TOP_N#2000,做完nms之后,最多保留的box的数目 56 | nms_thresh = cfg[cfg_key].RPN_NMS_THRESH#nms用参数,阈值是0.7 57 | min_size = cfg[cfg_key].RPN_MIN_SIZE#候选box的最小尺寸,目前是16,高宽均要大于16 58 | #TODO 后期需要修改这个最小尺寸,改为8? 59 | 60 | height, width = rpn_cls_prob_reshape.shape[1:3]#feature-map的高宽 61 | 62 | # the first set of _num_anchors channels are bg probs 63 | # the second set are the fg probs, which we want 64 | # (1, H, W, A) 65 | scores = np.reshape(np.reshape(rpn_cls_prob_reshape, [1, height, width, _num_anchors, 2])[:,:,:,:,1], 66 | [1, height, width, _num_anchors]) 67 | #提取到object的分数,non-object的我们不关心 68 | #并reshape到1*H*W*9 69 | 70 | bbox_deltas = rpn_bbox_pred#模型输出的pred是相对值,需要进一步处理成真实图像中的坐标 71 | #im_info = bottom[2].data[0, :] 72 | 73 | if DEBUG: 74 | print('im_size: ({}, {})'.format(im_info[0], im_info[1])) 75 | print('scale: {}'.format(im_info[2])) 76 | 77 | # 1. 
77 | # 1. Generate proposals from bbox deltas and shifted anchors 78 | if DEBUG: 79 | print('score map size: {}'.format(scores.shape)) 80 | 81 | # Enumerate all shifts 82 | # As in anchor_target_layer_tf, generate the anchor shifts to obtain every anchor over the whole image 83 | shift_x = np.arange(0, width) * _feat_stride 84 | shift_y = np.arange(0, height) * _feat_stride 85 | shift_x, shift_y = np.meshgrid(shift_x, shift_y) 86 | shifts = np.vstack((shift_x.ravel(), shift_y.ravel(), 87 | shift_x.ravel(), shift_y.ravel())).transpose() 88 | 89 | # Enumerate all shifted anchors: 90 | # 91 | # add A anchors (1, A, 4) to 92 | # cell K shifts (K, 1, 4) to get 93 | # shift anchors (K, A, 4) 94 | # reshape to (K*A, 4) shifted anchors 95 | A = _num_anchors 96 | K = shifts.shape[0] 97 | anchors = _anchors.reshape((1, A, 4)) + \ 98 | shifts.reshape((1, K, 4)).transpose((1, 0, 2)) 99 | anchors = anchors.reshape((K * A, 4))  # these are all the anchors over the whole image 100 | 101 | # Transpose and reshape predicted bbox transformations to get them 102 | # into the same order as the anchors: 103 | # bbox deltas will be (1, 4 * A, H, W) format 104 | # transpose to (1, H, W, 4 * A) 105 | # reshape to (1 * H * W * A, 4) where rows are ordered by (h, w, a) 106 | # in slowest to fastest order 107 | bbox_deltas = bbox_deltas.reshape((-1, 4)) #(HxWxA, 4) 108 | 109 | # Same story for the scores: 110 | scores = scores.reshape((-1, 1)) 111 | 112 | # Convert anchors into proposals via bbox transformations 113 | proposals = bbox_transform_inv(anchors, bbox_deltas)  # apply the inverse transform to get the real image coordinates of the boxes 114 | 115 | # 2. clip predicted boxes to image 116 | proposals = clip_boxes(proposals, im_info[:2])  # clip every proposal; parts that fall outside the image boundary are trimmed off 117 | 118 | # 3. remove predicted boxes with either height or width < threshold 119 | # (NOTE: convert min_size to input image scale stored in im_info[2]) 120 | keep = _filter_boxes(proposals, min_size * im_info[2])  # remove proposals smaller than the minimum size 121 | proposals = proposals[keep, :]  # keep the remaining proposals 122 | scores = scores[keep] 123 | bbox_deltas=bbox_deltas[keep,:] 124 | 125 | 126 | # # remove irregular boxes, too fat too tall 127 | # keep = _filter_irregular_boxes(proposals) 128 | # proposals = proposals[keep, :] 129 | # scores = scores[keep] 130 | 131 | # 4. sort all (proposal, score) pairs by score from highest to lowest 132 | # 5. take top pre_nms_topN (e.g. 6000) 133 | order = scores.ravel().argsort()[::-1]  # sort by score from highest to lowest 134 | if pre_nms_topN > 0:  # keep at most pre_nms_topN (12000) proposals going into NMS 135 | order = order[:pre_nms_topN] 136 | proposals = proposals[order, :] 137 | scores = scores[order] 138 | bbox_deltas=bbox_deltas[order,:] 139 | 140 | 141 | # 6. apply nms (e.g. threshold = 0.7) 142 | # 7. take after_nms_topN (e.g. 300) 143 |
# 8. return the top proposals (-> RoIs top) 144 | keep = nms(np.hstack((proposals, scores)), nms_thresh)  # run NMS and keep at most post_nms_topN (2000) proposals 145 | if post_nms_topN > 0: 146 | keep = keep[:post_nms_topN] 147 | proposals = proposals[keep, :] 148 | scores = scores[keep] 149 | bbox_deltas=bbox_deltas[keep,:] 150 | 151 | 152 | # Output rois blob 153 | # Our RPN implementation only supports a single input image, so all 154 | # batch inds are 0 155 | blob = np.hstack((scores.astype(np.float32, copy=False), proposals.astype(np.float32, copy=False))) 156 | 157 | return blob,bbox_deltas 158 | 159 | 160 | def _filter_boxes(boxes, min_size): 161 | """Remove all boxes with any side smaller than min_size.""" 162 | ws = boxes[:, 2] - boxes[:, 0] + 1 163 | hs = boxes[:, 3] - boxes[:, 1] + 1 164 | keep = np.where((ws >= min_size) & (hs >= min_size))[0] 165 | return keep 166 | 167 | def _filter_irregular_boxes(boxes, min_ratio = 0.2, max_ratio = 5): 168 | """Remove all boxes whose aspect ratio falls outside [min_ratio, max_ratio].""" 169 | ws = boxes[:, 2] - boxes[:, 0] + 1 170 | hs = boxes[:, 3] - boxes[:, 1] + 1 171 | rs = ws / hs 172 | keep = np.where((rs <= max_ratio) & (rs >= min_ratio))[0] 173 | return keep 174 | -------------------------------------------------------------------------------- /api/lib/text_connector/__init__.py: -------------------------------------------------------------------------------- 1 | from .detectors import TextDetector 2 | from .text_connect_cfg import Config 3 | -------------------------------------------------------------------------------- /api/lib/text_connector/detectors.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | import numpy as np 3 | from lib.fast_rcnn.nms_wrapper import nms 4 | from lib.fast_rcnn.config import cfg 5 | from .text_proposal_connector import TextProposalConnector 6 | from .text_proposal_connector_oriented import TextProposalConnector as TextProposalConnectorOriented 7 | from .text_connect_cfg import Config as TextLineCfg 8 | 9 | 10 | class TextDetector: 11 | def __init__(self): 12 | self.mode= cfg.TEST.DETECT_MODE 13 | if self.mode == "H": 14 | self.text_proposal_connector=TextProposalConnector() 15 | elif self.mode == "O": 16 | self.text_proposal_connector=TextProposalConnectorOriented() 17 | 18 | 19 | def detect(self, text_proposals,scores,size): 20 | # drop proposals with low scores 21 | keep_inds=np.where(scores>TextLineCfg.TEXT_PROPOSALS_MIN_SCORE)[0] 22 | text_proposals, scores=text_proposals[keep_inds], scores[keep_inds] 23 | 24 | # sort by score 25 | sorted_indices=np.argsort(scores.ravel())[::-1] 26 | text_proposals, scores=text_proposals[sorted_indices], scores[sorted_indices] 27 | 28 | # apply NMS to the proposals 29 | keep_inds=nms(np.hstack((text_proposals, scores)), TextLineCfg.TEXT_PROPOSALS_NMS_THRESH) 30 | text_proposals, scores=text_proposals[keep_inds], scores[keep_inds] 31 | 32 | # get the detection results 33 | text_recs=self.text_proposal_connector.get_text_lines(text_proposals, scores, size) 34 | keep_inds=self.filter_boxes(text_recs) 35 | return text_recs[keep_inds] 36 | 37 | def filter_boxes(self, boxes): 38 | heights=np.zeros((len(boxes), 1), np.float) 39 | widths=np.zeros((len(boxes), 1), np.float) 40 | scores=np.zeros((len(boxes), 1), np.float) 41 | index=0 42 | for box in boxes: 43 | heights[index]=(abs(box[5]-box[1])+abs(box[7]-box[3]))/2.0+1 44 | widths[index]=(abs(box[2]-box[0])+abs(box[6]-box[4]))/2.0+1 45 | scores[index] = box[8] 46 | index += 1 47 | 48 | return np.where((widths/heights>TextLineCfg.MIN_RATIO) & (scores>TextLineCfg.LINE_MIN_SCORE)
& 49 | (widths>(TextLineCfg.TEXT_PROPOSALS_WIDTH*TextLineCfg.MIN_NUM_PROPOSALS)))[0] -------------------------------------------------------------------------------- /api/lib/text_connector/other.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def threshold(coords, min_, max_): 5 | return np.maximum(np.minimum(coords, max_), min_) 6 | 7 | def clip_boxes(boxes, im_shape): 8 | """ 9 | Clip boxes to image boundaries. 10 | """ 11 | boxes[:, 0::2]=threshold(boxes[:, 0::2], 0, im_shape[1]-1) 12 | boxes[:, 1::2]=threshold(boxes[:, 1::2], 0, im_shape[0]-1) 13 | return boxes 14 | 15 | 16 | class Graph: 17 | def __init__(self, graph): 18 | self.graph=graph 19 | 20 | def sub_graphs_connected(self): 21 | sub_graphs=[] 22 | for index in range(self.graph.shape[0]): 23 | if not self.graph[:, index].any() and self.graph[index, :].any(): 24 | v=index 25 | sub_graphs.append([v]) 26 | while self.graph[v, :].any(): 27 | v=np.where(self.graph[v, :])[0][0] 28 | sub_graphs[-1].append(v) 29 | return sub_graphs 30 | 31 | -------------------------------------------------------------------------------- /api/lib/text_connector/text_connect_cfg.py: -------------------------------------------------------------------------------- 1 | class Config: 2 | SCALE=600 3 | MAX_SCALE=1200 4 | TEXT_PROPOSALS_WIDTH=16 5 | MIN_NUM_PROPOSALS = 2 6 | MIN_RATIO=0.5 7 | LINE_MIN_SCORE=0.9 8 | MAX_HORIZONTAL_GAP=50 9 | TEXT_PROPOSALS_MIN_SCORE=0.7 10 | TEXT_PROPOSALS_NMS_THRESH=0.2 11 | MIN_V_OVERLAPS=0.7 12 | MIN_SIZE_SIM=0.7 13 | 14 | 15 | -------------------------------------------------------------------------------- /api/lib/text_connector/text_proposal_connector.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from .other import clip_boxes 3 | from .text_proposal_graph_builder import TextProposalGraphBuilder 4 | 5 | class TextProposalConnector: 6 | def __init__(self): 7 | self.graph_builder=TextProposalGraphBuilder() 8 | 9 | def group_text_proposals(self, text_proposals, scores, im_size): 10 | graph=self.graph_builder.build_graph(text_proposals, scores, im_size) 11 | return graph.sub_graphs_connected() 12 | 13 | def fit_y(self, X, Y, x1, x2): 14 | len(X)!=0 15 | # if X only include one point, the function will get line y=Y[0] 16 | if np.sum(X==X[0])==len(X): 17 | return Y[0], Y[0] 18 | p=np.poly1d(np.polyfit(X, Y, 1)) 19 | return p(x1), p(x2) 20 | 21 | def get_text_lines(self, text_proposals, scores, im_size): 22 | # tp=text proposal 23 | tp_groups=self.group_text_proposals(text_proposals, scores, im_size) 24 | text_lines=np.zeros((len(tp_groups), 5), np.float32) 25 | 26 | for index, tp_indices in enumerate(tp_groups): 27 | text_line_boxes=text_proposals[list(tp_indices)] 28 | 29 | x0=np.min(text_line_boxes[:, 0]) 30 | x1=np.max(text_line_boxes[:, 2]) 31 | 32 | offset=(text_line_boxes[0, 2]-text_line_boxes[0, 0])*0.5 33 | 34 | lt_y, rt_y=self.fit_y(text_line_boxes[:, 0], text_line_boxes[:, 1], x0+offset, x1-offset) 35 | lb_y, rb_y=self.fit_y(text_line_boxes[:, 0], text_line_boxes[:, 3], x0+offset, x1-offset) 36 | 37 | # the score of a text line is the average score of the scores 38 | # of all text proposals contained in the text line 39 | score=scores[list(tp_indices)].sum()/float(len(tp_indices)) 40 | 41 | text_lines[index, 0]=x0 42 | text_lines[index, 1]=min(lt_y, rt_y) 43 | text_lines[index, 2]=x1 44 | text_lines[index, 3]=max(lb_y, rb_y) 45 | text_lines[index, 4]=score 46 | 47 | 
text_lines=clip_boxes(text_lines, im_size) 48 | 49 | text_recs = np.zeros((len(text_lines), 9), np.float) 50 | index = 0 51 | for line in text_lines: 52 | xmin,ymin,xmax,ymax=line[0],line[1],line[2],line[3] 53 | text_recs[index, 0] = xmin 54 | text_recs[index, 1] = ymin 55 | text_recs[index, 2] = xmax 56 | text_recs[index, 3] = ymin 57 | text_recs[index, 4] = xmin 58 | text_recs[index, 5] = ymax 59 | text_recs[index, 6] = xmax 60 | text_recs[index, 7] = ymax 61 | text_recs[index, 8] = line[4] 62 | index = index + 1 63 | 64 | return text_recs 65 | -------------------------------------------------------------------------------- /api/lib/text_connector/text_proposal_connector_oriented.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | import numpy as np 3 | from .text_proposal_graph_builder import TextProposalGraphBuilder 4 | 5 | class TextProposalConnector: 6 | """ 7 | Connect text proposals into text lines 8 | """ 9 | def __init__(self): 10 | self.graph_builder=TextProposalGraphBuilder() 11 | 12 | def group_text_proposals(self, text_proposals, scores, im_size): 13 | graph=self.graph_builder.build_graph(text_proposals, scores, im_size) 14 | return graph.sub_graphs_connected() 15 | 16 | def fit_y(self, X, Y, x1, x2): 17 | assert len(X) != 0 18 | # if X only includes one point, the function returns the line y=Y[0] 19 | if np.sum(X==X[0])==len(X): 20 | return Y[0], Y[0] 21 | p=np.poly1d(np.polyfit(X, Y, 1)) 22 | return p(x1), p(x2) 23 | 24 | def get_text_lines(self, text_proposals, scores, im_size): 25 | """ 26 | text_proposals:boxes 27 | 28 | """ 29 | # tp=text proposal 30 | tp_groups=self.group_text_proposals(text_proposals, scores, im_size)  # first build the graph to find which small proposal boxes make up each text line 31 | 32 | text_lines=np.zeros((len(tp_groups), 8), np.float32) 33 | 34 | for index, tp_indices in enumerate(tp_groups): 35 | text_line_boxes=text_proposals[list(tp_indices)]  # all the small boxes of this text line 36 | X = (text_line_boxes[:,0] + text_line_boxes[:,2]) / 2  # center x coordinate of each small box 37 | Y = (text_line_boxes[:,1] + text_line_boxes[:,3]) / 2  # center y coordinate of each small box 38 | 39 | z1 = np.polyfit(X,Y,1)  # fit a straight line (least squares) through the box centers computed above 40 | 41 | x0=np.min(text_line_boxes[:, 0])  # minimum x coordinate of the text line 42 | x1=np.max(text_line_boxes[:, 2])  # maximum x coordinate of the text line 43 | 44 | offset=(text_line_boxes[0, 2]-text_line_boxes[0, 0])*0.5  # half the width of a small box 45 | 46 | # fit a line through the top-left corners of all small boxes, then compute the y values at the leftmost and rightmost x of the text line 47 | lt_y, rt_y=self.fit_y(text_line_boxes[:, 0], text_line_boxes[:, 1], x0+offset, x1-offset) 48 | # fit a line through the bottom-left corners of all small boxes, then compute the y values at the leftmost and rightmost x of the text line 49 | lb_y, rb_y=self.fit_y(text_line_boxes[:, 0], text_line_boxes[:, 3], x0+offset, x1-offset) 50 | 51 | score=scores[list(tp_indices)].sum()/float(len(tp_indices))  # the text line score is the mean score of its small boxes 52 | 53 | text_lines[index, 0]=x0 54 | text_lines[index, 1]=min(lt_y, rt_y)  # smaller y coordinate of the text line's top edge 55 | text_lines[index, 2]=x1 56 | text_lines[index, 3]=max(lb_y, rb_y)  # larger y coordinate of the text line's bottom edge 57 | text_lines[index, 4]=score  # text line score 58 | text_lines[index, 5]=z1[0]  # slope k and intercept b of the line fitted through the centers 59 | text_lines[index, 6]=z1[1] 60 | height = np.mean( (text_line_boxes[:,3]-text_line_boxes[:,1]) )  # average height of the small boxes 61 | text_lines[index, 7]= height + 2.5 62 | 63 | text_recs = np.zeros((len(text_lines), 9), np.float) 64 | index = 0 65 | for line in text_lines: 66 | b1 = line[6] - line[7] / 2  # from the line height and the fitted center line, derive the intercepts of the top and bottom edge lines 67 | b2 = line[6] + line[7] / 2 68 | x1 = line[0] 69 | y1 = line[5] * line[0] + b1  # top-left 70 | x2 = line[2] 71 | y2 = line[5] * line[2] + b1  # top-right 72 | x3 = line[0] 73 | y3 = line[5] * line[0] + b2  # bottom-left 74 | x4 = line[2] 75 | y4 = line[5] * line[2] + b2  # bottom-right 76 | disX = x2 - x1 77 | disY = y2 - y1
78 | width = np.sqrt(disX * disX + disY * disY)  # text line width 79 | 80 | fTmp0 = y3 - y1  # text line height 81 | fTmp1 = fTmp0 * disY / width 82 | x = np.fabs(fTmp1 * disX / width)  # compensation 83 | y = np.fabs(fTmp1 * disY / width) 84 | if line[5] < 0: 85 | x1 -= x 86 | y1 += y 87 | x4 += x 88 | y4 -= y 89 | else: 90 | x2 += x 91 | y2 += y 92 | x3 -= x 93 | y3 -= y 94 | text_recs[index, 0] = x1 95 | text_recs[index, 1] = y1 96 | text_recs[index, 2] = x2 97 | text_recs[index, 3] = y2 98 | text_recs[index, 4] = x3 99 | text_recs[index, 5] = y3 100 | text_recs[index, 6] = x4 101 | text_recs[index, 7] = y4 102 | text_recs[index, 8] = line[4] 103 | index = index + 1 104 | 105 | return text_recs 106 | -------------------------------------------------------------------------------- /api/lib/text_connector/text_proposal_graph_builder.py: -------------------------------------------------------------------------------- 1 | from .text_connect_cfg import Config as TextLineCfg 2 | from .other import Graph 3 | import numpy as np 4 | 5 | 6 | class TextProposalGraphBuilder: 7 | """ 8 | Build Text proposals into a graph. 9 | """ 10 | def get_successions(self, index): 11 | box=self.text_proposals[index] 12 | results=[] 13 | for left in range(int(box[0])+1, min(int(box[0])+TextLineCfg.MAX_HORIZONTAL_GAP+1, self.im_size[1])): 14 | adj_box_indices=self.boxes_table[left] 15 | for adj_box_index in adj_box_indices: 16 | if self.meet_v_iou(adj_box_index, index): 17 | results.append(adj_box_index) 18 | if len(results)!=0: 19 | return results 20 | return results 21 | 22 | def get_precursors(self, index): 23 | box=self.text_proposals[index] 24 | results=[] 25 | for left in range(int(box[0])-1, max(int(box[0]-TextLineCfg.MAX_HORIZONTAL_GAP), 0)-1, -1): 26 | adj_box_indices=self.boxes_table[left] 27 | for adj_box_index in adj_box_indices: 28 | if self.meet_v_iou(adj_box_index, index): 29 | results.append(adj_box_index) 30 | if len(results)!=0: 31 | return results 32 | return results 33 | 34 | def is_succession_node(self, index, succession_index): 35 | precursors=self.get_precursors(succession_index) 36 | if self.scores[index]>=np.max(self.scores[precursors]): 37 | return True 38 | return False 39 | 40 | def meet_v_iou(self, index1, index2): 41 | def overlaps_v(index1, index2): 42 | h1=self.heights[index1] 43 | h2=self.heights[index2] 44 | y0=max(self.text_proposals[index2][1], self.text_proposals[index1][1]) 45 | y1=min(self.text_proposals[index2][3], self.text_proposals[index1][3]) 46 | return max(0, y1-y0+1)/min(h1, h2) 47 | 48 | def size_similarity(index1, index2): 49 | h1=self.heights[index1] 50 | h2=self.heights[index2] 51 | return min(h1, h2)/max(h1, h2) 52 | 53 | return overlaps_v(index1, index2)>=TextLineCfg.MIN_V_OVERLAPS and \ 54 | size_similarity(index1, index2)>=TextLineCfg.MIN_SIZE_SIM 55 | 56 | def build_graph(self, text_proposals, scores, im_size): 57 | self.text_proposals=text_proposals 58 | self.scores=scores 59 | self.im_size=im_size 60 | self.heights=text_proposals[:, 3]-text_proposals[:, 1]+1 61 | 62 | boxes_table=[[] for _ in range(self.im_size[1])] 63 | for index, box in enumerate(text_proposals): 64 | boxes_table[int(box[0])].append(index) 65 | self.boxes_table=boxes_table 66 | 67 | graph=np.zeros((text_proposals.shape[0], text_proposals.shape[0]), np.bool) 68 | 69 | for index, box in enumerate(text_proposals): 70 | successions=self.get_successions(index) 71 | if len(successions)==0: 72 | continue 73 | succession_index=successions[np.argmax(scores[successions])] 74 | if self.is_succession_node(index, succession_index): 75
| # NOTE: a box can have multiple successions(precursors) if multiple successions(precursors) 76 | # have equal scores. 77 | graph[index, succession_index]=True 78 | return Graph(graph) 79 | -------------------------------------------------------------------------------- /api/lib/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from . import boxes_grid 2 | from . import blob 3 | from . import timer -------------------------------------------------------------------------------- /api/lib/utils/bbox.pyx: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Sergey Karayev 6 | # -------------------------------------------------------- 7 | 8 | cimport cython 9 | import numpy as np 10 | cimport numpy as np 11 | 12 | DTYPE = np.float 13 | ctypedef np.float_t DTYPE_t 14 | 15 | def bbox_overlaps( 16 | np.ndarray[DTYPE_t, ndim=2] boxes, 17 | np.ndarray[DTYPE_t, ndim=2] query_boxes): 18 | """ 19 | Parameters 20 | ---------- 21 | boxes: (N, 4) ndarray of float 22 | query_boxes: (K, 4) ndarray of float 23 | Returns 24 | ------- 25 | overlaps: (N, K) ndarray of overlap between boxes and query_boxes 26 | """ 27 | cdef unsigned int N = boxes.shape[0] 28 | cdef unsigned int K = query_boxes.shape[0] 29 | cdef np.ndarray[DTYPE_t, ndim=2] overlaps = np.zeros((N, K), dtype=DTYPE) 30 | cdef DTYPE_t iw, ih, box_area 31 | cdef DTYPE_t ua 32 | cdef unsigned int k, n 33 | for k in range(K): 34 | box_area = ( 35 | (query_boxes[k, 2] - query_boxes[k, 0] + 1) * 36 | (query_boxes[k, 3] - query_boxes[k, 1] + 1) 37 | ) 38 | for n in range(N): 39 | iw = ( 40 | min(boxes[n, 2], query_boxes[k, 2]) - 41 | max(boxes[n, 0], query_boxes[k, 0]) + 1 42 | ) 43 | if iw > 0: 44 | ih = ( 45 | min(boxes[n, 3], query_boxes[k, 3]) - 46 | max(boxes[n, 1], query_boxes[k, 1]) + 1 47 | ) 48 | if ih > 0: 49 | ua = float( 50 | (boxes[n, 2] - boxes[n, 0] + 1) * 51 | (boxes[n, 3] - boxes[n, 1] + 1) + 52 | box_area - iw * ih 53 | ) 54 | overlaps[n, k] = iw * ih / ua 55 | return overlaps 56 | 57 | def bbox_intersections( 58 | np.ndarray[DTYPE_t, ndim=2] boxes, 59 | np.ndarray[DTYPE_t, ndim=2] query_boxes): 60 | """ 61 | For each query box compute the intersection ratio covered by boxes 62 | ---------- 63 | Parameters 64 | ---------- 65 | boxes: (N, 4) ndarray of float 66 | query_boxes: (K, 4) ndarray of float 67 | Returns 68 | ------- 69 | overlaps: (N, K) ndarray of intersec between boxes and query_boxes 70 | """ 71 | cdef unsigned int N = boxes.shape[0] 72 | cdef unsigned int K = query_boxes.shape[0] 73 | cdef np.ndarray[DTYPE_t, ndim=2] intersec = np.zeros((N, K), dtype=DTYPE) 74 | cdef DTYPE_t iw, ih, box_area 75 | cdef DTYPE_t ua 76 | cdef unsigned int k, n 77 | for k in range(K): 78 | box_area = ( 79 | (query_boxes[k, 2] - query_boxes[k, 0] + 1) * 80 | (query_boxes[k, 3] - query_boxes[k, 1] + 1) 81 | ) 82 | for n in range(N): 83 | iw = ( 84 | min(boxes[n, 2], query_boxes[k, 2]) - 85 | max(boxes[n, 0], query_boxes[k, 0]) + 1 86 | ) 87 | if iw > 0: 88 | ih = ( 89 | min(boxes[n, 3], query_boxes[k, 3]) - 90 | max(boxes[n, 1], query_boxes[k, 1]) + 1 91 | ) 92 | if ih > 0: 93 | intersec[n, k] = iw * ih / box_area 94 | return intersec -------------------------------------------------------------------------------- /api/lib/utils/blob.py: 
-------------------------------------------------------------------------------- 1 | """Blob helper functions.""" 2 | import numpy as np 3 | import cv2 4 | from ..fast_rcnn.config import cfg 5 | 6 | def im_list_to_blob(ims): 7 | """Convert a list of images into a network input. 8 | 9 | Assumes images are already prepared (means subtracted, BGR order, ...). 10 | """ 11 | max_shape = np.array([im.shape for im in ims]).max(axis=0) 12 | num_images = len(ims) 13 | blob = np.zeros((num_images, max_shape[0], max_shape[1], 3), 14 | dtype=np.float32) 15 | for i in range(num_images): 16 | im = ims[i] 17 | blob[i, 0:im.shape[0], 0:im.shape[1], :] = im 18 | 19 | return blob 20 | 21 | def prep_im_for_blob(im, pixel_means, target_size, max_size): 22 | """Mean subtract and scale an image for use in a blob.""" 23 | im = im.astype(np.float32, copy=False) 24 | im -= pixel_means 25 | im_shape = im.shape 26 | im_size_min = np.min(im_shape[0:2]) 27 | im_size_max = np.max(im_shape[0:2]) 28 | im_scale = float(target_size) / float(im_size_min) 29 | # Prevent the biggest axis from being more than MAX_SIZE 30 | if np.round(im_scale * im_size_max) > max_size: 31 | im_scale = float(max_size) / float(im_size_max) 32 | if cfg.TRAIN.RANDOM_DOWNSAMPLE: 33 | r = 0.6 + np.random.rand() * 0.4 34 | im_scale *= r 35 | im = cv2.resize(im, None, None, fx=im_scale, fy=im_scale, 36 | interpolation=cv2.INTER_LINEAR) 37 | 38 | return im, im_scale 39 | -------------------------------------------------------------------------------- /api/lib/utils/boxes_grid.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Subcategory CNN 3 | # Copyright (c) 2015 CVGL Stanford 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Yu Xiang 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | import math 10 | # TODO: make fast_rcnn irrelevant 11 | # >>>> obsolete, because it depends on sth outside of this project 12 | from ..fast_rcnn.config import cfg 13 | # <<<< obsolete 14 | 15 | def get_boxes_grid(image_height, image_width): 16 | """ 17 | Return the boxes on image grid. 18 | calling this function when cfg.IS_MULTISCALE is True, otherwise, calling rdl_roidb.prepare_roidb(imdb) instead. 19 | """ 20 | 21 | # fixed a bug, change cfg.TRAIN.SCALES to cfg.TRAIN.SCALES_BASE 22 | # coz, here needs a ratio around 1.0, not the accutual size. 23 | # height and width of the feature map 24 | if cfg.NET_NAME == 'CaffeNet': 25 | height = np.floor((image_height * max(cfg.TRAIN.SCALES_BASE) - 1) / 4.0 + 1) 26 | height = np.floor((height - 1) / 2.0 + 1 + 0.5) 27 | height = np.floor((height - 1) / 2.0 + 1 + 0.5) 28 | 29 | width = np.floor((image_width * max(cfg.TRAIN.SCALES_BASE) - 1) / 4.0 + 1) 30 | width = np.floor((width - 1) / 2.0 + 1 + 0.5) 31 | width = np.floor((width - 1) / 2.0 + 1 + 0.5) 32 | elif cfg.NET_NAME == 'VGGnet': 33 | height = np.floor(image_height * max(cfg.TRAIN.SCALES_BASE) / 2.0 + 0.5) 34 | height = np.floor(height / 2.0 + 0.5) 35 | height = np.floor(height / 2.0 + 0.5) 36 | height = np.floor(height / 2.0 + 0.5) 37 | 38 | width = np.floor(image_width * max(cfg.TRAIN.SCALES_BASE) / 2.0 + 0.5) 39 | width = np.floor(width / 2.0 + 0.5) 40 | width = np.floor(width / 2.0 + 0.5) 41 | width = np.floor(width / 2.0 + 0.5) 42 | else: 43 | assert (1), 'The network architecture is not supported in utils.get_boxes_grid!' 
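# Note added for clarity (not in the original source): the repeated floor operations
# above emulate the backbone's pooling stages, so `height` and `width` end up being
# the feature-map size (roughly a 16x downsampling for the VGG-style network); the
# grid boxes below are laid out on that feature map.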
44 | 45 | # compute the grid box centers 46 | h = np.arange(height) 47 | w = np.arange(width) 48 | y, x = np.meshgrid(h, w, indexing='ij') 49 | centers = np.dstack((x, y)) 50 | centers = np.reshape(centers, (-1, 2)) 51 | num = centers.shape[0] 52 | 53 | # compute width and height of grid box 54 | area = cfg.TRAIN.KERNEL_SIZE * cfg.TRAIN.KERNEL_SIZE 55 | aspect = cfg.TRAIN.ASPECTS # height / width 56 | num_aspect = len(aspect) 57 | widths = np.zeros((1, num_aspect), dtype=np.float32) 58 | heights = np.zeros((1, num_aspect), dtype=np.float32) 59 | for i in range(num_aspect): 60 | widths[0,i] = math.sqrt(area / aspect[i]) 61 | heights[0,i] = widths[0,i] * aspect[i] 62 | 63 | # construct grid boxes 64 | centers = np.repeat(centers, num_aspect, axis=0) 65 | widths = np.tile(widths, num).transpose() 66 | heights = np.tile(heights, num).transpose() 67 | 68 | x1 = np.reshape(centers[:,0], (-1, 1)) - widths * 0.5 69 | x2 = np.reshape(centers[:,0], (-1, 1)) + widths * 0.5 70 | y1 = np.reshape(centers[:,1], (-1, 1)) - heights * 0.5 71 | y2 = np.reshape(centers[:,1], (-1, 1)) + heights * 0.5 72 | 73 | boxes_grid = np.hstack((x1, y1, x2, y2)) / cfg.TRAIN.SPATIAL_SCALE 74 | 75 | return boxes_grid, centers[:,0], centers[:,1] 76 | -------------------------------------------------------------------------------- /api/lib/utils/cython_nms.pyx: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | cimport numpy as np 10 | 11 | cdef inline np.float32_t max(np.float32_t a, np.float32_t b): 12 | return a if a >= b else b 13 | 14 | cdef inline np.float32_t min(np.float32_t a, np.float32_t b): 15 | return a if a <= b else b 16 | 17 | def nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh): 18 | cdef np.ndarray[np.float32_t, ndim=1] x1 = dets[:, 0] 19 | cdef np.ndarray[np.float32_t, ndim=1] y1 = dets[:, 1] 20 | cdef np.ndarray[np.float32_t, ndim=1] x2 = dets[:, 2] 21 | cdef np.ndarray[np.float32_t, ndim=1] y2 = dets[:, 3] 22 | cdef np.ndarray[np.float32_t, ndim=1] scores = dets[:, 4] 23 | 24 | cdef np.ndarray[np.float32_t, ndim=1] areas = (x2 - x1 + 1) * (y2 - y1 + 1) 25 | cdef np.ndarray[np.int_t, ndim=1] order = scores.argsort()[::-1] 26 | 27 | cdef int ndets = dets.shape[0] 28 | cdef np.ndarray[np.int_t, ndim=1] suppressed = \ 29 | np.zeros((ndets), dtype=np.int) 30 | 31 | # nominal indices 32 | cdef int _i, _j 33 | # sorted indices 34 | cdef int i, j 35 | # temp variables for box i's (the box currently under consideration) 36 | cdef np.float32_t ix1, iy1, ix2, iy2, iarea 37 | # variables for computing overlap with box j (lower scoring box) 38 | cdef np.float32_t xx1, yy1, xx2, yy2 39 | cdef np.float32_t w, h 40 | cdef np.float32_t inter, ovr 41 | 42 | keep = [] 43 | for _i in range(ndets): 44 | i = order[_i] 45 | if suppressed[i] == 1: 46 | continue 47 | keep.append(i) 48 | ix1 = x1[i] 49 | iy1 = y1[i] 50 | ix2 = x2[i] 51 | iy2 = y2[i] 52 | iarea = areas[i] 53 | for _j in range(_i + 1, ndets): 54 | j = order[_j] 55 | if suppressed[j] == 1: 56 | continue 57 | xx1 = max(ix1, x1[j]) 58 | yy1 = max(iy1, y1[j]) 59 | xx2 = min(ix2, x2[j]) 60 | yy2 = min(iy2, y2[j]) 61 | w = max(0.0, xx2 - xx1 + 1) 62 | h = max(0.0, yy2 - yy1 + 1) 63 | inter = w * h 64 | ovr = inter / (iarea + areas[j] 
- inter) 65 | if ovr >= thresh: 66 | suppressed[j] = 1 67 | 68 | return keep 69 | 70 | def nms_new(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh): 71 | cdef np.ndarray[np.float32_t, ndim=1] x1 = dets[:, 0] 72 | cdef np.ndarray[np.float32_t, ndim=1] y1 = dets[:, 1] 73 | cdef np.ndarray[np.float32_t, ndim=1] x2 = dets[:, 2] 74 | cdef np.ndarray[np.float32_t, ndim=1] y2 = dets[:, 3] 75 | cdef np.ndarray[np.float32_t, ndim=1] scores = dets[:, 4] 76 | 77 | cdef np.ndarray[np.float32_t, ndim=1] areas = (x2 - x1 + 1) * (y2 - y1 + 1) 78 | cdef np.ndarray[np.int_t, ndim=1] order = scores.argsort()[::-1] 79 | 80 | cdef int ndets = dets.shape[0] 81 | cdef np.ndarray[np.int_t, ndim=1] suppressed = \ 82 | np.zeros((ndets), dtype=np.int) 83 | 84 | # nominal indices 85 | cdef int _i, _j 86 | # sorted indices 87 | cdef int i, j 88 | # temp variables for box i's (the box currently under consideration) 89 | cdef np.float32_t ix1, iy1, ix2, iy2, iarea 90 | # variables for computing overlap with box j (lower scoring box) 91 | cdef np.float32_t xx1, yy1, xx2, yy2 92 | cdef np.float32_t w, h 93 | cdef np.float32_t inter, ovr 94 | 95 | keep = [] 96 | for _i in range(ndets): 97 | i = order[_i] 98 | if suppressed[i] == 1: 99 | continue 100 | keep.append(i) 101 | ix1 = x1[i] 102 | iy1 = y1[i] 103 | ix2 = x2[i] 104 | iy2 = y2[i] 105 | iarea = areas[i] 106 | for _j in range(_i + 1, ndets): 107 | j = order[_j] 108 | if suppressed[j] == 1: 109 | continue 110 | xx1 = max(ix1, x1[j]) 111 | yy1 = max(iy1, y1[j]) 112 | xx2 = min(ix2, x2[j]) 113 | yy2 = min(iy2, y2[j]) 114 | w = max(0.0, xx2 - xx1 + 1) 115 | h = max(0.0, yy2 - yy1 + 1) 116 | inter = w * h 117 | ovr = inter / (iarea + areas[j] - inter) 118 | ovr1 = inter / iarea 119 | ovr2 = inter / areas[j] 120 | if ovr >= thresh or ovr1 > 0.95 or ovr2 > 0.95: 121 | suppressed[j] = 1 122 | 123 | return keep 124 | -------------------------------------------------------------------------------- /api/lib/utils/gpu_nms.hpp: -------------------------------------------------------------------------------- 1 | void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num, 2 | int boxes_dim, float nms_overlap_thresh, int device_id); 3 | -------------------------------------------------------------------------------- /api/lib/utils/gpu_nms.pyx: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Faster R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | cimport numpy as np 10 | 11 | assert sizeof(int) == sizeof(np.int32_t) 12 | 13 | cdef extern from "gpu_nms.hpp": 14 | void _nms(np.int32_t*, int*, np.float32_t*, int, int, float, int) 15 | 16 | def gpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh, 17 | np.int32_t device_id=0): 18 | cdef int boxes_num = dets.shape[0] 19 | cdef int boxes_dim = dets.shape[1] 20 | cdef int num_out 21 | cdef np.ndarray[np.int32_t, ndim=1] \ 22 | keep = np.zeros(boxes_num, dtype=np.int32) 23 | cdef np.ndarray[np.float32_t, ndim=1] \ 24 | scores = dets[:, 4] 25 | cdef np.ndarray[np.int_t, ndim=1] \ 26 | order = scores.argsort()[::-1] 27 | cdef np.ndarray[np.float32_t, ndim=2] \ 28 | sorted_dets = dets[order, :] 29 | _nms(&keep[0], &num_out, &sorted_dets[0, 0], boxes_num, boxes_dim, thresh, device_id) 30 | keep = keep[:num_out] 31 | return 
list(order[keep]) 32 | -------------------------------------------------------------------------------- /api/lib/utils/make.sh: -------------------------------------------------------------------------------- 1 | cython bbox.pyx 2 | cython cython_nms.pyx 3 | cython gpu_nms.pyx 4 | python setup.py build_ext --inplace 5 | rm -rf build 6 | -------------------------------------------------------------------------------- /api/lib/utils/nms_kernel.cu: -------------------------------------------------------------------------------- 1 | // ------------------------------------------------------------------ 2 | // Faster R-CNN 3 | // Copyright (c) 2015 Microsoft 4 | // Licensed under The MIT License [see fast-rcnn/LICENSE for details] 5 | // Written by Shaoqing Ren 6 | // ------------------------------------------------------------------ 7 | 8 | #include "gpu_nms.hpp" 9 | #include <vector> 10 | #include <iostream> 11 | 12 | #define CUDA_CHECK(condition) \ 13 | /* Code block avoids redefinition of cudaError_t error */ \ 14 | do { \ 15 | cudaError_t error = condition; \ 16 | if (error != cudaSuccess) { \ 17 | std::cout << cudaGetErrorString(error) << std::endl; \ 18 | } \ 19 | } while (0) 20 | 21 | #define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0)) 22 | int const threadsPerBlock = sizeof(unsigned long long) * 8; 23 | 24 | __device__ inline float devIoU(float const * const a, float const * const b) { 25 | float left = max(a[0], b[0]), right = min(a[2], b[2]); 26 | float top = max(a[1], b[1]), bottom = min(a[3], b[3]); 27 | float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f); 28 | float interS = width * height; 29 | float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1); 30 | float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1); 31 | return interS / (Sa + Sb - interS); 32 | } 33 | 34 | __global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh, 35 | const float *dev_boxes, unsigned long long *dev_mask) { 36 | const int row_start = blockIdx.y; 37 | const int col_start = blockIdx.x; 38 | 39 | // if (row_start > col_start) return; 40 | 41 | const int row_size = 42 | min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); 43 | const int col_size = 44 | min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); 45 | 46 | __shared__ float block_boxes[threadsPerBlock * 5]; 47 | if (threadIdx.x < col_size) { 48 | block_boxes[threadIdx.x * 5 + 0] = 49 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0]; 50 | block_boxes[threadIdx.x * 5 + 1] = 51 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1]; 52 | block_boxes[threadIdx.x * 5 + 2] = 53 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2]; 54 | block_boxes[threadIdx.x * 5 + 3] = 55 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3]; 56 | block_boxes[threadIdx.x * 5 + 4] = 57 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4]; 58 | } 59 | __syncthreads(); 60 | 61 | if (threadIdx.x < row_size) { 62 | const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; 63 | const float *cur_box = dev_boxes + cur_box_idx * 5; 64 | int i = 0; 65 | unsigned long long t = 0; 66 | int start = 0; 67 | if (row_start == col_start) { 68 | start = threadIdx.x + 1; 69 | } 70 | for (i = start; i < col_size; i++) { 71 | if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) { 72 | t |= 1ULL << i; 73 | } 74 | } 75 | const int col_blocks = DIVUP(n_boxes, threadsPerBlock); 76 | dev_mask[cur_box_idx * col_blocks + col_start] = t; 77 | } 78 | } 79 | 80 | void _set_device(int device_id) { 81
| int current_device; 82 | CUDA_CHECK(cudaGetDevice(&current_device)); 83 | if (current_device == device_id) { 84 | return; 85 | } 86 | // The call to cudaSetDevice must come before any calls to Get, which 87 | // may perform initialization using the GPU. 88 | CUDA_CHECK(cudaSetDevice(device_id)); 89 | } 90 | 91 | void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num, 92 | int boxes_dim, float nms_overlap_thresh, int device_id) { 93 | _set_device(device_id); 94 | 95 | float* boxes_dev = NULL; 96 | unsigned long long* mask_dev = NULL; 97 | 98 | const int col_blocks = DIVUP(boxes_num, threadsPerBlock); 99 | 100 | CUDA_CHECK(cudaMalloc(&boxes_dev, 101 | boxes_num * boxes_dim * sizeof(float))); 102 | CUDA_CHECK(cudaMemcpy(boxes_dev, 103 | boxes_host, 104 | boxes_num * boxes_dim * sizeof(float), 105 | cudaMemcpyHostToDevice)); 106 | 107 | CUDA_CHECK(cudaMalloc(&mask_dev, 108 | boxes_num * col_blocks * sizeof(unsigned long long))); 109 | 110 | dim3 blocks(DIVUP(boxes_num, threadsPerBlock), 111 | DIVUP(boxes_num, threadsPerBlock)); 112 | dim3 threads(threadsPerBlock); 113 | nms_kernel<<<blocks, threads>>>(boxes_num, 114 | nms_overlap_thresh, 115 | boxes_dev, 116 | mask_dev); 117 | 118 | std::vector<unsigned long long> mask_host(boxes_num * col_blocks); 119 | CUDA_CHECK(cudaMemcpy(&mask_host[0], 120 | mask_dev, 121 | sizeof(unsigned long long) * boxes_num * col_blocks, 122 | cudaMemcpyDeviceToHost)); 123 | 124 | std::vector<unsigned long long> remv(col_blocks); 125 | memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks); 126 | 127 | int num_to_keep = 0; 128 | for (int i = 0; i < boxes_num; i++) { 129 | int nblock = i / threadsPerBlock; 130 | int inblock = i % threadsPerBlock; 131 | 132 | if (!(remv[nblock] & (1ULL << inblock))) { 133 | keep_out[num_to_keep++] = i; 134 | unsigned long long *p = &mask_host[0] + i * col_blocks; 135 | for (int j = nblock; j < col_blocks; j++) { 136 | remv[j] |= p[j]; 137 | } 138 | } 139 | } 140 | *num_out = num_to_keep; 141 | 142 | CUDA_CHECK(cudaFree(boxes_dev)); 143 | CUDA_CHECK(cudaFree(mask_dev)); 144 | } 145 | -------------------------------------------------------------------------------- /api/lib/utils/setup.py: -------------------------------------------------------------------------------- 1 | from Cython.Build import cythonize 2 | import os 3 | from os.path import join as pjoin 4 | import numpy as np 5 | from distutils.core import setup 6 | from distutils.extension import Extension 7 | from Cython.Distutils import build_ext 8 | 9 | def find_in_path(name, path): 10 | for dir in path.split(os.pathsep): 11 | binpath = pjoin(dir, name) 12 | if os.path.exists(binpath): 13 | return os.path.abspath(binpath) 14 | return None 15 | 16 | def locate_cuda(): 17 | # first check if the CUDAHOME env variable is in use 18 | if 'CUDAHOME' in os.environ: 19 | home = os.environ['CUDAHOME'] 20 | nvcc = pjoin(home, 'bin', 'nvcc') 21 | else: 22 | # otherwise, search the PATH for NVCC 23 | default_path = pjoin(os.sep, 'usr', 'local', 'cuda', 'bin') 24 | nvcc = find_in_path('nvcc', os.environ['PATH'] + os.pathsep + default_path) 25 | if nvcc is None: 26 | raise EnvironmentError('The nvcc binary could not be ' 27 | 'located in your $PATH.
Either add it to your path, or set $CUDAHOME') 28 | home = os.path.dirname(os.path.dirname(nvcc)) 29 | 30 | cudaconfig = {'home':home, 'nvcc':nvcc, 31 | 'include': pjoin(home, 'include'), 32 | 'lib64': pjoin(home, 'lib64')} 33 | for k, v in cudaconfig.items(): 34 | #for k, v in cudaconfig.iteritems(): 35 | if not os.path.exists(v): 36 | raise EnvironmentError('The CUDA %s path could not be located in %s' % (k, v)) 37 | return cudaconfig 38 | 39 | CUDA = locate_cuda() 40 | 41 | 42 | try: 43 | numpy_include = np.get_include() 44 | except AttributeError: 45 | numpy_include = np.get_numpy_include() 46 | 47 | def customize_compiler_for_nvcc(self): 48 | self.src_extensions.append('.cu') 49 | default_compiler_so = self.compiler_so 50 | super = self._compile 51 | def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts): 52 | print(extra_postargs) 53 | if os.path.splitext(src)[1] == '.cu': 54 | # use the cuda for .cu files 55 | self.set_executable('compiler_so', CUDA['nvcc']) 56 | # use only a subset of the extra_postargs, which are 1-1 translated 57 | # from the extra_compile_args in the Extension class 58 | postargs = extra_postargs['nvcc'] 59 | else: 60 | postargs = extra_postargs['gcc'] 61 | 62 | super(obj, src, ext, cc_args, postargs, pp_opts) 63 | # reset the default compiler_so, which we might have changed for cuda 64 | self.compiler_so = default_compiler_so 65 | # inject our redefined _compile method into the class 66 | self._compile = _compile 67 | 68 | 69 | # run the customize_compiler 70 | class custom_build_ext(build_ext): 71 | def build_extensions(self): 72 | customize_compiler_for_nvcc(self.compiler) 73 | build_ext.build_extensions(self) 74 | 75 | ext_modules = [ 76 | Extension( 77 | "bbox", 78 | ["bbox.pyx"], 79 | extra_compile_args={'gcc': ["-Wno-cpp", "-Wno-unused-function"]}, 80 | include_dirs = [numpy_include] 81 | ), 82 | Extension( 83 | "cython_nms", 84 | ["cython_nms.pyx"], 85 | extra_compile_args={'gcc': ["-Wno-cpp", "-Wno-unused-function"]}, 86 | include_dirs = [numpy_include] 87 | ), 88 | Extension('gpu_nms', 89 | ['nms_kernel.cu', 'gpu_nms.pyx'], 90 | library_dirs=[CUDA['lib64']], 91 | libraries=['cudart'], 92 | language='c++', 93 | runtime_library_dirs=[CUDA['lib64']], 94 | extra_compile_args={'gcc': ["-Wno-unused-function"], 95 | 'nvcc': ['-arch=sm_35', 96 | '--ptxas-options=-v', 97 | '-c', 98 | '--compiler-options', 99 | "'-fPIC'"]}, 100 | include_dirs = [numpy_include, CUDA['include']] 101 | ), 102 | ] 103 | 104 | setup( 105 | ext_modules=ext_modules, 106 | cmdclass={'build_ext': custom_build_ext}, 107 | ) 108 | 109 | -------------------------------------------------------------------------------- /api/lib/utils/timer.py: -------------------------------------------------------------------------------- 1 | import time 2 | class Timer(object): 3 | def __init__(self): 4 | self.total_time = 0. 5 | self.calls = 0 6 | self.start_time = 0. 7 | self.diff = 0. 8 | self.average_time = 0. 
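# Illustrative usage (not part of the original file):
#     timer = Timer()
#     timer.tic()
#     ...do some work...
#     elapsed = timer.toc(average=False)   # time for this call only
#     avg = timer.average_time             # running mean across toc() calls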
9 | 10 | def tic(self): 11 | self.start_time = time.time() 12 | 13 | def toc(self, average=True): 14 | self.diff = time.time() - self.start_time 15 | self.total_time += self.diff 16 | self.calls += 1 17 | self.average_time = self.total_time / self.calls 18 | if average: 19 | return self.average_time 20 | else: 21 | return self.diff 22 | -------------------------------------------------------------------------------- /api/model/model.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Praneet9/Docify/a936014750dedf4a6b5a84918bbbf66cd63109de/api/model/model.h5 -------------------------------------------------------------------------------- /api/model/model.json: -------------------------------------------------------------------------------- 1 | {"config": {"name": "sequential_1", "layers": [{"config": {"data_format": "channels_last", "dtype": "float32", "trainable": true, "name": "conv2d_1", "activity_regularizer": null, "padding": "valid", "batch_input_shape": [null, 32, 32, 1], "kernel_initializer": {"config": {"scale": 1.0, "distribution": "uniform", "mode": "fan_avg", "seed": null}, "class_name": "VarianceScaling"}, "kernel_size": [3, 3], "strides": [1, 1], "bias_regularizer": null, "activation": "relu", "kernel_regularizer": null, "dilation_rate": [1, 1], "kernel_constraint": null, "use_bias": true, "bias_constraint": null, "bias_initializer": {"config": {}, "class_name": "Zeros"}, "filters": 64}, "class_name": "Conv2D"}, {"config": {"data_format": "channels_last", "trainable": true, "name": "conv2d_2", "activity_regularizer": null, "padding": "valid", "kernel_initializer": {"config": {"scale": 1.0, "distribution": "uniform", "mode": "fan_avg", "seed": null}, "class_name": "VarianceScaling"}, "activation": "relu", "strides": [1, 1], "bias_regularizer": null, "bias_initializer": {"config": {}, "class_name": "Zeros"}, "kernel_regularizer": null, "kernel_size": [3, 3], "dilation_rate": [1, 1], "kernel_constraint": null, "use_bias": true, "bias_constraint": null, "filters": 64}, "class_name": "Conv2D"}, {"config": {"data_format": "channels_last", "trainable": true, "pool_size": [1, 1], "padding": "valid", "name": "max_pooling2d_1", "strides": [1, 1]}, "class_name": "MaxPooling2D"}, {"config": {"data_format": "channels_last", "trainable": true, "name": "conv2d_3", "activity_regularizer": null, "padding": "valid", "kernel_initializer": {"config": {"scale": 1.0, "distribution": "uniform", "mode": "fan_avg", "seed": null}, "class_name": "VarianceScaling"}, "activation": "relu", "strides": [1, 1], "bias_regularizer": null, "bias_initializer": {"config": {}, "class_name": "Zeros"}, "kernel_regularizer": null, "kernel_size": [3, 3], "dilation_rate": [1, 1], "kernel_constraint": null, "use_bias": true, "bias_constraint": null, "filters": 128}, "class_name": "Conv2D"}, {"config": {"data_format": "channels_last", "trainable": true, "name": "conv2d_4", "activity_regularizer": null, "padding": "valid", "kernel_initializer": {"config": {"scale": 1.0, "distribution": "uniform", "mode": "fan_avg", "seed": null}, "class_name": "VarianceScaling"}, "activation": "relu", "strides": [1, 1], "bias_regularizer": null, "bias_initializer": {"config": {}, "class_name": "Zeros"}, "kernel_regularizer": null, "kernel_size": [3, 3], "dilation_rate": [1, 1], "kernel_constraint": null, "use_bias": true, "bias_constraint": null, "filters": 128}, "class_name": "Conv2D"}, {"config": {"data_format": "channels_last", "trainable": true, "pool_size": [3, 3], 
"padding": "valid", "name": "max_pooling2d_2", "strides": [3, 3]}, "class_name": "MaxPooling2D"}, {"config": {"seed": null, "trainable": true, "name": "dropout_1", "noise_shape": null, "rate": 0.5}, "class_name": "Dropout"}, {"config": {"data_format": "channels_last", "trainable": true, "name": "conv2d_5", "activity_regularizer": null, "padding": "valid", "kernel_initializer": {"config": {"scale": 1.0, "distribution": "uniform", "mode": "fan_avg", "seed": null}, "class_name": "VarianceScaling"}, "activation": "relu", "strides": [1, 1], "bias_regularizer": null, "bias_initializer": {"config": {}, "class_name": "Zeros"}, "kernel_regularizer": null, "kernel_size": [3, 3], "dilation_rate": [1, 1], "kernel_constraint": null, "use_bias": true, "bias_constraint": null, "filters": 256}, "class_name": "Conv2D"}, {"config": {"data_format": "channels_last", "trainable": true, "pool_size": [2, 2], "padding": "valid", "name": "max_pooling2d_3", "strides": [2, 2]}, "class_name": "MaxPooling2D"}, {"config": {"data_format": "channels_last", "trainable": true, "name": "flatten_1"}, "class_name": "Flatten"}, {"config": {"trainable": true, "name": "dense_1", "activity_regularizer": null, "units": 256, "kernel_initializer": {"config": {"scale": 1.0, "distribution": "uniform", "mode": "fan_avg", "seed": null}, "class_name": "VarianceScaling"}, "bias_initializer": {"config": {}, "class_name": "Zeros"}, "bias_regularizer": null, "activation": "relu", "kernel_regularizer": null, "kernel_constraint": null, "bias_constraint": null, "use_bias": true}, "class_name": "Dense"}, {"config": {"seed": null, "trainable": true, "name": "dropout_2", "noise_shape": null, "rate": 0.5}, "class_name": "Dropout"}, {"config": {"trainable": true, "name": "dense_2", "activity_regularizer": null, "units": 128, "kernel_initializer": {"config": {"scale": 1.0, "distribution": "uniform", "mode": "fan_avg", "seed": null}, "class_name": "VarianceScaling"}, "bias_initializer": {"config": {}, "class_name": "Zeros"}, "bias_regularizer": null, "activation": "relu", "kernel_regularizer": null, "kernel_constraint": null, "bias_constraint": null, "use_bias": true}, "class_name": "Dense"}, {"config": {"trainable": true, "name": "dense_3", "activity_regularizer": null, "units": 10, "kernel_initializer": {"config": {"scale": 1.0, "distribution": "uniform", "mode": "fan_avg", "seed": null}, "class_name": "VarianceScaling"}, "bias_initializer": {"config": {}, "class_name": "Zeros"}, "bias_regularizer": null, "activation": "softmax", "kernel_regularizer": null, "kernel_constraint": null, "bias_constraint": null, "use_bias": true}, "class_name": "Dense"}]}, "keras_version": "2.2.4", "class_name": "Sequential", "backend": "tensorflow"} -------------------------------------------------------------------------------- /api/outputs.txt: -------------------------------------------------------------------------------- 1 | ########################################################################## 2 | 3 | ######################## Raw Output ############################# 4 | 5 | Govermment of India 6 | Suman Goyal 7 | / 008 01/01/1979 8 | / FEMALE 9 | 8144 5625 3341 10 | Scanned by CamScanner 11 | 12 | 13 | ######################## Cleaned Output ############################# 14 | 15 | Aadhar No : 8144 5625 3341 16 | Name : Govermment of India 17 | Date of Birth : 01/01/1979 18 | Gender : Female 19 | ########################################################################## 20 | 21 | -------------------------------------------------------------------------------- 
/api/ref.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Praneet9/Docify/a936014750dedf4a6b5a84918bbbf66cd63109de/api/ref.png -------------------------------------------------------------------------------- /api/server.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, request, jsonify, render_template 2 | import os 3 | from processing import recognise_text, crop_aadhar, get_address, seven_segment, _init_model, get_labels_from_aadhar, get_labels_from_licence 4 | from cheque_details_extraction import get_micrcode, ensemble_acc_output, ensemble_ifsc_output 5 | import datetime 6 | import db 7 | from face_matching import match_faces 8 | 9 | app = Flask(__name__) 10 | 11 | # path to upload images 12 | UPLOAD_FOLDER = './UPLOAD_FOLDER/' 13 | 14 | # initializing seven segment display model 15 | _init_model() 16 | 17 | # route to uploading images of id cards 18 | @app.route('/image/upload', methods=['POST']) 19 | def index(): 20 | 21 | if request.method == 'POST': 22 | 23 | # saving current timestamp 24 | current_time = str(datetime.datetime.now()).replace('-', '_').replace(':', '_') 25 | 26 | # get the type of image that is being received 27 | image_type = request.form['type'] 28 | 29 | # setting filename that is being received to current time stamp with its directory 30 | filename = UPLOAD_FOLDER + image_type + '/' + current_time + '.png' 31 | 32 | # if the image_type folder doesn't already exist, create it 33 | if not os.path.exists(UPLOAD_FOLDER + image_type): 34 | os.mkdir(UPLOAD_FOLDER + image_type) 35 | # directory for saving faces in the id cards 36 | os.mkdir(UPLOAD_FOLDER + image_type + '/' + 'faces') 37 | 38 | # if image_type is bank cheque, preprocess accordingly 39 | if image_type == 'Bank Cheque': 40 | details = {} 41 | 42 | # get photo from android 43 | photo = request.files['photo'] 44 | photo.save(filename) 45 | 46 | # get details from the image 47 | details['MICR'] = get_micrcode(filename) 48 | details['ACC.No'] = ensemble_acc_output(filename) 49 | details['IFSC'] = ensemble_ifsc_output(filename) 50 | 51 | # return the details and the image name it is saved as 52 | return jsonify({'status':True, 'fields': details, 'image_path': filename, 'photo_path': 'none' }) 53 | 54 | # if image_type is seven segment, preprocess accordingly 55 | elif image_type == 'Seven Segment': 56 | details = {} 57 | 58 | # get photo from android 59 | photo = request.files['photo'] 60 | photo.save(filename) 61 | 62 | # get text from seven segment 63 | text = seven_segment(filename) 64 | 65 | details[0] = text 66 | 67 | # return the details and the image name it is saved as 68 | return jsonify({'status':True, 'fields': details, 'image_path': filename, 'photo_path': 'none' }) 69 | 70 | # elif image_type == 'Aadhar Back': 71 | # details = {} 72 | 73 | # # get photo from android 74 | # photo = request.files['photo'] 75 | # photo.save(filename) 76 | 77 | # crop_path = UPLOAD_FOLDER + image_type + '/temp/' + current_time + '.png' 78 | 79 | # if not os.path.exists(UPLOAD_FOLDER + image_type + '/temp'): 80 | # os.mkdir(UPLOAD_FOLDER + image_type + '/temp') 81 | 82 | # crop_aadhar(filename, crop_path) 83 | 84 | # # recognise text in the id card 85 | # data, photo_path = recognise_text(crop_path, 'none') 86 | 87 | # details = get_address(data) 88 | 89 | # os.remove(crop_path) 90 | 91 | # # return the details and the image name it is saved as 92 | # return jsonify({'status':True, 
'fields': details, 'image_path': filename, 'photo_path': 'none' }) 93 | 94 | else: 95 | # setting directory for saving face in the id card 96 | photo_path = UPLOAD_FOLDER + image_type + '/' + 'faces' + '/' + current_time + '.png' 97 | 98 | # get photo from android 99 | photo = request.files['photo'] 100 | photo.save(filename) 101 | 102 | # recognise text in the id card 103 | data, photo_path = recognise_text(filename, photo_path) 104 | 105 | # extract labels from the recognised text according to the image_type 106 | if image_type == "Driving Licence": 107 | details = { idx : text for idx, text in enumerate(data) } 108 | details = get_labels_from_licence(details) 109 | elif image_type == "Aadhar Card": 110 | details = get_labels_from_aadhar(data) 111 | else: 112 | details = { idx : text for idx, text in enumerate(data) } 113 | 114 | with open('outputs.txt', 'a+') as f: 115 | f.write("##########################################################################\n\n") 116 | f.write('######################## Raw Output #############################\n\n') 117 | for value in data: 118 | f.write(str(value) + '\n') 119 | f.write('\n\n######################## Cleaned Output #############################\n\n') 120 | for key, value in details.items(): 121 | f.write(str(key) + ' : ' + str(value) + '\n') 122 | f.write("##########################################################################\n\n") 123 | 124 | # return the details and the image name and photo path it is saved as 125 | return jsonify({'status':True, 'fields': details, 'image_path': filename, 'photo_path': photo_path}) 126 | else: 127 | # if not POST, terminate 128 | return jsonify({'status':False}) 129 | 130 | # save data to database 131 | @app.route('/api/data', methods=['POST']) 132 | def saveData(): 133 | 134 | # get values as json 135 | values = request.get_json() 136 | image_type = values.get('type') 137 | data = values.get('fields') 138 | 139 | db.insert_data(image_type, args_dict = data) 140 | 141 | return jsonify({'status': True}) 142 | 143 | 144 | @app.route('/image/face_match',methods=['GET','POST']) 145 | def face_match(): 146 | 147 | # saving current timestamp 148 | current_time = str(datetime.datetime.now()) 149 | 150 | # temporary folder for saving face for face matching 151 | if not os.path.exists(UPLOAD_FOLDER + 'temp'): 152 | os.mkdir(UPLOAD_FOLDER + 'temp') 153 | 154 | # setting filename that is being received to current time stamp with its directory 155 | filename = UPLOAD_FOLDER + 'temp' + '/' + current_time + '.png' 156 | 157 | # getting the path of the saved face image 158 | photo_path = request.form['photopath'] 159 | 160 | # get live face from android 161 | photo = request.files['liveface'] 162 | photo.save(filename) 163 | 164 | # check face match and probability 165 | result, percent = match_faces(id_card_image=photo_path, ref_image=filename) 166 | 167 | # delete the temp face image 168 | os.remove(filename) 169 | 170 | # return face match prediction and percentage 171 | return jsonify({'status':str(result), 'percent': percent}) 172 | 173 | 174 | # GET 175 | @app.route('/') 176 | def home(): 177 | 178 | return render_template('index.html') 179 | 180 | 181 | # running web app in local machine 182 | if __name__ == '__main__': 183 | app.run(host='0.0.0.0', port=5000, debug=False) 184 | -------------------------------------------------------------------------------- /api/templates/aadhar_template.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Praneet9/Docify/a936014750dedf4a6b5a84918bbbf66cd63109de/api/templates/aadhar_template.png -------------------------------------------------------------------------------- /api/templates/index.html: -------------------------------------------------------------------------------- [The HTML markup of index.html was not preserved when this dump was generated; the only text content that survives is the page title "Docify".]
-------------------------------------------------------------------------------- /api/templates/license_template.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Praneet9/Docify/a936014750dedf4a6b5a84918bbbf66cd63109de/api/templates/license_template.jpg -------------------------------------------------------------------------------- /api/templates/pancard_template.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Praneet9/Docify/a936014750dedf4a6b5a84918bbbf66cd63109de/api/templates/pancard_template.jpg -------------------------------------------------------------------------------- /api/templates/template_acc.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Praneet9/Docify/a936014750dedf4a6b5a84918bbbf66cd63109de/api/templates/template_acc.jpg -------------------------------------------------------------------------------- /api/templates/template_ifsc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Praneet9/Docify/a936014750dedf4a6b5a84918bbbf66cd63109de/api/templates/template_ifsc.png -------------------------------------------------------------------------------- /inference.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from utils.unet import UNet 3 | 4 | import torch 5 | import cv2 6 | from utils.image_aug import normalization2 7 | 8 | def get_image_tensor(img_path): 9 | image = cv2.imread(img_path) 10 | image = cv2.resize(image, (360, 480)) 11 | image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) 12 | og_image = image.copy() 13 | 14 | # Normalize the image 15 | image = normalization2(image, max=1, min=0) 16 | 17 | # HWC to CHW 18 | image = image.transpose((2, 0, 1)) 19 | image = np.expand_dims(image, axis=0) 20 | 21 | image = torch.from_numpy(image).type(torch.FloatTensor) 22 | 23 | return og_image, image 24 | 25 | def get_mask(model, device, image): 26 | image = image.to(device=device, dtype=torch.float32) 27 | 28 | mask_pred = model(image) 29 | pred = torch.sigmoid(mask_pred) 30 | pred = (pred > 0.5).float() 31 | 32 | pred = pred.squeeze() 33 | pred = pred.cpu().detach().numpy() 34 | 35 | return pred 36 | 37 | if __name__ == '__main__': 38 | 39 | unet = UNet(n_channels=3, n_classes=1) 40 | checkpoint = torch.load('17_model.pth') 41 | unet.load_state_dict(checkpoint['model_state_dict']) 42 | unet.eval() 43 | 44 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 45 | print("Device:", device) 46 | unet = unet.to(device=device) 47 | 48 | img_path = '/data/Data/midv500_data/IMG_2126.JPG' 49 | 50 | original, img_tensor = get_image_tensor(img_path) 51 | prediction = get_mask(unet, device, img_tensor) 52 | prediction = prediction.astype(int) 53 | prediction = np.expand_dims(prediction, axis=2) 54 | masked_image = original * prediction 55 | masked_image = masked_image.astype(np.uint8) 56 | tile = cv2.hconcat([original, masked_image]) 57 | 58 | tile = cv2.cvtColor(tile, cv2.COLOR_RGB2BGR) 59 | masked_image = cv2.cvtColor(masked_image, cv2.COLOR_RGB2BGR) 60 | cv2.imwrite('tile.jpg', tile) 61 | cv2.imwrite('result.jpg', masked_image) -------------------------------------------------------------------------------- /tile.jpg:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Praneet9/Docify/a936014750dedf4a6b5a84918bbbf66cd63109de/tile.jpg -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from tqdm import tqdm 3 | import cv2 4 | 5 | import torch 6 | from torch.utils.data import DataLoader, random_split 7 | from torch.utils.tensorboard import SummaryWriter 8 | from torchsummary import summary 9 | from torch import optim 10 | import torch.nn as nn 11 | 12 | from utils.unet import UNet 13 | from utils.dataset import BasicDataset 14 | 15 | 16 | def train_model(model, device, 17 | img_dir, mask_dir, 18 | checkpoint_dir, 19 | checkpoint_file=None, 20 | epochs=20, lr=0.001, 21 | val_split=0.20, 22 | batch_size=1): 23 | 24 | dataset = BasicDataset(img_dir, mask_dir) 25 | val_samples = int(len(dataset) * val_split) 26 | train_samples = len(dataset) - val_samples 27 | train, val = random_split(dataset, [train_samples, val_samples]) 28 | train_loader = DataLoader(train, batch_size=batch_size, shuffle=True, num_workers=8) 29 | val_loader = DataLoader(val, batch_size=batch_size, shuffle=False, num_workers=8, drop_last=True) 30 | 31 | writer = SummaryWriter(log_dir=checkpoint_dir, comment=f'LR_{lr}_BS_{batch_size}') 32 | global_step = 0 33 | 34 | optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=1e-8) 35 | criterion = nn.BCEWithLogitsLoss() 36 | 37 | training_loss = [] 38 | validation_loss = [] 39 | current_epoch = 0 40 | 41 | if checkpoint_file is not None: 42 | checkpoint = torch.load(checkpoint_dir + checkpoint_file) 43 | model.load_state_dict(checkpoint['model_state_dict']) 44 | optimizer.load_state_dict(checkpoint['optimizer_state_dict']) 45 | current_epoch = checkpoint['epoch'] 46 | training_loss = checkpoint['loss'] 47 | validation_loss = checkpoint['val_loss'] 48 | global_step = checkpoint['global_step'] 49 | 50 | for epoch in range(1 + current_epoch, epochs + 1): 51 | model.train() 52 | 53 | losses = [] 54 | val_losses = [] 55 | avg_val_loss = np.inf 56 | 57 | with tqdm(total=train_samples, desc=f'Epoch {epoch}/{epochs}', unit='img') as pbar: 58 | for batch in train_loader: 59 | imgs = batch['image'] 60 | true_masks = batch['mask'] 61 | 62 | imgs = imgs.to(device=device, dtype=torch.float32) 63 | mask_type = torch.float32 if model.n_classes == 1 else torch.long 64 | true_masks = true_masks.to(device=device, dtype=mask_type) 65 | 66 | masks_pred = model(imgs) 67 | loss = criterion(masks_pred, true_masks) 68 | losses.append(loss.item()) 69 | writer.add_scalar('Loss/train', sum(losses)/len(losses), global_step) 70 | 71 | pbar.set_postfix(**{'loss': sum(losses)/len(losses)}) 72 | 73 | optimizer.zero_grad() 74 | loss.backward() 75 | nn.utils.clip_grad_value_(model.parameters(), 0.1) 76 | optimizer.step() 77 | 78 | pbar.update(imgs.shape[0]) 79 | global_step += 1 80 | 81 | val_loss = 0 82 | for val_batch in val_loader: 83 | imgs, true_masks = val_batch['image'], val_batch['mask'] 84 | imgs = imgs.to(device=device, dtype=torch.float32) 85 | true_masks = true_masks.to(device=device, dtype=torch.float32) 86 | 87 | with torch.no_grad(): 88 | mask_pred = model(imgs) 89 | 90 | pred = torch.sigmoid(mask_pred) 91 | pred = (pred > 0.5).float() 92 | val_loss += criterion(mask_pred, true_masks).item()  # BCE loss on the validation batch prediction 93 | val_score = val_loss / len(val_loader) 94 | val_losses.append(val_score) 95 | avg_val_loss =
sum(val_losses) / len(val_losses) 96 | pbar.set_postfix(**{'loss': sum(losses)/len(losses), 'val_loss': avg_val_loss}) 97 | 98 | training_loss.append(sum(losses)/len(losses)) 99 | validation_loss.append(avg_val_loss) 100 | 101 | torch.save({ 102 | 'epoch': epoch, 103 | 'model_state_dict': model.state_dict(), 104 | 'optimizer_state_dict': optimizer.state_dict(), 105 | 'loss': training_loss, 106 | 'val_loss': validation_loss, 107 | 'global_step': global_step 108 | }, checkpoint_dir + str(epoch) + '_model.pth') 109 | 110 | writer.close() 111 | 112 | if __name__ == '__main__': 113 | 114 | IMAGES_PATH = '/data/Data/midv500_data/dataset/images_resized/' 115 | MASKS_PATH = '/data/Data/midv500_data/dataset/masks_resized/' 116 | MODEL_CHECKPOINT_PATH = '/data/Data/midv500_data/dataset/checkpoints/' 117 | 118 | dataset = BasicDataset(IMAGES_PATH, MASKS_PATH) 119 | 120 | unet = UNet(n_channels=3, n_classes=1) 121 | summary(unet.cuda(), (3, 480, 360)) 122 | 123 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 124 | print("Device:", device) 125 | unet = unet.to(device=device) 126 | 127 | train_model(unet, 128 | device, 129 | IMAGES_PATH, 130 | MASKS_PATH, 131 | MODEL_CHECKPOINT_PATH) -------------------------------------------------------------------------------- /utils/dataset.py: -------------------------------------------------------------------------------- 1 | from os import listdir 2 | from os.path import splitext 3 | from glob import glob 4 | 5 | from utils.image_aug import flip, add_gaussian_noise, add_uniform_noise, change_brightness, normalization2 6 | 7 | import torch 8 | from torch.utils.data import Dataset 9 | 10 | import numpy as np 11 | from random import randint 12 | import cv2 13 | 14 | class BasicDataset(Dataset): 15 | def __init__(self, imgs_dir, masks_dir, mask_suffix=''): 16 | self.imgs_dir = imgs_dir 17 | self.masks_dir = masks_dir 18 | self.mask_suffix = mask_suffix 19 | self.height = 480 20 | self.width = 360 21 | 22 | self.ids = [splitext(file)[0] for file in listdir(imgs_dir) 23 | if not file.startswith('.')] 24 | 25 | def __len__(self): 26 | return len(self.ids) 27 | 28 | def preprocess(self, img, mask): 29 | 30 | # Augmentation 31 | # flip {0: vertical, 1: horizontal, 2: both, 3: none} 32 | flip_num = randint(0, 3) 33 | img = flip(img, flip_num) 34 | mask = flip(mask, flip_num) 35 | 36 | # Noise Determine {0: Gaussian_noise, 1: uniform_noise 37 | if randint(0, 1): 38 | # Gaussian_noise 39 | gaus_sd, gaus_mean = randint(0, 20), 0 40 | img = add_gaussian_noise(img, gaus_mean, gaus_sd) 41 | else: 42 | # uniform_noise 43 | l_bound, u_bound = randint(-20, 0), randint(0, 20) 44 | img = add_uniform_noise(img, l_bound, u_bound) 45 | 46 | # Brightness 47 | pix_add = randint(-20, 20) 48 | img = change_brightness(img, pix_add) 49 | 50 | # Normalize the image 51 | img = normalization2(img, max=1, min=0) 52 | 53 | 54 | # Normalize mask to only 0 and 1 55 | mask = mask/255 56 | # msk_as_np = np.expand_dims(msk_as_np, axis=0) # add additional dimension 57 | 58 | if len(mask.shape) == 2: 59 | mask = np.expand_dims(mask, axis=2) 60 | 61 | # HWC to CHW 62 | img = img.transpose((2, 0, 1)) 63 | mask = mask.transpose((2, 0, 1)) 64 | 65 | return img, mask 66 | 67 | def __getitem__(self, i): 68 | idx = self.ids[i] 69 | mask_file = glob(self.masks_dir + idx + self.mask_suffix + '.*') 70 | img_file = glob(self.imgs_dir + idx + '.*') 71 | 72 | assert len(mask_file) == 1, \ 73 | f'Either no mask or multiple masks found for the ID {idx}: {mask_file}' 74 | assert len(img_file) == 
1, \ 75 | f'Either no image or multiple images found for the ID {idx}: {img_file}' 76 | 77 | mask = cv2.imread(mask_file[0]) 78 | img = cv2.imread(img_file[0]) 79 | 80 | img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 81 | mask = cv2.cvtColor(mask, cv2.COLOR_BGR2GRAY) 82 | mask = cv2.threshold(mask, 100, 255, cv2.THRESH_BINARY)[1] 83 | 84 | img, mask = self.preprocess(img, mask) 85 | 86 | return { 87 | 'image': torch.from_numpy(img).type(torch.FloatTensor), 88 | 'mask': torch.from_numpy(mask).type(torch.FloatTensor) 89 | } -------------------------------------------------------------------------------- /utils/image_aug.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from random import randint 3 | 4 | 5 | def flip(image, option_value): 6 | """ 7 | Args: 8 | image : numpy array of image 9 | option_value : random integer between 0 and 3 10 | Return : 11 | image : numpy array of flipped image 12 | """ 13 | if option_value == 0: 14 | # vertical 15 | image = np.flip(image, option_value) 16 | elif option_value == 1: 17 | # horizontal 18 | image = np.flip(image, option_value) 19 | elif option_value == 2: 20 | # horizontally and vertically flip 21 | image = np.flip(image, 0) 22 | image = np.flip(image, 1) 23 | else: 24 | image = image 25 | # no effect 26 | return image 27 | 28 | def add_gaussian_noise(image, mean=0, std=1): 29 | """ 30 | Args: 31 | image : numpy array of image 32 | mean : mean of the gaussian noise 33 | std : standard deviation of the gaussian noise 34 | Return : 35 | image : numpy array of image with gaussian noise added 36 | """ 37 | gaus_noise = np.random.normal(mean, std, image.shape) 38 | image = image.astype("int16") 39 | noise_img = image + gaus_noise 40 | noise_img = ceil_floor_image(noise_img) 41 | return noise_img 42 | 43 | def add_uniform_noise(image, low=-10, high=10): 44 | """ 45 | Args: 46 | image : numpy array of image 47 | low : lower boundary of output interval 48 | high : upper boundary of output interval 49 | Return : 50 | image : numpy array of image with uniform noise added 51 | """ 52 | uni_noise = np.random.uniform(low, high, image.shape) 53 | image = image.astype("int16") 54 | noise_img = image + uni_noise 55 | noise_img = ceil_floor_image(noise_img) 56 | return noise_img 57 | 58 | def change_brightness(image, value): 59 | """ 60 | Args: 61 | image : numpy array of image 62 | value : brightness 63 | Return : 64 | image : numpy array of image with brightness added 65 | """ 66 | image = image.astype("int16") 67 | image = image + value 68 | image = ceil_floor_image(image) 69 | return image 70 | 71 | def ceil_floor_image(image): 72 | """ 73 | Args: 74 | image : numpy array of image in datatype int16 75 | Return : 76 | image : numpy array of image in datatype uint8 with ceiling(maximum 255) and flooring(minimum 0) 77 | """ 78 | image[image > 255] = 255 79 | image[image < 0] = 0 80 | image = image.astype("uint8") 81 | return image 82 | 83 | def approximate_image(image): 84 | """ 85 | Args: 86 | image : numpy array of image in datatype int16 87 | Return : 88 | image : numpy array of image in datatype uint8 only with 255 and 0 89 | """ 90 | image[image > 127.5] = 255 91 | image[image < 127.5] = 0 92 | image = image.astype("uint8") 93 | return image 94 | 95 | def normalization2(image, max, min): 96 | """Normalization to range of [min, max] 97 | Args : 98 | image : numpy array of image 99 | max, min : upper and lower bounds of the output range 100 | Return : 101 | image : numpy array of image with values rescaled to the range [min, max] 102 | """ 103 | image_new = (image -
np.min(image))*(max - min)/(np.max(image)-np.min(image)) + min 104 | return image_new 105 | 106 | def stride_size(image_len, crop_num, crop_size): 107 | """return stride size 108 | Args : 109 | image_len(int) : length of one size of image (width or height) 110 | crop_num(int) : number of crop in certain direction 111 | crop_size(int) : size of crop 112 | Return : 113 | stride_size(int) : stride size 114 | """ 115 | return int((image_len - crop_size)/(crop_num - 1)) 116 | 117 | # def multi_cropping(image, crop_size, crop_num1, crop_num2): 118 | # """crop the image and pad it to in_size 119 | # Args : 120 | # images : numpy arrays of images 121 | # crop_size(int) : size of cropped image 122 | # crop_num2 (int) : number of crop in horizontal way 123 | # crop_num1 (int) : number of crop in vertical way 124 | # Return : 125 | # cropped_imgs : numpy arrays of stacked images 126 | # """ 127 | 128 | # img_height, img_width = image.shape[0], image.shape[1] 129 | # assert crop_size*crop_num1 >= img_width and crop_size * \ 130 | # crop_num2 >= img_height, "Whole image cannot be sufficiently expressed" 131 | # assert crop_num1 <= img_width - crop_size + 1 and crop_num2 <= img_height - \ 132 | # crop_size + 1, "Too many number of crops" 133 | 134 | # cropped_imgs = [] 135 | # # int((img_height - crop_size)/(crop_num1 - 1)) 136 | # dim1_stride = stride_size(img_height, crop_num1, crop_size) 137 | # # int((img_width - crop_size)/(crop_num2 - 1)) 138 | # dim2_stride = stride_size(img_width, crop_num2, crop_size) 139 | # for i in range(crop_num1): 140 | # for j in range(crop_num2): 141 | # cropped_imgs.append(cropping(image, crop_size, 142 | # dim1_stride*i, dim2_stride*j)) 143 | # return np.asarray(cropped_imgs) 144 | 145 | # def cropping(image, vert_crop_size, hort_crop_size, dim1, dim2): 146 | # """crop the image and pad it to in_size 147 | # Args : 148 | # images : numpy array of images 149 | # crop_size(int) : size of cropped image 150 | # dim1(int) : vertical location of crop 151 | # dim2(int) : horizontal location of crop 152 | # Return : 153 | # cropped_img: numpy array of cropped image 154 | # """ 155 | # cropped_img = image[dim1:dim1+vert_crop_size, dim2:dim2+hort_crop_size] 156 | # return cropped_img 157 | 158 | def add_padding(image, in_size, out_size, mode): 159 | """Pad the image to in_size 160 | Args : 161 | images : numpy array of images 162 | in_size(int) : the input_size of model 163 | out_size(int) : the output_size of model 164 | mode(str) : mode of padding 165 | Return : 166 | padded_img: numpy array of padded image 167 | """ 168 | pad_size = int((in_size - out_size)/2) 169 | padded_img = np.pad(image, pad_size, mode=mode) 170 | return padded_img 171 | 172 | 173 | def division_array(crop_size, crop_num1, crop_num2, dim1, dim2): 174 | """Make division array 175 | Args : 176 | crop_size(int) : size of cropped image 177 | crop_num2 (int) : number of crop in horizontal way 178 | crop_num1 (int) : number of crop in vertical way 179 | dim1(int) : vertical size of output 180 | dim2(int) : horizontal size_of_output 181 | Return : 182 | div_array : numpy array of numbers of 1,2,4 183 | """ 184 | div_array = np.zeros([dim1, dim2]) # make division array 185 | one_array = np.ones([crop_size, crop_size]) # one array to be added to div_array 186 | dim1_stride = stride_size(dim1, crop_num1, crop_size) # vertical stride 187 | dim2_stride = stride_size(dim2, crop_num2, crop_size) # horizontal stride 188 | for i in range(crop_num1): 189 | for j in range(crop_num2): 190 | # add ones to div_array at 
specific position 191 | div_array[dim1_stride*i:dim1_stride*i + crop_size, 192 | dim2_stride*j:dim2_stride*j + crop_size] += one_array 193 | return div_array 194 | 195 | def image_concatenate(image, crop_num1, crop_num2, dim1, dim2): 196 | """concatenate images 197 | Args : 198 | image : output images (should be square) 199 | crop_num2 (int) : number of crop in horizontal way (2) 200 | crop_num1 (int) : number of crop in vertical way (2) 201 | dim1(int) : vertical size of output (512) 202 | dim2(int) : horizontal size_of_output (512) 203 | Return : 204 | div_array : numpy arrays of numbers of 1,2,4 205 | """ 206 | crop_size = image.shape[1] # size of crop 207 | empty_array = np.zeros([dim1, dim2]).astype("float64") # to make sure no overflow 208 | dim1_stride = stride_size(dim1, crop_num1, crop_size) # vertical stride 209 | dim2_stride = stride_size(dim2, crop_num2, crop_size) # horizontal stride 210 | index = 0 211 | for i in range(crop_num1): 212 | for j in range(crop_num2): 213 | # add image to empty_array at specific position 214 | empty_array[dim1_stride*i:dim1_stride*i + crop_size, 215 | dim2_stride*j:dim2_stride*j + crop_size] += image[index] 216 | index += 1 217 | return empty_array -------------------------------------------------------------------------------- /utils/nn_block.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch 3 | 4 | 5 | class DualConv(nn.Module): 6 | 7 | def __init__(self, input_channels, output_channels): 8 | 9 | super().__init__() 10 | 11 | self.dual_conv = nn.Sequential( 12 | nn.Conv2d(input_channels, output_channels, kernel_size=3, padding=1), 13 | nn.BatchNorm2d(output_channels), 14 | nn.ReLU(inplace=True), 15 | nn.Conv2d(output_channels, output_channels, kernel_size=3, padding=1), 16 | nn.BatchNorm2d(output_channels), 17 | nn.ReLU(inplace=True) 18 | ) 19 | 20 | def forward(self, x): 21 | 22 | return self.dual_conv(x) 23 | 24 | class DownConv(nn.Module): 25 | 26 | def __init__(self, input_channels, output_channels): 27 | 28 | super().__init__() 29 | 30 | self.down_conv = nn.Sequential( 31 | nn.MaxPool2d(2), 32 | DualConv(input_channels, output_channels) 33 | ) 34 | 35 | def forward(self, x): 36 | 37 | return self.down_conv(x) 38 | 39 | class UpConv(nn.Module): 40 | 41 | def __init__(self, input_channels, output_channels): 42 | 43 | super().__init__() 44 | 45 | self.up_conv = nn.ConvTranspose2d(input_channels, output_channels, kernel_size=2, stride=2) 46 | self.conv = DualConv(input_channels, output_channels) 47 | 48 | def forward(self, x1, x2): 49 | 50 | x1 = self.up_conv(x1) 51 | 52 | y_pad = x2.size()[2] - x1.size()[2] 53 | x_pad = x2.size()[3] - x1.size()[3] 54 | 55 | x1 = nn.functional.pad(x1, [x_pad // 2, x_pad - x_pad // 2, 56 | y_pad // 2, y_pad - y_pad // 2]) 57 | 58 | x = torch.cat([x2, x1], dim = 1) 59 | 60 | return self.conv(x) 61 | 62 | class OutputConv(nn.Module): 63 | 64 | def __init__(self, input_channels, output_channels): 65 | 66 | super().__init__() 67 | 68 | self.conv = nn.Conv2d(input_channels, output_channels, kernel_size=1) 69 | 70 | def forward(self, x): 71 | 72 | return self.conv(x) 73 | 74 | -------------------------------------------------------------------------------- /utils/unet.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from utils.nn_block import DualConv, DownConv, UpConv, OutputConv 3 | 4 | 5 | class UNet(nn.Module): 6 | 7 | def __init__(self, n_channels, n_classes): 8 | 9 | 
super().__init__() 10 | 11 | self.n_channels = n_channels 12 | self.n_classes = n_classes 13 | 14 | self.inp = DualConv(n_channels, 64) 15 | 16 | self.down_conv_1 = DownConv(64, 128) 17 | self.down_conv_2 = DownConv(128, 256) 18 | self.down_conv_3 = DownConv(256, 512) 19 | self.down_conv_4 = DownConv(512, 1024) 20 | 21 | self.up_conv_1 = UpConv(1024, 512) 22 | self.up_conv_2 = UpConv(512, 256) 23 | self.up_conv_3 = UpConv(256, 128) 24 | self.up_conv_4 = UpConv(128, 64) 25 | 26 | self.op_conv = OutputConv(64, n_classes) 27 | 28 | def forward(self, x): 29 | 30 | x1 = self.inp(x) 31 | 32 | x2 = self.down_conv_1(x1) 33 | x3 = self.down_conv_2(x2) 34 | x4 = self.down_conv_3(x3) 35 | x5 = self.down_conv_4(x4) 36 | 37 | x6 = self.up_conv_1(x5, x4) 38 | x7 = self.up_conv_2(x6, x3) 39 | x8 = self.up_conv_3(x7, x2) 40 | x9 = self.up_conv_4(x8, x1) 41 | 42 | result = self.op_conv(x9) 43 | 44 | return result --------------------------------------------------------------------------------
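As a quick sanity check of the UNet wiring above, the following minimal sketch (assuming only the utils package layout shown in this repo) instantiates the network and runs a single forward pass at the 480x360 resolution used by train.py and inference.py; the dummy input and threshold are illustrative, not part of the original code:

import torch
from utils.unet import UNet

# Smoke test: one dummy RGB image through the encoder-decoder defined above.
unet = UNet(n_channels=3, n_classes=1)
unet.eval()  # use running BatchNorm statistics

with torch.no_grad():
    dummy = torch.randn(1, 3, 480, 360)             # NCHW batch of one image
    logits = unet(dummy)                            # raw scores from OutputConv
    mask = (torch.sigmoid(logits) > 0.5).float()    # binary document mask

print(logits.shape, mask.shape)  # both torch.Size([1, 1, 480, 360])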