├── .github └── ISSUE_TEMPLATE │ ├── bug_report.yml │ ├── feature_request.yml │ └── new_model_addition.yml ├── .gitignore ├── LICENSE ├── LICENSE.md ├── README.md ├── pyproject.toml ├── requirements.txt └── yubiai ├── __init__.py ├── nlp ├── __init__.py ├── language_detection │ ├── README.md │ ├── __init__.py │ └── yubiLanguageDetection.py ├── nsfw_text │ ├── README.md │ ├── __init__.py │ └── textNsfwDetection.py ├── seq2seq │ ├── README.md │ ├── __init__.py │ └── util.py ├── tokenizer │ ├── README.md │ ├── __init__.py │ ├── sp2hf.py │ └── yubiTokenizer.py ├── utility │ ├── __init__.py │ └── file_handlers.py └── yubiEmbeddings │ ├── README.md │ ├── __init__.py │ ├── finetune_yubibert_classification_example.py │ ├── finetune_yubibert_classification_example.sh │ └── yubibert.py └── vision ├── __init__.py ├── document_image_detection ├── README.md ├── __init__.py └── image_classification.py ├── document_segmentation ├── README.md ├── __init__.py ├── detection-results-info.png ├── ground-truth-info.png ├── lamr.png ├── mAP.png └── segment_doc.py ├── skew_detection ├── README.md ├── __init__.py └── document_skew_detection.py └── utility ├── README.md ├── __init__.py └── preprocess.py /.github/ISSUE_TEMPLATE/bug_report.yml: -------------------------------------------------------------------------------- 1 | name: "\U0001FAB2 Bug Report" 2 | description: Submit a bug report to help us improve models. Make sure to read [readme]https://github.com/Yubi2Community/YubiAI#readme 3 | body: 4 | - type: textarea 5 | id: system-info 6 | attributes: 7 | label: System Info 8 | description: Please share your system info with us. 9 | placeholder: python version, torch version ... 10 | validations: 11 | required: true 12 | 13 | - type: textarea 14 | id: who-can-help 15 | attributes: 16 | label: Who can help? 17 | description: | 18 | Your issue will be replied to more quickly if you can figure out the right person to tag with @ 19 | If you know how to use git blame, that is the easiest way, otherwise, here is a rough guide of **who to tag**. 20 | Please tag fewer than 3 people. 21 | 22 | Models: 23 | 24 | - Yubibert: `@swapaj` 25 | - YubiGPT: `@sanprit` 26 | - Text Language Detection: `@venkata-ramireddy-mettu` 27 | - Speech: `@swapaj` 28 | - TrueCaser: `@swapaj` 29 | - Skew Detection: `@swapaj` 30 | - Image Preprocessing: `@sanprit` 31 | placeholder: "@Username ..." 32 | 33 | - type: checkboxes 34 | id: information-scripts-examples 35 | attributes: 36 | label: Information 37 | description: 'The problem arises when using:' 38 | options: 39 | - label: "The official example scripts" 40 | - label: "My own modified scripts" 41 | 42 | - type: checkboxes 43 | id: information-tasks 44 | attributes: 45 | label: Tasks 46 | description: "The tasks I am working on are:" 47 | options: 48 | - label: "An officially supported task in the `examples` folder (nlp, vision)" 49 | - label: "My own task or dataset (give details below)" 50 | 51 | - type: textarea 52 | id: reproduction 53 | validations: 54 | required: true 55 | attributes: 56 | label: Reproduction 57 | description: | 58 | Please provide a code sample that reproduces the problem you ran into. It can be a Colab link or just a code snippet. 59 | If you have code snippets, error messages, stack traces please provide them here as well. 60 | Important! Use code tags to correctly format your code. 
See https://help.github.com/en/github/writing-on-github/creating-and-highlighting-code-blocks#syntax-highlighting 61 | Do not use screenshots, as they are hard to read and (more importantly) don't allow others to copy-and-paste your code. 62 | 63 | placeholder: | 64 | Steps to reproduce the behavior: 65 | 66 | 1. 67 | 2. 68 | 3. 69 | 70 | 71 | - type: textarea 72 | id: expected-behavior 73 | validations: 74 | required: true 75 | attributes: 76 | label: Expected behavior 77 | description: "A clear and concise description of what you would expect to happen." -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.yml: -------------------------------------------------------------------------------- 1 | name: "\U0001FAB6 Feature request" 2 | description: Submit a proposal/request for a new feature. Make sure to read [readme]https://github.com/Yubi2Community/YubiAI#readme 3 | labels: [ "feature" ] 4 | body: 5 | - type: textarea 6 | id: feature-request 7 | validations: 8 | required: true 9 | attributes: 10 | label: Feature request 11 | description: | 12 | A clear and concise description of the feature proposal. Please provide a link to the paper and code in case they exist. 13 | 14 | - type: textarea 15 | id: motivation 16 | validations: 17 | required: true 18 | attributes: 19 | label: Motivation 20 | description: | 21 | Please outline the motivation for the proposal. Is your feature request related to a problem? e.g., I'm always frustrated when [...]. If this is related to another GitHub issue, please link here too. 22 | 23 | 24 | - type: textarea 25 | id: contribution 26 | validations: 27 | required: true 28 | attributes: 29 | label: Your contribution 30 | description: | 31 | Is there any way that you could help, e.g. by submitting a PR? Make sure to read [readme]https://github.com/Yubi2Community/YubiAI#readme 32 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/new_model_addition.yml: -------------------------------------------------------------------------------- 1 | name: "\U0001F91D New model addition" 2 | description: Submit a proposal/request to implement a new model. Make sure to read [readme]https://github.com/Yubi2Community/YubiAI#readme 3 | labels: [ "New model" ] 4 | 5 | body: 6 | - type: textarea 7 | id: description-request 8 | validations: 9 | required: true 10 | attributes: 11 | label: Model description 12 | description: | 13 | Put any and all important information relative to the model 14 | 15 | - type: checkboxes 16 | id: information-tasks 17 | attributes: 18 | label: Open source status 19 | description: | 20 | Please note that if the model implementation isn't available or if the weights aren't open-source, we are less likely to implement it in `yubi-library`. 21 | options: 22 | - label: "The model implementation is available" 23 | - label: "The model weights are available" 24 | 25 | - type: textarea 26 | id: additional-info 27 | attributes: 28 | label: Provide useful links for the implementation 29 | description: | 30 | Please provide information regarding the implementation, the weights, and the authors. 31 | Please mention the authors by @username if you're aware of their usernames. 
32 | Make sure to read [readme]https://github.com/Yubi2Community/YubiAI#readme 33 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | # Byte-compiled / optimized / DLL files 131 | __pycache__/ 132 | *.py[cod] 133 | *$py.class 134 | 135 | # C extensions 136 | *.so 137 | 138 | # Distribution / packaging 139 | .Python 140 | build/ 141 | develop-eggs/ 142 | dist/ 143 | downloads/ 144 | eggs/ 145 | .eggs/ 146 | lib/ 147 | lib64/ 148 | parts/ 149 | sdist/ 150 | var/ 151 | wheels/ 152 | pip-wheel-metadata/ 153 | share/python-wheels/ 154 | *.egg-info/ 155 | .installed.cfg 156 | *.egg 157 | MANIFEST 158 | 159 | # PyInstaller 160 | # Usually these files are written by a python script from a template 161 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
162 | *.manifest 163 | *.spec 164 | 165 | # Installer logs 166 | pip-log.txt 167 | pip-delete-this-directory.txt 168 | 169 | # Unit test / coverage reports 170 | htmlcov/ 171 | .tox/ 172 | .nox/ 173 | .coverage 174 | .coverage.* 175 | .cache 176 | nosetests.xml 177 | coverage.xml 178 | *.cover 179 | *.py,cover 180 | .hypothesis/ 181 | .pytest_cache/ 182 | 183 | # Translations 184 | *.mo 185 | *.pot 186 | 187 | # Django stuff: 188 | *.log 189 | local_settings.py 190 | db.sqlite3 191 | db.sqlite3-journal 192 | 193 | # Flask stuff: 194 | instance/ 195 | .webassets-cache 196 | 197 | # Scrapy stuff: 198 | .scrapy 199 | 200 | # Sphinx documentation 201 | docs/_build/ 202 | 203 | # PyBuilder 204 | target/ 205 | 206 | # Jupyter Notebook 207 | .ipynb_checkpoints 208 | 209 | # IPython 210 | profile_default/ 211 | ipython_config.py 212 | 213 | # pyenv 214 | .python-version 215 | 216 | # pipenv 217 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 218 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 219 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 220 | # install all needed dependencies. 221 | #Pipfile.lock 222 | 223 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 224 | __pypackages__/ 225 | 226 | # Celery stuff 227 | celerybeat-schedule 228 | celerybeat.pid 229 | 230 | # SageMath parsed files 231 | *.sage.py 232 | 233 | # Environments 234 | .env 235 | .venv 236 | env/ 237 | venv/ 238 | ENV/ 239 | env.bak/ 240 | venv.bak/ 241 | 242 | # Spyder project settings 243 | .spyderproject 244 | .spyproject 245 | 246 | # Rope project settings 247 | .ropeproject 248 | 249 | # mkdocs documentation 250 | /site 251 | 252 | # mypy 253 | .mypy_cache/ 254 | .dmypy.json 255 | dmypy.json 256 | 257 | # Pyre type checker 258 | .pyre/ 259 | 260 | #ignore csv & npy file extns 261 | *.csv 262 | *.npy -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2022 - The Yubi (CredAvenue) Team. All rights reserved. 2 | 3 | Apache License 4 | Version 2.0, January 2004 5 | http://www.apache.org/licenses/ 6 | 7 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 8 | 9 | 1. Definitions. 10 | 11 | "License" shall mean the terms and conditions for use, reproduction, 12 | and distribution as defined by Sections 1 through 9 of this document. 13 | 14 | "Licensor" shall mean the copyright owner or entity authorized by 15 | the copyright owner that is granting the License. 16 | 17 | "Legal Entity" shall mean the union of the acting entity and all 18 | other entities that control, are controlled by, or are under common 19 | control with that entity. For the purposes of this definition, 20 | "control" means (i) the power, direct or indirect, to cause the 21 | direction or management of such entity, whether by contract or 22 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 23 | outstanding shares, or (iii) beneficial ownership of such entity. 24 | 25 | "You" (or "Your") shall mean an individual or Legal Entity 26 | exercising permissions granted by this License. 27 | 28 | "Source" form shall mean the preferred form for making modifications, 29 | including but not limited to software source code, documentation 30 | source, and configuration files. 
31 | 32 | "Object" form shall mean any form resulting from mechanical 33 | transformation or translation of a Source form, including but 34 | not limited to compiled object code, generated documentation, 35 | and conversions to other media types. 36 | 37 | "Work" shall mean the work of authorship, whether in Source or 38 | Object form, made available under the License, as indicated by a 39 | copyright notice that is included in or attached to the work 40 | (an example is provided in the Appendix below). 41 | 42 | "Derivative Works" shall mean any work, whether in Source or Object 43 | form, that is based on (or derived from) the Work and for which the 44 | editorial revisions, annotations, elaborations, or other modifications 45 | represent, as a whole, an original work of authorship. For the purposes 46 | of this License, Derivative Works shall not include works that remain 47 | separable from, or merely link (or bind by name) to the interfaces of, 48 | the Work and Derivative Works thereof. 49 | 50 | "Contribution" shall mean any work of authorship, including 51 | the original version of the Work and any modifications or additions 52 | to that Work or Derivative Works thereof, that is intentionally 53 | submitted to Licensor for inclusion in the Work by the copyright owner 54 | or by an individual or Legal Entity authorized to submit on behalf of 55 | the copyright owner. For the purposes of this definition, "submitted" 56 | means any form of electronic, verbal, or written communication sent 57 | to the Licensor or its representatives, including but not limited to 58 | communication on electronic mailing lists, source code control systems, 59 | and issue tracking systems that are managed by, or on behalf of, the 60 | Licensor for the purpose of discussing and improving the Work, but 61 | excluding communication that is conspicuously marked or otherwise 62 | designated in writing by the copyright owner as "Not a Contribution." 63 | 64 | "Contributor" shall mean Licensor and any individual or Legal Entity 65 | on behalf of whom a Contribution has been received by Licensor and 66 | subsequently incorporated within the Work. 67 | 68 | 2. Grant of Copyright License. Subject to the terms and conditions of 69 | this License, each Contributor hereby grants to You a perpetual, 70 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 71 | copyright license to reproduce, prepare Derivative Works of, 72 | publicly display, publicly perform, sublicense, and distribute the 73 | Work and such Derivative Works in Source or Object form. 74 | 75 | 3. Grant of Patent License. Subject to the terms and conditions of 76 | this License, each Contributor hereby grants to You a perpetual, 77 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 78 | (except as stated in this section) patent license to make, have made, 79 | use, offer to sell, sell, import, and otherwise transfer the Work, 80 | where such license applies only to those patent claims licensable 81 | by such Contributor that are necessarily infringed by their 82 | Contribution(s) alone or by combination of their Contribution(s) 83 | with the Work to which such Contribution(s) was submitted. 
If You 84 | institute patent litigation against any entity (including a 85 | cross-claim or counterclaim in a lawsuit) alleging that the Work 86 | or a Contribution incorporated within the Work constitutes direct 87 | or contributory patent infringement, then any patent licenses 88 | granted to You under this License for that Work shall terminate 89 | as of the date such litigation is filed. 90 | 91 | 4. Redistribution. You may reproduce and distribute copies of the 92 | Work or Derivative Works thereof in any medium, with or without 93 | modifications, and in Source or Object form, provided that You 94 | meet the following conditions: 95 | 96 | (a) You must give any other recipients of the Work or 97 | Derivative Works a copy of this License; and 98 | 99 | (b) You must cause any modified files to carry prominent notices 100 | stating that You changed the files; and 101 | 102 | (c) You must retain, in the Source form of any Derivative Works 103 | that You distribute, all copyright, patent, trademark, and 104 | attribution notices from the Source form of the Work, 105 | excluding those notices that do not pertain to any part of 106 | the Derivative Works; and 107 | 108 | (d) If the Work includes a "NOTICE" text file as part of its 109 | distribution, then any Derivative Works that You distribute must 110 | include a readable copy of the attribution notices contained 111 | within such NOTICE file, excluding those notices that do not 112 | pertain to any part of the Derivative Works, in at least one 113 | of the following places: within a NOTICE text file distributed 114 | as part of the Derivative Works; within the Source form or 115 | documentation, if provided along with the Derivative Works; or, 116 | within a display generated by the Derivative Works, if and 117 | wherever such third-party notices normally appear. The contents 118 | of the NOTICE file are for informational purposes only and 119 | do not modify the License. You may add Your own attribution 120 | notices within Derivative Works that You distribute, alongside 121 | or as an addendum to the NOTICE text from the Work, provided 122 | that such additional attribution notices cannot be construed 123 | as modifying the License. 124 | 125 | You may add Your own copyright statement to Your modifications and 126 | may provide additional or different license terms and conditions 127 | for use, reproduction, or distribution of Your modifications, or 128 | for any such Derivative Works as a whole, provided Your use, 129 | reproduction, and distribution of the Work otherwise complies with 130 | the conditions stated in this License. 131 | 132 | 5. Submission of Contributions. Unless You explicitly state otherwise, 133 | any Contribution intentionally submitted for inclusion in the Work 134 | by You to the Licensor shall be under the terms and conditions of 135 | this License, without any additional terms or conditions. 136 | Notwithstanding the above, nothing herein shall supersede or modify 137 | the terms of any separate license agreement you may have executed 138 | with Licensor regarding such Contributions. 139 | 140 | 6. Trademarks. This License does not grant permission to use the trade 141 | names, trademarks, service marks, or product names of the Licensor, 142 | except as required for reasonable and customary use in describing the 143 | origin of the Work and reproducing the content of the NOTICE file. 144 | 145 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 146 | agreed to in writing, Licensor provides the Work (and each 147 | Contributor provides its Contributions) on an "AS IS" BASIS, 148 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 149 | implied, including, without limitation, any warranties or conditions 150 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 151 | PARTICULAR PURPOSE. You are solely responsible for determining the 152 | appropriateness of using or redistributing the Work and assume any 153 | risks associated with Your exercise of permissions under this License. 154 | 155 | 8. Limitation of Liability. In no event and under no legal theory, 156 | whether in tort (including negligence), contract, or otherwise, 157 | unless required by applicable law (such as deliberate and grossly 158 | negligent acts) or agreed to in writing, shall any Contributor be 159 | liable to You for damages, including any direct, indirect, special, 160 | incidental, or consequential damages of any character arising as a 161 | result of this License or out of the use or inability to use the 162 | Work (including but not limited to damages for loss of goodwill, 163 | work stoppage, computer failure or malfunction, or any and all 164 | other commercial damages or losses), even if such Contributor 165 | has been advised of the possibility of such damages. 166 | 167 | 9. Accepting Warranty or Additional Liability. While redistributing 168 | the Work or Derivative Works thereof, You may choose to offer, 169 | and charge a fee for, acceptance of support, warranty, indemnity, 170 | or other liability obligations and/or rights consistent with this 171 | License. However, in accepting such obligations, You may act only 172 | on Your own behalf and on Your sole responsibility, not on behalf 173 | of any other Contributor, and only if You agree to indemnify, 174 | defend, and hold each Contributor harmless for any liability 175 | incurred by, or claims asserted against, such Contributor by reason 176 | of your accepting any such warranty or additional liability. 177 | 178 | END OF TERMS AND CONDITIONS 179 | 180 | APPENDIX: How to apply the Apache License to your work. 181 | 182 | To apply the Apache License to your work, attach the following 183 | boilerplate notice, with the fields enclosed by brackets "[]" 184 | replaced with your own identifying information. (Don't include 185 | the brackets!) The text should be enclosed in the appropriate 186 | comment syntax for the file format. We also recommend that a 187 | file or class name and description of purpose be included on the 188 | same "printed page" as the copyright notice for easier 189 | identification within third-party archives. 190 | 191 | Copyright [yyyy] [name of copyright owner] 192 | 193 | Licensed under the Apache License, Version 2.0 (the "License"); 194 | you may not use this file except in compliance with the License. 195 | You may obtain a copy of the License at 196 | 197 | http://www.apache.org/licenses/LICENSE-2.0 198 | 199 | Unless required by applicable law or agreed to in writing, software 200 | distributed under the License is distributed on an "AS IS" BASIS, 201 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 202 | See the License for the specific language governing permissions and 203 | limitations under the License. 
204 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Copyright 2022 - The Yubi (CredAvenue) Team. All rights reserved. 2 | 3 | Apache License 4 | Version 2.0, January 2004 5 | http://www.apache.org/licenses/ 6 | 7 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 8 | 9 | 1. Definitions. 10 | 11 | "License" shall mean the terms and conditions for use, reproduction, 12 | and distribution as defined by Sections 1 through 9 of this document. 13 | 14 | "Licensor" shall mean the copyright owner or entity authorized by 15 | the copyright owner that is granting the License. 16 | 17 | "Legal Entity" shall mean the union of the acting entity and all 18 | other entities that control, are controlled by, or are under common 19 | control with that entity. For the purposes of this definition, 20 | "control" means (i) the power, direct or indirect, to cause the 21 | direction or management of such entity, whether by contract or 22 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 23 | outstanding shares, or (iii) beneficial ownership of such entity. 24 | 25 | "You" (or "Your") shall mean an individual or Legal Entity 26 | exercising permissions granted by this License. 27 | 28 | "Source" form shall mean the preferred form for making modifications, 29 | including but not limited to software source code, documentation 30 | source, and configuration files. 31 | 32 | "Object" form shall mean any form resulting from mechanical 33 | transformation or translation of a Source form, including but 34 | not limited to compiled object code, generated documentation, 35 | and conversions to other media types. 36 | 37 | "Work" shall mean the work of authorship, whether in Source or 38 | Object form, made available under the License, as indicated by a 39 | copyright notice that is included in or attached to the work 40 | (an example is provided in the Appendix below). 41 | 42 | "Derivative Works" shall mean any work, whether in Source or Object 43 | form, that is based on (or derived from) the Work and for which the 44 | editorial revisions, annotations, elaborations, or other modifications 45 | represent, as a whole, an original work of authorship. For the purposes 46 | of this License, Derivative Works shall not include works that remain 47 | separable from, or merely link (or bind by name) to the interfaces of, 48 | the Work and Derivative Works thereof. 49 | 50 | "Contribution" shall mean any work of authorship, including 51 | the original version of the Work and any modifications or additions 52 | to that Work or Derivative Works thereof, that is intentionally 53 | submitted to Licensor for inclusion in the Work by the copyright owner 54 | or by an individual or Legal Entity authorized to submit on behalf of 55 | the copyright owner. For the purposes of this definition, "submitted" 56 | means any form of electronic, verbal, or written communication sent 57 | to the Licensor or its representatives, including but not limited to 58 | communication on electronic mailing lists, source code control systems, 59 | and issue tracking systems that are managed by, or on behalf of, the 60 | Licensor for the purpose of discussing and improving the Work, but 61 | excluding communication that is conspicuously marked or otherwise 62 | designated in writing by the copyright owner as "Not a Contribution." 
63 | 64 | "Contributor" shall mean Licensor and any individual or Legal Entity 65 | on behalf of whom a Contribution has been received by Licensor and 66 | subsequently incorporated within the Work. 67 | 68 | 2. Grant of Copyright License. Subject to the terms and conditions of 69 | this License, each Contributor hereby grants to You a perpetual, 70 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 71 | copyright license to reproduce, prepare Derivative Works of, 72 | publicly display, publicly perform, sublicense, and distribute the 73 | Work and such Derivative Works in Source or Object form. 74 | 75 | 3. Grant of Patent License. Subject to the terms and conditions of 76 | this License, each Contributor hereby grants to You a perpetual, 77 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 78 | (except as stated in this section) patent license to make, have made, 79 | use, offer to sell, sell, import, and otherwise transfer the Work, 80 | where such license applies only to those patent claims licensable 81 | by such Contributor that are necessarily infringed by their 82 | Contribution(s) alone or by combination of their Contribution(s) 83 | with the Work to which such Contribution(s) was submitted. If You 84 | institute patent litigation against any entity (including a 85 | cross-claim or counterclaim in a lawsuit) alleging that the Work 86 | or a Contribution incorporated within the Work constitutes direct 87 | or contributory patent infringement, then any patent licenses 88 | granted to You under this License for that Work shall terminate 89 | as of the date such litigation is filed. 90 | 91 | 4. Redistribution. You may reproduce and distribute copies of the 92 | Work or Derivative Works thereof in any medium, with or without 93 | modifications, and in Source or Object form, provided that You 94 | meet the following conditions: 95 | 96 | (a) You must give any other recipients of the Work or 97 | Derivative Works a copy of this License; and 98 | 99 | (b) You must cause any modified files to carry prominent notices 100 | stating that You changed the files; and 101 | 102 | (c) You must retain, in the Source form of any Derivative Works 103 | that You distribute, all copyright, patent, trademark, and 104 | attribution notices from the Source form of the Work, 105 | excluding those notices that do not pertain to any part of 106 | the Derivative Works; and 107 | 108 | (d) If the Work includes a "NOTICE" text file as part of its 109 | distribution, then any Derivative Works that You distribute must 110 | include a readable copy of the attribution notices contained 111 | within such NOTICE file, excluding those notices that do not 112 | pertain to any part of the Derivative Works, in at least one 113 | of the following places: within a NOTICE text file distributed 114 | as part of the Derivative Works; within the Source form or 115 | documentation, if provided along with the Derivative Works; or, 116 | within a display generated by the Derivative Works, if and 117 | wherever such third-party notices normally appear. The contents 118 | of the NOTICE file are for informational purposes only and 119 | do not modify the License. You may add Your own attribution 120 | notices within Derivative Works that You distribute, alongside 121 | or as an addendum to the NOTICE text from the Work, provided 122 | that such additional attribution notices cannot be construed 123 | as modifying the License. 
124 | 125 | You may add Your own copyright statement to Your modifications and 126 | may provide additional or different license terms and conditions 127 | for use, reproduction, or distribution of Your modifications, or 128 | for any such Derivative Works as a whole, provided Your use, 129 | reproduction, and distribution of the Work otherwise complies with 130 | the conditions stated in this License. 131 | 132 | 5. Submission of Contributions. Unless You explicitly state otherwise, 133 | any Contribution intentionally submitted for inclusion in the Work 134 | by You to the Licensor shall be under the terms and conditions of 135 | this License, without any additional terms or conditions. 136 | Notwithstanding the above, nothing herein shall supersede or modify 137 | the terms of any separate license agreement you may have executed 138 | with Licensor regarding such Contributions. 139 | 140 | 6. Trademarks. This License does not grant permission to use the trade 141 | names, trademarks, service marks, or product names of the Licensor, 142 | except as required for reasonable and customary use in describing the 143 | origin of the Work and reproducing the content of the NOTICE file. 144 | 145 | 7. Disclaimer of Warranty. Unless required by applicable law or 146 | agreed to in writing, Licensor provides the Work (and each 147 | Contributor provides its Contributions) on an "AS IS" BASIS, 148 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 149 | implied, including, without limitation, any warranties or conditions 150 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 151 | PARTICULAR PURPOSE. You are solely responsible for determining the 152 | appropriateness of using or redistributing the Work and assume any 153 | risks associated with Your exercise of permissions under this License. 154 | 155 | 8. Limitation of Liability. In no event and under no legal theory, 156 | whether in tort (including negligence), contract, or otherwise, 157 | unless required by applicable law (such as deliberate and grossly 158 | negligent acts) or agreed to in writing, shall any Contributor be 159 | liable to You for damages, including any direct, indirect, special, 160 | incidental, or consequential damages of any character arising as a 161 | result of this License or out of the use or inability to use the 162 | Work (including but not limited to damages for loss of goodwill, 163 | work stoppage, computer failure or malfunction, or any and all 164 | other commercial damages or losses), even if such Contributor 165 | has been advised of the possibility of such damages. 166 | 167 | 9. Accepting Warranty or Additional Liability. While redistributing 168 | the Work or Derivative Works thereof, You may choose to offer, 169 | and charge a fee for, acceptance of support, warranty, indemnity, 170 | or other liability obligations and/or rights consistent with this 171 | License. However, in accepting such obligations, You may act only 172 | on Your own behalf and on Your sole responsibility, not on behalf 173 | of any other Contributor, and only if You agree to indemnify, 174 | defend, and hold each Contributor harmless for any liability 175 | incurred by, or claims asserted against, such Contributor by reason 176 | of your accepting any such warranty or additional liability. 177 | 178 | END OF TERMS AND CONDITIONS 179 | 180 | APPENDIX: How to apply the Apache License to your work. 
181 | 182 | To apply the Apache License to your work, attach the following 183 | boilerplate notice, with the fields enclosed by brackets "[]" 184 | replaced with your own identifying information. (Don't include 185 | the brackets!) The text should be enclosed in the appropriate 186 | comment syntax for the file format. We also recommend that a 187 | file or class name and description of purpose be included on the 188 | same "printed page" as the copyright notice for easier 189 | identification within third-party archives. 190 | 191 | Copyright [yyyy] [name of copyright owner] 192 | 193 | Licensed under the Apache License, Version 2.0 (the "License"); 194 | you may not use this file except in compliance with the License. 195 | You may obtain a copy of the License at 196 | 197 | http://www.apache.org/licenses/LICENSE-2.0 198 | 199 | Unless required by applicable law or agreed to in writing, software 200 | distributed under the License is distributed on an "AS IS" BASIS, 201 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 202 | See the License for the specific language governing permissions and 203 | limitations under the License. 204 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # YubiAI 2 | 3 | State-of-the-art models in AI,ML,NLP & Vision for FinTech community by Yubi's Data Science Team. 4 |
5 |
6 | 7 |
8 | ## What's new in NLP
9 | 

10 | 
11 | * Oct 2022
12 |     * [YubiTokenizer trained on FinTech multilingual data](./yubiai/nlp/tokenizer/)
13 |     * [YubiBERT Micro Encoder4](./yubiai/nlp/yubiEmbeddings/)
14 | * Nov 2022
15 |     * [YubiBERT Small Encoder8](./yubiai/nlp/yubiEmbeddings/)
16 |     * [HuggingFace Supported YubiTokenizers](./yubiai/nlp/tokenizer/)
17 | * Dec 2022
18 |     * [YuLan V1 - Yubi's Text Language Detection](./yubiai/nlp/language_detection/)
19 |     * [TrueCaser Model v1](./yubiai/nlp/seq2seq/)
20 |     * [Character-2-Text generation Model v1](./yubiai/nlp/seq2seq/)
21 | * Feb 2023
22 |     * [YuLan V2 - Yubi's Text Language Detection](./yubiai/nlp/language_detection/)
23 | * Jun 2023
24 |     * [NSFW Text Detection](./yubiai/nlp/nsfw_text/)
25 | 
26 | 
27 | 

28 |
29 |
30 | 31 |
## What's new in Vision

32 | 
33 | * January 2023
34 |     * [Image Augmentations (Random rotate & croppings)](./yubiai/vision/utility/)
35 |     * [Document Skew Detection v1](./yubiai/vision/skew_detection/)
36 | * March 2023
37 |     * [Fintech Document Segmentation model v1](./yubiai/vision/document_segmentation/)
38 | * April 2023
39 |     * [Fintech Document Segmentation model v2](./yubiai/vision/document_segmentation/)
40 | * Jun 2023
41 |     * [Document Image Detection](./yubiai/vision/document_image_detection/)
42 |     * [NSFW Image Detection](./yubiai/vision/document_image_detection/)
43 | 

44 |
45 |
46 | 
47 | ## Download Model Locations
48 | * You can download models directly from this [google-drive shared folder](https://drive.google.com/drive/folders/1JteTr9GWezVIcRJd8TJ5uOnPVRznjS7o)
49 | 
50 | ## Environment
51 | * Model files are hosted on a shared Google Drive folder and are downloaded automatically when a model's constructor is called
52 | * Currently tested on `Ubuntu 20.04`, `MacOS 12.3` and `python >=3.7`
53 | 
54 | ## How to install package
55 | * Clone the git repository or download zip/tar files and unzip
56 | ```
57 | cd /parent/directory/path/of/YubiAI/
58 | ```
59 | * Install only NLP dependencies
60 | ```
61 | pip install ".[nlp]"
62 | ```
63 | * Install only Vision dependencies
64 | ```
65 | pip install ".[cv]"
66 | ```
67 | * Install both NLP & Vision dependencies
68 | ```
69 | pip install ".[nlp,cv]"
70 | ```
71 | 
72 | ## How to use package
73 | * Clone the git repository or download zip/tar files and unzip
74 | * You need to append the repo path using `sys`
75 | * The remaining imports & code remain the same.
76 | 
77 | ```python
78 | 
79 | import sys
80 | sys.path.append("/parent/directory/path/of/yubiai/")
81 | 
82 | from yubiai.nlp.tokenizer.yubiTokenizer import YubiTokenizer
83 | tokenizer = YubiTokenizer()
84 | 
85 | ```
86 | 
87 | 
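If you want to fetch model weights ahead of time (for example, on a machine where the first constructor call should not trigger a download), the snippet below is a minimal sketch that mirrors the download-and-unzip logic in `yubiai/__init__.py`; it is not a separate official API. The model name used here is just one example key from its `model_list`, and the extracted folder lands in the same `~/.cache/yubiai/models` cache that the model constructors read from.

```python
### Minimal pre-download sketch mirroring yubiai/__init__.py (not a separate official API).
### "yulan-e4-v2" below is only an example key from yubiai.model_list.
import pathlib, zipfile
import gdown
from yubiai import BASE_PATH, model_list

model_name = "yulan-e4-v2"
models_dir = pathlib.Path(BASE_PATH) / "models"
models_dir.mkdir(parents=True, exist_ok=True)

### Download the zip from the shared Google Drive folder, same call as in download_model_zip()
zip_path = models_dir / model_list[model_name]["filename"]
gdown.download(model_list[model_name]["url"], str(zip_path), quiet=False, fuzzy=True)

### Unzip next to the archive so the model constructors find the extracted folder in the cache
with zipfile.ZipFile(zip_path) as zf:
    zf.extractall(models_dir)
zip_path.unlink()
```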
88 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["flit_core >=3.2,<4"] 3 | build-backend = "flit_core.buildapi" 4 | 5 | [project] 6 | name = "yubiai" 7 | description = "State-of-the-art models in AI,ML,NLP & Vision for FinTech community by Yubi Data Science Team" 8 | authors = [] 9 | readme = "README.md" 10 | license = {file = "LICENSE"} 11 | classifiers = ["License :: OSI Approved :: Apache Software License"] 12 | dynamic = ["version"] 13 | requires-python = ">=3.7" 14 | dependencies = [ 15 | "torch==1.13.1", 16 | "gdown" 17 | ] 18 | 19 | [project.optional-dependencies] 20 | nlp = [ 21 | "sentencepiece>=0.1.97", 22 | "fairseq==0.12.2", 23 | "tokenizers", 24 | "transformers", 25 | "sentence-transformers==2.2.2" 26 | ] 27 | 28 | cv = [ 29 | "tensorflow==2.11.0", 30 | "keras==2.11.0", 31 | "detecto==1.2.2", 32 | "opencv-python==4.7.0.68" 33 | ] 34 | 35 | [tool.distutils.bdist_wheel] 36 | universal = true 37 | 38 | [project.urls] 39 | Home = "https://github.com/Yubi2Community/YubiAI" 40 | 41 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | sentencepiece>=0.1.97 2 | fairseq==0.12.2 3 | tokenizers 4 | transformers 5 | tensorflow==2.11.0 6 | keras==2.11.0 7 | detecto==1.2.2 8 | gdown 9 | -------------------------------------------------------------------------------- /yubiai/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | ### Model Main Version 3 | __version__ = "04.23" 4 | 5 | 6 | ### Cernunnos base path constant 7 | import pathlib 8 | #BASE_PATH = pathlib.Path(__file__).parent.absolute() 9 | BASE_PATH = pathlib.Path.home().joinpath(".cache", "yubiai").absolute() 10 | 11 | 12 | ### Hosted Model List 13 | model_list = { 14 | "yulan-e8-v2" : 15 | {"url":"https://drive.google.com/file/d/1XFJTHM8MWag4QbHJ0p5vSnAGZq4iWmS8/view?usp=share_link", 16 | "filename":"yulan-e8-v2.zip"}, 17 | "yulan-e4-v2" : 18 | {"url":"https://drive.google.com/file/d/17EEAdasvj7jIKRMMaq4YxCaetgvsVuON/view?usp=share_link", 19 | "filename":"yulan-e4-v2.zip"}, 20 | "yulan-e4-v1" : 21 | {"url":"https://drive.google.com/file/d/1Kjn6-XOxHw9mouenbtXSp7QU95D_G-eC/view?usp=share_link", 22 | "filename":"yulan-e4-v1.zip"}, 23 | "yubibert_e8_small" : 24 | {"url":"https://drive.google.com/file/d/1HlBQDxplP2sJU-zHQfDnM1PFuMFXit9I/view?usp=share_link", 25 | "filename":"yubibert_e8_small.zip"}, 26 | "yubibert_e4_micro" : 27 | {"url":"https://drive.google.com/file/d/1LxpI4AerxHMV5xxA4Wcs19XwHX-7ohxA/view?usp=share_link", 28 | "filename":"yubibert_e4_micro.zip"}, 29 | "yubi_fintech_bpe_text_tokenizer" : 30 | {"url":"https://drive.google.com/file/d/1-hBwyJzY3tx1HzqrSfo1N17sGyrC07Li/view?usp=share_link", 31 | "filename":"yubi_fintech_bpe_text_tokenizer.zip"}, 32 | "yubi_fintech_bpe_text_tokenizer_huggingface" : 33 | {"url":"https://drive.google.com/file/d/1UPBjD_Csj-Mqhb6ujp0VisEADyCYxsWl/view?usp=share_link", 34 | "filename":"yubi_fintech_bpe_text_tokenizer_huggingface.zip"}, 35 | "yubi_document_segmentation_v1" : 36 | {"url":"https://drive.google.com/file/d/1KFK_HdXqzVaR8B4ADO26x396c606mlPP/view?usp=share_link", 37 | "filename":"yubi_document_segmentation_v1.zip"}, 38 | "yubi_document_segmentation_aug_v1" : 39 | 
{"url":"https://drive.google.com/file/d/1Z3V8nzUau3K1HozI7fYP19RxQkTdnq4f/view?usp=share_link", 40 | "filename":"yubi_document_segmentation_aug_v1.zip"}, 41 | "yubi_document_segmentation_v2" : 42 | {"url":"https://drive.google.com/file/d/1dDt_w30P09V9VTeme1ZxKxtjg2aM4kdx/view?usp=share_link", 43 | "filename":"yubi_document_segmentation_v2.zip"}, 44 | "TrueCaser_transformer_wmt_en_de_big_t2t" : 45 | {"url":"https://drive.google.com/file/d/1VNXrYw7eE4R5hz-rhV0ZACR_e7jOMuJL/view?usp=share_link", 46 | "filename":"TrueCaser_transformer_wmt_en_de_big_t2t.zip"}, 47 | "SkewDetection_ResNet101V2_45-135" : 48 | {"url":"https://drive.google.com/file/d/1sVoOsDiQSbkB9E-_FhFbcvNJficGQsT-/view?usp=share_link", 49 | "filename":"SkewDetection_ResNet101V2_45-135.zip"}, 50 | "SkewDetection_ResNet101V2_0-90" : 51 | {"url":"https://drive.google.com/file/d/1JMOU4GdP3PpubeO_oMlxNpLA811lo5Dr/view?usp=share_link", 52 | "filename":"SkewDetection_ResNet101V2_0-90.zip"}, 53 | "Quad4Detection_ResNet101V2_45-135" : 54 | {"url":"https://drive.google.com/file/d/1LvzalJAKwQ8RA0OAg343D6rBtIqy3zin/view?usp=share_link", 55 | "filename":"Quad4Detection_ResNet101V2_45-135.zip"}, 56 | "Quad4Detection_ResNet101V2_0-90" : 57 | {"url":"https://drive.google.com/file/d/1jmkB4tk8D5i2-LF0vuaIfcR-Pa7quX2t/view?usp=share_link", 58 | "filename":"Quad4Detection_ResNet101V2_0-90.zip"}, 59 | "character2text_transformer_wmt_en_de_big_t2t" : 60 | {"url":"https://drive.google.com/file/d/1r3VqG4fzoq6fAGHrwCXQEpFkRJxx8RZw/view?usp=share_link", 61 | "filename":"character2text_transformer_wmt_en_de_big_t2t.zip"}, 62 | "nsfw_detection_ResNet101V2": 63 | {"url":"https://drive.google.com/file/d/1dmJ2hl0wSNaEwUTmA4h2ROFCh_L_eWpA/view?usp=sharing", 64 | "filename":"nsfw_detection_ResNet101V2.zip"}, 65 | "nsfw_detection_Xception_block_12-14": 66 | {"url":"https://drive.google.com/file/d/1l7-zuQGmGLVRqiG8E1O0iepr1tXYO3iL/view?usp=sharing", 67 | "filename":"nsfw_detection_Xception_block_12-14.zip"}, 68 | "nsfw_detection_Xception_block_13-14": 69 | {"url":"https://drive.google.com/file/d/1KYmvD1KncwHXZbSMh5vHexRCNJlSPyyi/view?usp=sharing", 70 | "filename":"nsfw_detection_Xception_block_13-14.zip"}, 71 | "nsfw_detection_vit-b16_layer16": 72 | {"url":"https://drive.google.com/file/d/1cIHe1FPEbxetJgRfKjxjW4EVK9sw0KKO/view?usp=sharing", 73 | "filename":"nsfw_detection_vit-b16_layer16.zip"}, 74 | "doc-vs-nondoc_Xception_block_12-14": 75 | {"url":"https://drive.google.com/file/d/1VHpcOXyZSJfM_QP2iZYtXJBm0L_HI5Q0/view?usp=sharing", 76 | "filename":"doc-vs-nondoc_Xception_block_12-14.zip"}, 77 | "doc-vs-nondoc_vit-b16_layer15": 78 | {"url":"https://drive.google.com/file/d/1I7MaLGh6Qc3pkRlLUSIrAw5qOD4Jckbz/view?usp=sharing", 79 | "filename":"doc-vs-nondoc_vit-b16_layer15.zip"}, 80 | "text_nsfw_detection": 81 | {"url":"https://drive.google.com/file/d/1As4ZojVc3ZUMHiHCMckl80VMl1rk2jxz/view?usp=sharing", 82 | "filename":"text_nsfw_detection.zip"}, 83 | } 84 | 85 | 86 | ### Download from Public Shared Yubiai google drive folder 87 | import gdown 88 | import os 89 | 90 | def set_model_info(model_name): 91 | """ 92 | Generic model zip and folder naming along with their paths 93 | """ 94 | model_folder_name = model_name 95 | model_folder_path = "%s/models/%s" % (BASE_PATH, model_folder_name) 96 | model_zip_path = "%s/models/" % BASE_PATH 97 | model_zip_name = "%s.zip" % model_folder_name 98 | return model_folder_path, model_folder_name, model_zip_path, model_zip_name 99 | 100 | 101 | def download_model_zip(model_name): 102 | os.makedirs("%s/models" % (BASE_PATH), 
exist_ok=True)
103 |     if model_name in model_list:
104 |         model_url = model_list[model_name]['url']
105 |         model_file_name = model_list[model_name]['filename']
106 |         model_output_path = "%s/models/%s" % (BASE_PATH, model_file_name)
107 |         gdown.download(model_url, model_output_path, quiet=False, fuzzy=True)
108 |     else:
109 |         print("Please check and correct 'model_name' parameter!")
110 |     return
111 | 
112 | def verify_model_path(model_folder_path, model_folder_name, model_zip_path, model_zip_name):
113 |     """
114 |     Verify if the model folder exists at the default path.
115 |     If not, download it from the default hosted location.
116 |     """
117 |     if os.path.exists(model_folder_path):
118 |         print("Model path exists !!")
119 |     elif os.path.exists(f"{model_zip_path}/{model_zip_name}"):
120 |         print("Model path exists (ZIP format) !!")
121 |         os.system("cd %s; unzip %s; rm -f %s; cd -;" % (model_zip_path, model_zip_name, model_zip_name))
122 |     else:
123 |         print("Model path does not exist !!")
124 |         download_model_zip(model_folder_name)
125 |         os.system("cd %s; unzip %s; rm -f %s; cd -;" % (model_zip_path, model_zip_name, model_zip_name))
126 | 
--------------------------------------------------------------------------------
/yubiai/nlp/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Yubi2Community/YubiAI/2f3ea321b8cd525ee9f0c6fff2fa3495b4e216c8/yubiai/nlp/__init__.py
--------------------------------------------------------------------------------
/yubiai/nlp/language_detection/README.md:
--------------------------------------------------------------------------------
1 | # Language Detection
2 | 
3 | ## YuLan - Yubi Language Detection Model
4 | 
5 | * Uses the [YubiTokenizer](../tokenizer/) Byte-Pair-Encoding tokenizer trained on fintech data.
6 | * Supports the 14 most used Indian languages and transliterated versions of these languages.
7 | * Data consists of: news & transliterated text.
8 | * Data size of >12 GB.
9 | * Inference speed of ~10-50 microseconds.
10 | * Where can it be useful?
11 |     * To detect document language.
12 |     * To detect chatbot conversation language.
13 |     * To detect speech-bot conversation language after speech-2-text.
14 |     * To differentiate between native-script text and roman-transliterated text.
15 | 
16 | ## How to import and use YuLan model
17 | * The **LanguageDetection** class takes two arguments, **task_name** and **use_gpu**.
18 |     * **task_name**: yulan model name (ex: yulan-e4-v1 or yulan-e4-v2 or yulan-e8-v2).
19 |     * **use_gpu**: whether to use GPU or not.
20 | * The **detect_language** method takes two arguments, **input_text** and **top_k**.
21 |     * **input_text**: input sentence.
22 |     * **top_k**: returns the top k languages.
23 | * The **detect_language_batch** method takes two arguments, **input_text_list** and **top_k**.
24 |     * **input_text_list**: input list of sentences.
25 |     * **top_k**: returns the top k languages.
26 | 
27 | ```python
28 | 
29 | from yubiai.nlp.language_detection.yubiLanguageDetection import LanguageDetection
30 | 
31 | model = LanguageDetection(task_name="yulan-e4-v2", use_gpu=False)
32 | 
33 | ### Language detection on English text
34 | text = "CredAvenue is building India’s first and largest de-facto operating system for the discovery, investment, fulfilment, and collection of any debt solution."
35 | print(model.detect_language(input_text=text, top_k=5)) 36 | ### Output : 37 | # {'input_text': 'credavenue is building india’s first and largest de-facto operating system for the discovery, investment, fulfilment, and collection of any debt solution.', 38 | # 'language': 'English', 39 | # 'lang_code': 'en', 40 | # 'confidence': 0.9990460276603699, 41 | # 'language_rank': [{'language': 'English', 42 | # 'lang_code': 'en', 43 | # 'confidence': 0.9990460276603699}, 44 | # {'language': 'Gujarati', 45 | # 'lang_code': 'gu', 46 | # 'confidence': 0.00011310971603961661}, 47 | # {'language': 'Transliterated Assamese', 48 | # 'lang_code': 'tr_as', 49 | # 'confidence': 9.215490717906505e-05}, 50 | # {'language': 'Transliterated Telugu', 51 | # 'lang_code': 'tr_te', 52 | # 'confidence': 8.09595367172733e-05}, 53 | # {'language': 'Transliterated Hindi', 54 | # 'lang_code': 'tr_hi', 55 | # 'confidence': 7.598803495056927e-05}]} 56 | 57 | ### Batch Pridiction 58 | batch_text = [ 59 | "A paragraph is a collection of words strung together to make a longer unit than a sentence.", 60 | "पत्थरों के बीच छुपे मेंढक ने दिया आंखों को धोखा, चकमा देने वाली तस्वीर ने घुमा दिया दिमाग", 61 | "Meeru ela unnaru?", 62 | "Namaskaram, meeru lopaliki ravachu." 63 | ] 64 | print(model.detect_language_batch(input_text_list=batch_text, top_k=2)) 65 | ### Output : 66 | # [{'input_text': 'A paragraph is a collection of words strung together to make a longer unit than a sentence.', 67 | # 'language': 'English', 68 | # 'lang_code': 'en', 69 | # 'confidence': 0.9993940591812134, 70 | # 'language_rank': [{'language': 'English', 71 | # 'lang_code': 'en', 72 | # 'confidence': 0.9993940591812134}, 73 | # {'language': 'Hindi', 74 | # 'lang_code': 'hi', 75 | # 'confidence': 7.835876021999866e-05}]}, 76 | # {'input_text': 'पत्थरों के बीच छुपे मेंढक ने दिया आंखों को धोखा, चकमा देने वाली तस्वीर ने घुमा दिया दिमाग', 77 | # 'language': 'Hindi', 78 | # 'lang_code': 'hi', 79 | # 'confidence': 0.9999706745147705, 80 | # 'language_rank': [{'language': 'Hindi', 81 | # 'lang_code': 'hi', 82 | # 'confidence': 0.9999706745147705}, 83 | # {'language': 'Marathi', 84 | # 'lang_code': 'mr', 85 | # 'confidence': 1.7655293049756438e-05}]}, 86 | # {'input_text': 'Meeru ela unnaru?', 87 | # 'language': 'Transliterated Telugu', 88 | # 'lang_code': 'tr_te', 89 | # 'confidence': 0.9984096884727478, 90 | # 'language_rank': [{'language': 'Transliterated Telugu', 91 | # 'lang_code': 'tr_te', 92 | # 'confidence': 0.9984096884727478}, 93 | # {'language': 'Transliterated Tamil', 94 | # 'lang_code': 'tr_ta', 95 | # 'confidence': 0.0009650049032643437}]}, 96 | # {'input_text': 'Namaskaram, meeru lopaliki ravachu.', 97 | # 'language': 'Transliterated Telugu', 98 | # 'lang_code': 'tr_te', 99 | # 'confidence': 0.9992443323135376, 100 | # 'language_rank': [{'language': 'Transliterated Telugu', 101 | # 'lang_code': 'tr_te', 102 | # 'confidence': 0.9992443323135376}, 103 | # {'language': 'Transliterated Tamil', 104 | # 'lang_code': 'tr_ta', 105 | # 'confidence': 0.00036794054904021323}]}] 106 | ``` -------------------------------------------------------------------------------- /yubiai/nlp/language_detection/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yubi2Community/YubiAI/2f3ea321b8cd525ee9f0c6fff2fa3495b4e216c8/yubiai/nlp/language_detection/__init__.py -------------------------------------------------------------------------------- 
/yubiai/nlp/language_detection/yubiLanguageDetection.py: -------------------------------------------------------------------------------- 1 | ### 2 | ### Author : Mettu Venkata Ramireddy 3 | ### Created Date : 27 December 2022 4 | ### Credit: https://github.com/facebookresearch/fairseq/blob/main/examples/xlmr/README.md 5 | ### Model Information 6 | ### Language Detection Model Trained using News data of Indian News Papers. 7 | ### Support 14 most used Indian languages + 13 Transliterated Versions of Indian Languages. 8 | ### Data consists of : News 9 | ### Data size of : >12Gb 10 | ### 11 | 12 | 13 | from yubiai import set_model_info, verify_model_path 14 | import os, re, torch, json 15 | from fairseq.models.roberta import RobertaModel 16 | from fairseq.data.data_utils import collate_tokens 17 | from torch.nn.functional import softmax 18 | from yubiai.nlp.utility.file_handlers import load_json 19 | 20 | 21 | class LanguageDetection(): 22 | """ 23 | Generic class to call yubibert finetuned models 24 | Should be noted that .. this will only work when - 25 | 1. Added task_name.zip file on ftp server /models/yubi/ folder and you have classifier head name handy 26 | 2. Added s3 path in /models/yubi/ folder 27 | 3. keep following file names same 28 | a. checkpoint_best.pt 29 | b. bin_data folder 30 | i. label folder 31 | ii. input0 folder 32 | c. sentencepiece.bpe.model, sentencepiece_vocab 33 | """ 34 | def __init__(self, task_name="yulan-e4-v2", use_gpu=False): 35 | self.use_gpu = use_gpu 36 | 37 | self.model_folder_path, self.model_folder_name, self.model_zip_path, self.model_zip_name = set_model_info(task_name) 38 | verify_model_path(self.model_folder_path, self.model_folder_name, self.model_zip_path, self.model_zip_name) 39 | self.languages_supported_file_name = "languages_supported.json" 40 | self.languages_supported_file_path = "%s/%s" % (self.model_folder_path, self.languages_supported_file_name) 41 | 42 | self.model = self.load_model() 43 | self.label_fn = lambda label: self.model.task.label_dictionary.string([label + self.model.task.label_dictionary.nspecial]) 44 | self.languages_supported = load_json(self.languages_supported_file_path) 45 | 46 | def load_model(self): 47 | """ 48 | Load model from default path 49 | """ 50 | model = RobertaModel.from_pretrained(self.model_folder_path, 51 | checkpoint_file="checkpoint_best.pt", 52 | data_name_or_path="./bin_data", 53 | bpe="sentencepiece") 54 | model.eval() 55 | if self.use_gpu == True: 56 | model.cuda() 57 | return model 58 | 59 | def detect_language(self, input_text, top_k=5): 60 | """Detect Language function .. which returns classes with scores. 61 | Higher the score is better. 62 | input_text: str 63 | Sentence to predict the language. 64 | top_k: int 65 | Number(top) of predictions to return. 66 | """ 67 | 68 | assert 1 <= top_k <= len(self.languages_supported), f"Expected k value between 1 and {len(self.languages_supported)}." 
69 | input_text = re.sub("\s+", " ", input_text) 70 | input_text = input_text.lower().strip() 71 | tokens = self.model.encode(input_text) 72 | outprob = self.model.predict("language_detection", tokens[:510]) 73 | #sft = softmax(outprob, dim=1) 74 | pred_log = softmax(outprob, dim=1) 75 | top_k_pred_log = torch.topk(input=pred_log, k=top_k, dim=1) 76 | top_det = [] 77 | for lab_list, prob in list(zip(top_k_pred_log.indices.tolist(), top_k_pred_log.values.tolist())): 78 | pred_lab = list(map(self.label_fn, lab_list)) 79 | top_det.append(list(zip(pred_lab, prob))) 80 | pred_langs = [] 81 | for row in top_det: 82 | for lang, conf in row: 83 | pred_langs.append({"language": self.languages_supported[lang], "lang_code": lang, "confidence":conf}) 84 | pred_response = { 85 | "input_text": input_text, 86 | "language": pred_langs[0]['language'], 87 | "lang_code": pred_langs[0]['lang_code'], 88 | "confidence":pred_langs[0]['confidence'], 89 | "language_rank": pred_langs 90 | } 91 | return pred_response 92 | 93 | def detect_language_batch(self, input_text_list, top_k=5): 94 | """Detect Language function to predict language on batch .. which returns classes with scores. 95 | Higher the score is better. 96 | input_text_list: list 97 | List of sentences to detect languange. 98 | top_k: int 99 | Number(top) of predictions to return. 100 | """ 101 | 102 | assert 1 <= top_k <= len(self.languages_supported), f"Expected k value between 1 and {len(self.languages_supported)}." 103 | batch_data = [re.sub("\s+", " ", row).lower().strip() for row in input_text_list] 104 | batch_data = [self.model.encode(sen)[:510] for sen in batch_data] 105 | batch_data_col = collate_tokens(batch_data, pad_idx=1) 106 | outprob = self.model.predict("language_detection", batch_data_col) 107 | pred_log = softmax(outprob, dim=1) 108 | top_k_pred_log = torch.topk(input=pred_log, k=top_k, dim=1) 109 | top_det = [] 110 | for lab_list, prob in list(zip(top_k_pred_log.indices.tolist(), top_k_pred_log.values.tolist())): 111 | pred_lab = list(map(self.label_fn, lab_list)) 112 | top_det.append(list(zip(pred_lab, prob))) 113 | pred_langs = [] 114 | for row in top_det: 115 | sin_lang=[] 116 | for lang, conf in row: 117 | sin_lang.append({"language": self.languages_supported[lang], "lang_code": lang, "confidence":conf}) 118 | pred_langs.append(sin_lang) 119 | pred_response = [{"input_text":query, "language": lang_data[0]['language'], "lang_code": lang_data[0]['lang_code'], "confidence":lang_data[0]['confidence'], "language_rank": lang_data} for query, lang_data in zip(input_text_list, pred_langs)] 120 | return pred_response 121 | -------------------------------------------------------------------------------- /yubiai/nlp/nsfw_text/README.md: -------------------------------------------------------------------------------- 1 | # Text NSFW Detection 2 | 3 | 4 | * Using [YubiTokenzier](https://github.com/credavenue/yubi_ds_capability/tree/language_detection/cernunnos/nlp/tokenizer) Byte-Pair-Encoding tokenizer trained using fintech data. 5 | * Support 14 most used Indian languages and Transliterated versions of these languages. 6 | * Data consists of : public comments on social media platform 7 | * Data size of 8 | * > 10 million comments 9 | * Inference Speed of ~10-50 microseconds 10 | * Where it can be useful ? 11 | * When we want to detect NSFW text. 
12 | 13 | ## How to import and use NSFW model 14 | 15 | ```python 16 | 17 | from yubiai.nlp.nsfw_text.textNsfwDetection import NSFWDetection 18 | 19 | model = NSFWDetection() 20 | 21 | ###Class - sfw -> Safe for Work 22 | ###Class - nsfw -> Not Safe for Work 23 | 24 | ### English text tokenisation 25 | text = "good initiatative very nice." 26 | print(model.detect_NSFW(input_text=text)) 27 | ### OUTPUT : {'sfw': 1.0, 'nsfw': 0.0} 28 | text = 'ghar ke samne kutta bokata hain isko roti do chup ho jayega' 29 | print(model.detect_NSFW(input_text=text)) 30 | ### OUTPUT : {'sfw': 0.0, 'nsfw': 1.0} 31 | ``` -------------------------------------------------------------------------------- /yubiai/nlp/nsfw_text/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yubi2Community/YubiAI/2f3ea321b8cd525ee9f0c6fff2fa3495b4e216c8/yubiai/nlp/nsfw_text/__init__.py -------------------------------------------------------------------------------- /yubiai/nlp/nsfw_text/textNsfwDetection.py: -------------------------------------------------------------------------------- 1 | ### 2 | ### Author : Sanprit Nayan 3 | ### Created Date : 31 March 2023 4 | ### Model Information 5 | ### Text NSFW Detection Model Trained using pubilically avaliable comments on social media platforms. 6 | ### Support 14 most used Indian languages + 13 Transliterated Versions of Indian Languages. 7 | ### Data size : >1 cr comments 8 | 9 | from yubiai import set_model_info, verify_model_path 10 | import os, re, torch 11 | import numpy as np 12 | from fairseq.models.roberta import RobertaModel 13 | from fairseq.data.data_utils import collate_tokens 14 | from torch.nn.functional import softmax 15 | from yubiai.nlp.utility.file_handlers import load_json 16 | from collections import Counter 17 | 18 | class NSFWDetection(): 19 | """ 20 | Generic class to call yubibert finetuned models 21 | Should be noted that .. this will only work when - 22 | 1. Added task_name.zip file on ftp server /models/yubi/ folder and you have classifier head name handy 23 | 2. keep following file names same 24 | a. checkpoint_best.pt 25 | b. bin_data folder 26 | i. label folder 27 | ii. input0 folder 28 | c. sentencepiece.bpe.model, sentencepiece_vocab 29 | """ 30 | def __init__(self, task_name="text_nsfw_detection", use_gpu=False): 31 | 32 | self.use_gpu = use_gpu 33 | self.model_folder_path, self.model_folder_name, self.model_zip_path, self.model_zip_name = set_model_info(task_name) 34 | verify_model_path(self.model_folder_path, self.model_folder_name, self.model_zip_path, self.model_zip_name) 35 | self.model = self.load_model() 36 | self.label_fn = lambda label: self.model.task.label_dictionary.string([label + self.model.task.label_dictionary.nspecial]) 37 | 38 | def load_model(self): 39 | """ 40 | Load model from default path 41 | """ 42 | model = RobertaModel.from_pretrained(self.model_folder_path, checkpoint_file="checkpoint_best.pt", data_name_or_path="./bin_data", bpe="sentencepiece") 43 | model.eval() 44 | if self.use_gpu == True: 45 | model.cuda() 46 | return model 47 | 48 | def detect_NSFW(self,input_text): 49 | """ 50 | Generic classfication function .. which returns classes with scores. 51 | Higher the score is better. 
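Returns a dict of the form {'sfw': score, 'nsfw': score}; the two scores are
normalized to sum to 1, so the larger one indicates the predicted class.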
52 | """ 53 | input_text = re.sub("\s+", " ", input_text) 54 | input_text = input_text.lower().strip() 55 | tokens = self.model.encode(input_text) 56 | outprob = self.model.predict("comment_classifier", tokens[:510]).tolist()[0] 57 | min_prob = np.min(outprob) 58 | outprob_norm = [x-min_prob for x in outprob] 59 | sum_prob = np.sum(outprob_norm) 60 | outprob_norm = [x*1.0/sum_prob for x in outprob_norm] 61 | outclass = [self.label_fn(i) for i in range(len(outprob))] 62 | 63 | outmap_norm = dict(zip(outclass, outprob_norm)) 64 | outmap_norm = {k: v for k, v in sorted(outmap_norm.items(), key=lambda item: item[1], reverse=True)} 65 | 66 | outmap_og = dict(zip(outclass, outprob)) 67 | outmap_og = {k: v for k, v in sorted(outmap_og.items(), key=lambda item: item[1], reverse=True)} 68 | 69 | result = dict(Counter(outmap_norm).most_common(2)) 70 | 71 | total_score = np.sum(np.array(list(result.values()))) 72 | result = {k:v/total_score for k,v in result.items()} 73 | out_result ={} 74 | out_result['sfw'] = result['0'] 75 | out_result['nsfw'] = result['1'] 76 | return out_result 77 | -------------------------------------------------------------------------------- /yubiai/nlp/seq2seq/README.md: -------------------------------------------------------------------------------- 1 | # Fairseq Sequence-2-Sequence Models 2 | 3 | ## TrueCaser Model for English 4 | 5 | * Model to convert given lowercased text to a proper true-cased text 6 | * Trained for only English language 7 | * Used cleaned Wikipedia Dump of ~11Gb and converted into sentences using spacy. 8 | * Training data had .. lowercased wiki sentences as input and original sentences as output. 9 | * Had to create a new SentencePiece Tokenizer for this specific problem. It has 32k bpe tokens. 10 | * Trained on >90 million sentences, two epochs and stopped after perplexity of ~1.15. 11 | * Model used -> `transformer_wmt_en_de_big_t2t` (It has 6 encoders and 6 decoders) 12 | * Pre-requisite (test with following version) 13 | * `numpy>=1.23.4` (This is required as it was throwing 86 to 80 machine code error.) 14 | * `torch>=1.13.0` 15 | * `sentencepiece>=0.1.97` 16 | * `fairseq=0.12.2` 17 | 18 | ### How to run 19 | 20 | ```python 21 | 22 | from yubiai.nlp.seq2seq.util import Seq2SeqFairseqWrapper 23 | 24 | seq2seq_model = Seq2SeqFairseqWrapper(use_gpu=False, model_type="TrueCaser_transformer_wmt_en_de_big_t2t") 25 | 26 | text = "national testing agency has issued the indian institute of foreign trade (iift) – mba (international business) 2023 exam city intimation slip." 27 | seq2seq_model.get_translation(text, to_lower=True, to_char=False) 28 | ### Output => National Testing Agency has issued the Indian Institute of Foreign Trade (IIFT) – MBA (International Business) 2023 exam city intimation slip. 29 | 30 | ``` 31 | 32 | ### Benchmarks 33 | Benchmarked with 10k wikipedia sentences and 10k news sentences. Used current best [truecaser library](https://pypi.org/project/truecase/). This library uses [NLTK](https://www.nltk.org/index.html) internally and references this [SOTA paper](https://www.cs.cmu.edu/~llita/papers/lita.truecasing-acl2003.pdf). 34 |
Ideally, [Grammarly](https://app.grammarly.com/) would be the benchmark to beat, since its plugin appears to work better and its [paper](https://arxiv.org/pdf/1906.01733.pdf) reports stronger results, but we could not compare against it directly as no public API is available.
Following are BLEU score comparisons - 36 | 37 | | Wiki-dataset | BLEU score avg | 1-gram | 2-gram | 3-gram | 4-gram | 38 | | ------------- | ------------- | ------------- | ------------- | ------------- | ------------- | 39 | | truecase pypi | 82.77 | 92.2 | 85.5 | 79.8 | 74.6 | 40 | | yubiai-truecaser | 97.00 | 98.6 | 97.5 | 96.5 | 95.5 | 41 | 42 | | News-dataset | BLEU score avg | 1-gram | 2-gram | 3-gram | 4-gram | 43 | | ------------- | ------------- | ------------- | ------------- | ------------- | ------------- | 44 | | truecase pypi | 71.04 | 84.6 | 74.2 | 66.7 | 60.8 | 45 | | yubiai-truecaser | 89.40 | 94.4 | 90.5 | 87.7 | 85.2 | 46 | 47 |
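For reference, a corpus-level BLEU comparison of this kind can be computed along the following lines. This is only a minimal sketch using NLTK's BLEU implementation; the file names, whitespace tokenization, smoothing choice and per-n-gram weighting are illustrative assumptions, not the exact evaluation script used for the tables above.

```python
# Sketch: per-n-gram corpus BLEU between reference sentences and truecased model output.
# "references.txt" / "hypotheses.txt" are hypothetical files with one aligned sentence per line.
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

with open("references.txt") as f:
    references = [[line.strip().split()] for line in f]   # one reference per hypothesis
with open("hypotheses.txt") as f:
    hypotheses = [line.strip().split() for line in f]

smooth = SmoothingFunction().method1
scores = []
for n in range(1, 5):
    # isolate the n-gram precision component (weights must sum to 1)
    weights = tuple(1.0 if i == n - 1 else 0.0 for i in range(4))
    score = 100.0 * corpus_bleu(references, hypotheses, weights=weights, smoothing_function=smooth)
    scores.append(score)
    print(f"{n}-gram BLEU: {score:.1f}")
print(f"average over n-gram orders: {sum(scores) / len(scores):.2f}")
```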

48 | 49 | 50 | ## Character-2-Text Model for English+Regional Language 51 | 52 | * Model to convert given character stream to a proper true-cased text 53 | * Trained for English + top-13 Indian Regional languages 54 | * Used cleaned Wikipedia Dump of ~11Gb + Scrapped ~7Gb regional data. 55 | * Training data had .. lowercased character stream text as input and original sentences as output. 56 | * Had to create a new SentencePiece Tokenizer for this specific problem. It has 48k bpe tokens. 57 | * Trained on >100 million sentences, two epochs and stopped after perplexity of <1.2. 58 | * Model used -> `transformer_wmt_en_de_big_t2t` (It has 6 encoders and 6 decoders) 59 | * Pre-requisite (test with following version) 60 | * `numpy>=1.23.4` (This is required as it was throwing 86 to 80 machine code error.) 61 | * `torch>=1.13.0` 62 | * `sentencepiece>=0.1.97` 63 | * `fairseq=0.12.2` 64 | * Nothing similar to this model exists so could not benchmark it against any other model. 65 | 66 | ### How to run 67 | 68 | ```python 69 | 70 | from yubiai.nlp.seq2seq.util import Seq2SeqFairseqWrapper 71 | 72 | seq2seq_model = Seq2SeqFairseqWrapper(use_gpu=False, model_type="character2text_transformer_wmt_en_de_big_t2t") 73 | 74 | text = "y u b i i s b u i l d i n g i n d i a ’ s f i r s t a n d l a r g e s t d e - f a c t o o p e r a t i n g s y s t e m f o r t h e d i s c o v e r y , i n v e s t m e n t , f u l f i l m e n t , a n d c o l l e c t i o n o f a n y d e b t s o l u t i o n ." 75 | seq2seq_model.get_translation(text, to_lower=True, to_char=True) 76 | ### Output => Yubi is building India’s first and largest de-facto operating system for the discovery, investment, fulfilment, and collection of any debt solution. 77 | 78 | text = """क ो र ् ट न े घ ो ट ा ल े क े म ा म ल े म े ं ज ा ं च क र र ह ी य ू प ी प ु ल ि स क ी आ र ् थ ि क अ प र ा ध श ा ख ा , ई ड ी , ए स ए फ आ ई ओ क ी ज ा ं च प र अ ं स त ु ष ् ट ी ज त ा ई ।""" 79 | seq2seq_model.get_translation(text, to_lower=True, to_char=True) 80 | ### Output => कोर्ट ने घोटाले के कामले में जांच कररही यूपी पुलिस की आर्थिक अपराध शाखा,ईडी,एस एफ आई ओकी जांच पर अंसतुष्टी जताई। 81 | 82 | ``` -------------------------------------------------------------------------------- /yubiai/nlp/seq2seq/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yubi2Community/YubiAI/2f3ea321b8cd525ee9f0c6fff2fa3495b4e216c8/yubiai/nlp/seq2seq/__init__.py -------------------------------------------------------------------------------- /yubiai/nlp/seq2seq/util.py: -------------------------------------------------------------------------------- 1 | ### 2 | ### Author : Swapnil Ashok Jadhav (github:swapaj) 3 | ### Created Date : 16 Dec 2022 4 | ### 5 | 6 | 7 | from yubiai import set_model_info, verify_model_path 8 | from fairseq.models.transformer import TransformerModel 9 | import os, re 10 | 11 | 12 | class Seq2SeqFairseqWrapper: 13 | def __init__(self, use_gpu=False, model_type="TrueCaser_transformer_wmt_en_de_big_t2t"): 14 | self.use_gpu = use_gpu 15 | 16 | self.model_folder_path, self.model_folder_name, self.model_zip_path, self.model_zip_name = set_model_info(model_type) 17 | verify_model_path(self.model_folder_path, self.model_folder_name, self.model_zip_path, self.model_zip_name) 18 | self.model = self.load_model() 19 | 20 | def load_model(self): 21 | """ 22 | Load model from default path 23 | """ 24 | model = TransformerModel.from_pretrained(self.model_folder_path, 25 | 
checkpoint_file="checkpoint_best.pt", 26 | data_name_or_path="./bin_data", 27 | bpe="sentencepiece") 28 | model.eval() 29 | if self.use_gpu == True: 30 | model.cuda() 31 | return model 32 | 33 | def get_translation(self, text, to_lower=False, to_char=False): 34 | if to_lower == True: 35 | text = text.lower() 36 | if to_char == True: 37 | text = " ".join([ch for ch in text]).strip().lower() 38 | text = re.sub("\s+", " ", text) 39 | text = text.strip() 40 | translated_text = self.model.translate(text) 41 | return translated_text 42 | -------------------------------------------------------------------------------- /yubiai/nlp/tokenizer/README.md: -------------------------------------------------------------------------------- 1 | # YubiTokenizer 2 | 3 | * [SentencePiece](https://github.com/google/sentencepiece) Byte-Pair-Encoding tokenizer trained using fintech data. 4 | * Support 14 most used Indian languages : `English, Indian-English, Hindi, Assamese, Bengali, Gujarati, Kannada, Malayalam, Oriya, Punjabi, Tamil, Telugu, Urdu, Nepali, Marathi` 5 | * Data consists of : News, pdfs, reports, wiki, speech-2-text data & transliterated text 6 | * Data size of 7 | * ~50Gb (Reduced from ~220Gb of original data) 8 | * ~384 million lines 9 | * ~46 billion characters 10 | * Inference Speed of ~90-100 microseconds 11 | * Where it can be useful ? 12 | * When we need to tokenise and index any fintech related text/document 13 | * Replacement of [GPT-tokenizer and BERT-tokenizer](https://huggingface.co/docs/transformers/main_classes/tokenizer) ... as it is specifically trained on FinTech data. 14 | * To process indian languages text or speech 15 | 16 |
17 | 18 | ## How to load and use YubiTokenizer 19 | 20 | ```python 21 | 22 | import sys 23 | sys.path.append("/parent/directory/path/of/yubiai/") 24 | 25 | from yubiai.nlp.tokenizer.yubiTokenizer import YubiTokenizer 26 | tokenizer = YubiTokenizer() 27 | 28 | text = "CredAvenue is building India’s first and largest de-facto operating system for the discovery, investment, fulfilment, and collection of any debt solution." 29 | 30 | tokenizer.get_tokens(text) 31 | ### ['▁cred', 'aven', 'ue', '▁is', '▁building', '▁india', '’', 's', '▁first', '▁and', '▁largest', '▁de', '-', 'fact', 'o', '▁operating', '▁system', '▁for', '▁the', '▁discovery', ',', '▁investment', ',', '▁fulfilment', ',', '▁and', '▁collection', '▁of', '▁any', '▁debt', '▁solution', '.'] 32 | 33 | 34 | ``` 35 | 36 | ## How to use YubiTokenizer with HuggingFace 37 | 38 | We had to create HuggingFace supported version of our tokenizer using `sp2hf.py` as we realised SentencePiece has no direct support in training or fine-tuning models. 39 | We can either `tokenizers` version of a model OR `transformers` version of model when needed in training accordingly. 40 | 41 | ```python 42 | 43 | import sys 44 | sys.path.append("/parent/directory/path/of/yubiai/") 45 | 46 | from yubiai.nlp.tokenizer.yubiTokenizer import YubiTokenizerHF 47 | tokenizer = YubiTokenizerHF() 48 | 49 | text = "CredAvenue is building India’s first and largest de-facto operating system for the discovery, investment, fulfilment, and collection of any debt solution." 50 | 51 | tokenizer.get_tokens(text) 52 | ### ['▁cred', 'aven', 'ue', '▁is', '▁building', '▁india', '’', 's', '▁first', '▁and', '▁largest', '▁de', '-', 'fact', 'o', '▁operating', '▁system', '▁for', '▁the', '▁discovery', ',', '▁investment', ',', '▁fulfilment', ',', '▁and', '▁collection', '▁of', '▁any', '▁debt', '▁solution', '.'] 53 | 54 | tokenizer.get_tokens_transformer(text) 55 | ### ['▁cred', 'aven', 'ue', '▁is', '▁building', '▁india', '’', 's', '▁first', '▁and', '▁largest', '▁de', '-', 'fact', 'o', '▁operating', '▁system', '▁for', '▁the', '▁discovery', ',', '▁investment', ',', '▁fulfilment', ',', '▁and', '▁collection', '▁of', '▁any', '▁debt', '▁solution', '.'] 56 | 57 | type(tokenizer.model) 58 | ### 59 | 60 | type(tokenizer.transformer_model) 61 | ### 62 | 63 | ``` -------------------------------------------------------------------------------- /yubiai/nlp/tokenizer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yubi2Community/YubiAI/2f3ea321b8cd525ee9f0c6fff2fa3495b4e216c8/yubiai/nlp/tokenizer/__init__.py -------------------------------------------------------------------------------- /yubiai/nlp/tokenizer/sp2hf.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | from json import dump 3 | from logging import basicConfig, getLogger 4 | from os import linesep, remove 5 | from os.path import exists 6 | from tempfile import NamedTemporaryFile 7 | from typing import Dict, List, Tuple 8 | 9 | from requests import get 10 | from sentencepiece import SentencePieceProcessor 11 | from tqdm import trange, tqdm 12 | 13 | basicConfig() 14 | logger = getLogger() 15 | 16 | 17 | class SentencePieceExtractor: 18 | """ 19 | Extractor implementation for SentencePiece trained models. 
20 | https://github.com/google/sentencepiece 21 | """ 22 | 23 | def __init__(self, model: str): 24 | # Get SentencePiece 25 | self.sp = SentencePieceProcessor() 26 | self.sp.Load(model) 27 | 28 | def extract(self) -> Tuple[Dict[str, int], List[Tuple]]: 29 | sp = self.sp 30 | vocab = {sp.id_to_piece(index): index for index in trange(sp.GetPieceSize())} 31 | 32 | # Merges 33 | merges = [] 34 | for piece_l in tqdm(vocab.keys(), total=sp.GetPieceSize()): 35 | for piece_r in vocab.keys(): 36 | merge = f"{piece_l}{piece_r}" 37 | piece_id = vocab.get(merge, None) 38 | if piece_id: 39 | merges += [(piece_l, piece_r, piece_id)] 40 | merges = sorted(merges, key=lambda val: val[2]) 41 | merges = [(val[0], val[1]) for val in merges] 42 | 43 | return vocab, merges 44 | 45 | 46 | class YouTokenToMeExtractor: 47 | """ 48 | Extractor implementation for YouTokenToMe trained models format. 49 | Model are as follow: 50 | vocab_size nb_merges 51 | piece piece_id 52 | ...(repeated vocab_size) 53 | piece_id_left piece_id_right piece_id 54 | ...(repeated nb merges) 55 | """ 56 | 57 | def __init__(self, model: str): 58 | self._model = model 59 | 60 | def extract(self) -> Tuple[Dict[str, int], List[Tuple]]: 61 | with open(self._model, "r") as model_f: 62 | 63 | # Retrieve information 64 | nb_pieces, nb_merges = map(int, model_f.readline().split()) 65 | vocab, merges = {}, [] 66 | 67 | # Vocab 68 | for _ in trange(nb_pieces): 69 | piece, piece_id = map(int, model_f.readline().split()) 70 | vocab[piece_id] = chr(piece) 71 | 72 | # Merges 73 | for _ in trange(nb_merges): 74 | piece_id_l, piece_id_r, piece = map(int, model_f.readline().split()) 75 | piece_l, piece_r = vocab[piece_id_l], vocab[piece_id_r] 76 | vocab[piece] = f"{piece_l}{piece_r}" 77 | merges += [(piece_l, piece_r)] 78 | 79 | # Special tokens 80 | unk, pad, bos, eos = map(int, model_f.readline().split()) 81 | vocab[unk] = "" 82 | vocab[pad] = "" 83 | vocab[bos] = "" 84 | vocab[eos] = "" 85 | 86 | # Invert key and value for vocab 87 | vocab = dict(zip(vocab.values(), vocab.keys())) 88 | return vocab, merges 89 | 90 | 91 | if __name__ == "__main__": 92 | parser = ArgumentParser("SentencePiece vocab extractor") 93 | parser.add_argument( 94 | "--provider", 95 | type=str, 96 | required=True, 97 | choices=["sentencepiece", "youtokentome"], 98 | help="Indicate the format of the file.", 99 | ) 100 | parser.add_argument( 101 | "--model", type=str, required=True, help="SentencePiece model to extract vocab from." 
102 | ) 103 | parser.add_argument( 104 | "--vocab-output-path", 105 | type=str, 106 | required=True, 107 | help="Path where the vocab.json file will be extracted", 108 | ) 109 | parser.add_argument( 110 | "--merges-output-path", 111 | type=str, 112 | required=True, 113 | help="Path where the merges file will be extracted", 114 | ) 115 | 116 | # Parse cli arguments 117 | args = parser.parse_args() 118 | 119 | try: 120 | if args.model.startswith("http"): 121 | # Saving model 122 | with NamedTemporaryFile("wb", delete=False) as f: 123 | logger.info("Writing content from {} to {}".format(args.model, f.name)) 124 | response = get(args.model, allow_redirects=True) 125 | f.write(response.content) 126 | 127 | args.remote_model = args.model 128 | args.model = f.name 129 | 130 | # Allocate extractor 131 | extractor = ( 132 | SentencePieceExtractor if args.provider == "sentencepiece" else YouTokenToMeExtractor 133 | ) 134 | extractor = extractor(args.model) 135 | 136 | logger.info(f"Using {type(extractor).__name__}") 137 | 138 | # Open output files and let's extract model information 139 | with open(args.vocab_output_path, "w") as vocab_f: 140 | with open(args.merges_output_path, "w") as merges_f: 141 | # Do the extraction 142 | vocab, merges = extractor.extract() 143 | 144 | # Save content 145 | dump(vocab, vocab_f) 146 | merges_f.writelines(map(lambda x: f"{x[0]} {x[1]}{linesep}", merges)) 147 | finally: 148 | # If model was downloaded from internet we need to cleanup the tmp folder. 149 | if hasattr(args, "remote_model") and exists(args.model): 150 | remove(args.model) 151 | -------------------------------------------------------------------------------- /yubiai/nlp/tokenizer/yubiTokenizer.py: -------------------------------------------------------------------------------- 1 | ### 2 | ### Author : Swapnil Ashok Jadhav (github:swapaj) 3 | ### Created Date : 28th Oct. 2022 4 | ### 5 | ### Model Information 6 | ### SentencePiece Byte-Pair-Encoding tokenizer trained using fintech data. 7 | ### Support 14 most used Indian languages. 
8 | ### Languages : English, Indian-English, Hindi, Assamese, Bengali, Gujarati, Kannada, Malayalam 9 | ### Oriya, Punjabi, Tamil, Telugu, Urdu, Nepali, Marathi 10 | ### Data consists of : News, pdfs, reports, wiki, speech-2-text data 11 | ### Data size of : >50Gb (Reduced from ~220Gb of original data) 12 | ### : >384 million lines 13 | ### : >46 billion characters 14 | ### Inference Speed of ~90-100 microseconds 15 | ### 16 | 17 | 18 | from yubiai import set_model_info, verify_model_path, BASE_PATH 19 | import sentencepiece as spm 20 | import os 21 | from tokenizers.implementations import SentencePieceBPETokenizer 22 | from transformers import PreTrainedTokenizerFast 23 | 24 | 25 | class YubiTokenizer: 26 | def __init__(self): 27 | self.model_folder_name = "yubi_fintech_bpe_text_tokenizer" 28 | self.model_folder_path, self.model_folder_name, self.model_zip_path, self.model_zip_name = set_model_info(self.model_folder_name) 29 | verify_model_path(self.model_folder_path, self.model_folder_name, self.model_zip_path, self.model_zip_name) 30 | 31 | self.model_file_name = "fintech96k_model.txt" 32 | self.model_file_path = "%s/%s" % (self.model_folder_path,self.model_file_name) 33 | self.model = self.load_model() 34 | 35 | def load_model(self): 36 | """ 37 | Load model from default path 38 | """ 39 | sp = spm.SentencePieceProcessor() 40 | sp.load(self.model_file_path) 41 | return sp 42 | 43 | def get_tokens(self, text): 44 | """ 45 | Break the given text in list of tokens 46 | """ 47 | text = text.strip().lower() 48 | piece_list = self.model.encode_as_pieces(text) 49 | return piece_list 50 | 51 | class YubiTokenizerHF: 52 | def __init__(self): 53 | self.model_folder_name = "yubi_fintech_bpe_text_tokenizer_huggingface" 54 | self.model_folder_path, self.model_folder_name, self.model_zip_path, self.model_zip_name = set_model_info(self.model_folder_name) 55 | verify_model_path(self.model_folder_path, self.model_folder_name, self.model_zip_path, self.model_zip_name) 56 | 57 | self.model_file_path_vacab = "%s/sentencepiece-vocab.json" % self.model_folder_path 58 | self.model_file_path_merges = "%s/sentencepiece-merges.txt" % self.model_folder_path 59 | self.model = self.load_model() 60 | self.transformer_model = self.load_transformer_model() 61 | 62 | def load_model(self): 63 | """ 64 | Load model from default path 65 | """ 66 | sp = SentencePieceBPETokenizer.from_file(vocab_filename=self.model_file_path_vacab, merges_filename=self.model_file_path_merges) 67 | return sp 68 | 69 | def load_transformer_model(self): 70 | """ 71 | Load tokenizer models which can be used in transformer training module 72 | """ 73 | tmodel = PreTrainedTokenizerFast(tokenizer_object=self.model) 74 | return tmodel 75 | 76 | def get_tokens(self, text): 77 | """ 78 | Break the given text in list of tokens 79 | """ 80 | text = text.strip().lower() 81 | piece_list = self.model._tokenizer.encode(text).tokens 82 | return piece_list 83 | 84 | def get_tokens_transformer(self, text): 85 | """ 86 | Get list of tokens from transformer trainer tokenizer 87 | """ 88 | text = text.strip().lower() 89 | piece_list = self.transformer_model.tokenize(text) 90 | return piece_list 91 | -------------------------------------------------------------------------------- /yubiai/nlp/utility/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yubi2Community/YubiAI/2f3ea321b8cd525ee9f0c6fff2fa3495b4e216c8/yubiai/nlp/utility/__init__.py 
-------------------------------------------------------------------------------- /yubiai/nlp/utility/file_handlers.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | ### 4 | ### Author : Mettu Venkata Ramireddy (venkata-ramireddy-mettu) 5 | ### Created Date : 27-Dec-2022 6 | ### 7 | 8 | import json 9 | 10 | def load_json(file_path): 11 | with open(file=file_path, mode="r") as jsf: 12 | return json.load(jsf) -------------------------------------------------------------------------------- /yubiai/nlp/yubiEmbeddings/README.md: -------------------------------------------------------------------------------- 1 | # YubiBERT 2 | 3 | ## YubiBERT (yubibert_e4_micro) 4 | * In this version of pretrained RoBERTa model - 5 | * Only 4 encoder have been used for quick training and so that model can fit in 16gb of gpu ram with enough batch size 6 | * Other parameters are set accordingly like attention-size, attention-heads, learning-rate etc 7 | * 230 Gb of raw text data used 8 | * Our own [yubi sentence tokenization](../tokenizer/) is used during preprocess steps 9 | ## YubiBERT (yubibert_e8_small) 10 | * In this version of pretrained RoBERTa model - 11 | * 8 encoder have been used. We used `g5` instance with 4x gpu and each had 24gb vram. 12 | * 230 Gb of raw text data used 13 | * Our own [yubi sentence tokenization](../tokenizer/) is used during preprocess steps 14 | 15 |
16 | 17 | ## How to load and use YubiBERT-Micro-Encoder4 18 | 19 | You can define `model_type` parameter to load specific model. 20 | Available model types are : `yubibert_e4_micro` and `yubibert_e8_small` 21 | 22 | ```python 23 | 24 | import sys 25 | sys.path.append("/parent/directory/path/of/yubiai/") 26 | 27 | from yubiai.nlp.yubiEmbeddings.yubibert import YubiBERT 28 | 29 | ybert = YubiBERT(use_gpu=False, model_type='yubibert_e4_micro') 30 | 31 | input_text = "CredAvenue is India’s fastest Fintech unicorn." 32 | 33 | ### Explore getEmbeddings_last_n_layers() function for a different approach! 34 | print(ybert.getEmbeddings(input_text)) 35 | ### {'encoded_tokens': tensor([0,30371,9233,3026,18,61,33,15,5426,8605,41599,6,2]), 36 | ### 'decoded_tokens': ['', 'cred', 'aven', 'ue', 'is', 'india', '’', 's', 'fastest', 'fintech', 'unicorn', '.', ''], 37 | ### 'normalize': True, 38 | ### 'embedding': array([-7.04746041e-03, -1.40993064e-02, 9.68784280e-03, 2.14456767e-03, 39 | ### -3.47816362e-03, 1.25515764e-03, -2.40275939e-03, 3.57235223e-03, 40 | ### ... 41 | ### -1.16933743e-02, -5.76630747e-03, 9.58391698e-04, 1.34846689e-02], 42 | ### dtype=float32)} 43 | 44 | 45 | masked_text = "CredAvenue is ’s fastest Fintech unicorn." 46 | print(ybert.roberta_fill_in_the_blank_task(masked_text)) 47 | ### [('credavenue is india’s fastest fintech unicorn.', 0.73644, ' india'), 48 | ### ('credavenue is world’s fastest fintech unicorn.', 0.17053, ' world'), 49 | ### ('credavenue is america’s fastest fintech unicorn.', 0.03784, ' america'), 50 | ### ('credavenue is asia’s fastest fintech unicorn.', 0.01167, ' asia'), 51 | ### ('credavenue is australia’s fastest fintech unicorn.', 0.00319, ' australia'), 52 | ### ('credavenue is europe’s fastest fintech unicorn.', 0.00307, ' europe'), 53 | ### ('credavenue is china’s fastest fintech unicorn.', 0.00305, ' china'), 54 | ### ('credavenue is singapore’s fastest fintech unicorn.', 0.00193, ' singapore'), 55 | ### ('credavenue is canada’s fastest fintech unicorn.', 0.00161, ' canada'), 56 | ### ('credavenue is britain’s fastest fintech unicorn.', 0.00148, ' britain')] 57 | 58 | ``` 59 | 60 | ## How to finetune for classification and use example 61 | 62 | Please check these two files for more details [finetuning shell script](./finetune_yubibert_classification_example.sh) and [sample python code](./finetune_yubibert_classification_example.py) 63 | 64 | -------------------------------------------------------------------------------- /yubiai/nlp/yubiEmbeddings/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yubi2Community/YubiAI/2f3ea321b8cd525ee9f0c6fff2fa3495b4e216c8/yubiai/nlp/yubiEmbeddings/__init__.py -------------------------------------------------------------------------------- /yubiai/nlp/yubiEmbeddings/finetune_yubibert_classification_example.py: -------------------------------------------------------------------------------- 1 | ### 2 | ### Author : Swapnil Ashok Jadhav (github:swapaj) 3 | ### Created Date : 28th Oct. 
2022 4 | ### 5 | 6 | from fairseq.models.roberta import RobertaModel 7 | import numpy as np 8 | import re 9 | 10 | model = RobertaModel.from_pretrained("/path/to/dir/finetuned_model_files", 11 | checkpoint_file="checkpoint_best.pt", 12 | data_name_or_path="./bin_data", 13 | bpe="sentencepiece") 14 | model.eval() 15 | ### If gpu is available uncomment below line 16 | ### model.cuda() 17 | 18 | label_fn = lambda label: model.task.label_dictionary.string([label + model.task.label_dictionary.nspecial]) 19 | 20 | def get_results(input_text, clf_header_name="clf_head_name"): 21 | """ 22 | Generic classfication function .. which returns classes with scores. 23 | Higher the score is better. 24 | """ 25 | input_text = re.sub("\s+", " ", input_text) 26 | input_text = input_text.lower().strip() 27 | tokens = model.encode(input_text) 28 | outprob = model.predict(clf_header_name, tokens[:510]).tolist()[0] 29 | min_prob = np.min(outprob) 30 | outprob_norm = [x-min_prob for x in outprob] 31 | sum_prob = np.sum(outprob_norm) 32 | outprob_norm = [x*1.0/sum_prob for x in outprob_norm] 33 | outclass = [label_fn(i) for i in range(len(outprob))] 34 | 35 | outmap_norm = dict(zip(outclass, outprob_norm)) 36 | outmap_norm = {k: v for k, v in sorted(outmap_norm.items(), key=lambda item: item[1], reverse=True)} 37 | outmap_og = dict(zip(outclass, outprob)) 38 | outmap_og = {k: v for k, v in sorted(outmap_og.items(), key=lambda item: item[1], reverse=True)} 39 | 40 | return outmap_norm, outmap_og -------------------------------------------------------------------------------- /yubiai/nlp/yubiEmbeddings/finetune_yubibert_classification_example.sh: -------------------------------------------------------------------------------- 1 | ### 2 | ### Author : Swapnil Ashok Jadhav (github:swapaj) 3 | ### Created Date : 28th Oct. 2022 4 | ### 5 | 6 | #### Need at-least 1 gpu to fine-tune faster. 7 | #### Python >=3.7 and nvidia-cuda-drivers + pytorch >= 1.8 installed 8 | #### (using aws deep learning ami is better) 9 | #### Copy train.txt, train.label and valid.txt, valid.label to the same folder 10 | #### ".txt" file contains text per line .. lowercased, no new-line characters 11 | #### ".label" file contains label per line .. no whitespace 12 | 13 | ########################################## 14 | #### Install required python packages #### 15 | ########################################## 16 | pip install "sentencepiece>=0.1.97" "fairseq==0.12.2" 17 | 18 | ########################################################################################## 19 | #### Get files from yubibert model zip #### 20 | #### For encoder-8 model use "yubibert_e8_small.zip" #### 21 | #### For encoder-4 model use "yubibert_e4_micro.zip" #### 22 | #### Download models from here -> #### 23 | #### https://drive.google.com/drive/folders/1JteTr9GWezVIcRJd8TJ5uOnPVRznjS7o #### 24 | ########################################################################################## 25 | 26 | unzip yubibert_e4_micro.zip 27 | cp yubibert_e4_micro/sentencepiece* . 28 | cp yubibert_e4_micro/bin_data/dict.txt . 29 | cp yubibert_e4_micro/checkpoint_best.pt . 
30 | mv checkpoint_best.pt og_yubibert_e4_micro.pt 31 | rm -rf yubibert_e4_micro* 32 | 33 | ############################################## 34 | #### Make foldr for raw data and binaries #### 35 | ############################################## 36 | mkdir raw_data 37 | mkdir bin_data 38 | cp train.label raw_data/ 39 | cp valid.label raw_data/ 40 | 41 | ################################################## 42 | #### Install sentencepiece command line tools #### 43 | ################################################## 44 | git clone https://github.com/google/sentencepiece.git 45 | cd sentencepiece 46 | mkdir build 47 | cd build 48 | cmake .. 49 | make -j $(nproc) 50 | sudo make install 51 | sudo ldconfig -v 52 | cd ../.. 53 | 54 | ########################################################################## 55 | #### Convert input text into tokenised text using sentencepiece model #### 56 | ########################################################################## 57 | spm_encode --model=sentencepiece.bpe.model --output_format=piece < train.txt > raw_data/train.txt.bpe 58 | spm_encode --model=sentencepiece.bpe.model --output_format=piece < valid.txt > raw_data/valid.txt.bpe 59 | 60 | ########################################### 61 | #### Create fairseq preprocessed files #### 62 | ########################################### 63 | fairseq-preprocess --only-source --trainpref raw_data/train.label --validpref raw_data/valid.label --destdir bin_data/label/ --workers 4 64 | fairseq-preprocess --only-source --trainpref raw_data/train.txt.bpe --validpref raw_data/valid.txt.bpe --destdir bin_data/input0 --workers 4 --srcdict dict.txt 65 | 66 | ############################################################### 67 | #### Set env variables (change according to your settings) #### 68 | ############################################################### 69 | TOTAL_NUM_UPDATES=100000 70 | WARMUP_UPDATES=1000 71 | LR=1e-05 72 | HEAD_NAME="clf_head_name" ## Classification head name, give some uniq name. To be used in code later. 
73 | NUM_CLASSES=2 ## No of classes 74 | MAX_SENTENCES=8 75 | ROBERTA_PATH=og_yubibert_e4_micro.pt 76 | 77 | ######################################################################## 78 | #### Finetune yubibert_e4_micro (change according to your settings) #### 79 | ######################################################################## 80 | CUDA_VISIBLE_DEVICES=0 fairseq-train ./bin_data/ --restore-file $ROBERTA_PATH \ 81 | --max-positions 512 --max-sentences $MAX_SENTENCES --max-tokens 16384 \ 82 | --task sentence_prediction --reset-optimizer --reset-dataloader --reset-meters \ 83 | --required-batch-size-multiple 1 --init-token 0 --separator-token 2 \ 84 | --encoder-layers 4 --encoder-embed-dim 512 --encoder-ffn-embed-dim 2048 \ 85 | --encoder-attention-heads 16 --arch roberta_base --criterion sentence_prediction \ 86 | --classification-head-name $HEAD_NAME --num-classes $NUM_CLASSES --dropout 0.1 \ 87 | --attention-dropout 0.1 --weight-decay 0.1 --optimizer adam \ 88 | --adam-betas "(0.9, 0.98)" --adam-eps 1e-06 --clip-norm 0.0 --lr-scheduler \ 89 | polynomial_decay --lr $LR --total-num-update $TOTAL_NUM_UPDATES --warmup-updates \ 90 | $WARMUP_UPDATES --fp16 --fp16-init-scale 4 --threshold-loss-scale 1 \ 91 | --fp16-scale-window 128 --max-epoch 8 --best-checkpoint-metric accuracy \ 92 | --maximize-best-checkpoint-metric --find-unused-parameters --update-freq 2 \ 93 | --skip-invalid-size-inputs-valid-test 94 | 95 | ######################################################################## 96 | #### Finetune yubibert_e8_small (change according to your settings) #### 97 | ######################################################################## 98 | CUDA_VISIBLE_DEVICES=0 fairseq-train ./bin_data/ --restore-file $ROBERTA_PATH \ 99 | --max-positions 512 --max-sentences $MAX_SENTENCES --max-tokens 16384 \ 100 | --task sentence_prediction --reset-optimizer --reset-dataloader --reset-meters \ 101 | --required-batch-size-multiple 1 --init-token 0 --separator-token 2 \ 102 | --encoder-layers 8 --arch roberta_base --criterion sentence_prediction \ 103 | --classification-head-name $HEAD_NAME --num-classes $NUM_CLASSES --dropout 0.1 \ 104 | --attention-dropout 0.1 --weight-decay 0.1 --optimizer adam \ 105 | --adam-betas "(0.9, 0.98)" --adam-eps 1e-06 --clip-norm 0.0 --lr-scheduler \ 106 | polynomial_decay --lr $LR --total-num-update $TOTAL_NUM_UPDATES --warmup-updates \ 107 | $WARMUP_UPDATES --fp16 --fp16-init-scale 4 --threshold-loss-scale 1 \ 108 | --fp16-scale-window 128 --max-epoch 8 --best-checkpoint-metric accuracy \ 109 | --maximize-best-checkpoint-metric --find-unused-parameters --update-freq 2 \ 110 | --skip-invalid-size-inputs-valid-test 111 | 112 | ####################################################### 113 | #### Create final output folder to be used in code #### 114 | ####################################################### 115 | mkdir finetuned_model_files 116 | mv sentencepiece* finetuned_model_files/ 117 | mv checkpoints/checkpoint_best.pt finetuned_model_files/ 118 | mv bin_data finetuned_model_files/ 119 | rm -f finetuned_model_files/bin_data/*/train.* 120 | 121 | ################################################################################################################ 122 | ### Now you can pass folder path of "finetuned_model_files" to "finetune_yubibert_classification_example.py" ### 123 | ################################################################################################################ 124 | 
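###########################################################################
#### Illustrative only (hypothetical text/labels): quick inference check ####
#### with the companion file. Edit the hard-coded model path in          ####
#### finetune_yubibert_classification_example.py to point at             ####
#### "finetuned_model_files" first, then:                                ####
####   >>> from finetune_yubibert_classification_example import get_results
####   >>> get_results("some lowercased input text", clf_header_name="clf_head_name")
####   ### returns (normalized_score_map, raw_score_map), both sorted by score
###########################################################################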
-------------------------------------------------------------------------------- /yubiai/nlp/yubiEmbeddings/yubibert.py: -------------------------------------------------------------------------------- 1 | ### 2 | ### Author : Swapnil Ashok Jadhav (github:swapaj) 3 | ### Created Date : 28th Oct. 2022 4 | ### Credit for the training library: https://github.com/facebookresearch/fairseq/blob/main/examples/roberta/README.md 5 | ### 6 | 7 | 8 | from yubiai import set_model_info, verify_model_path 9 | import os 10 | from fairseq.models.roberta import RobertaModel 11 | import numpy as np 12 | import re 13 | import torch 14 | 15 | 16 | class YubiBERT: 17 | def __init__(self, use_gpu=False, model_type="yubibert_e4_micro"): 18 | self.use_gpu = use_gpu 19 | 20 | self.model_folder_path, self.model_folder_name, self.model_zip_path, self.model_zip_name = set_model_info(model_type) 21 | verify_model_path(self.model_folder_path, self.model_folder_name, self.model_zip_path, self.model_zip_name) 22 | self.model = self.load_model() 23 | 24 | def load_model(self): 25 | """ 26 | Load model from default path 27 | """ 28 | model = RobertaModel.from_pretrained(self.model_folder_path, checkpoint_file="checkpoint_best.pt", data_name_or_path="./bin_data", bpe="sentencepiece") 29 | model.eval() 30 | if self.use_gpu == True: 31 | model.cuda() 32 | return model 33 | 34 | def getEmbeddings(self, text, normalize=True): 35 | """ 36 | Returns vector of size 512 37 | If you set normalize to True .. it will return unit normalized vector else the the raw vector 38 | Normalize set to True is recommended 39 | """ 40 | token_outmap = self.getTokens(text) 41 | tokens = token_outmap['encoded_tokens'] 42 | if self.use_gpu == True: 43 | last_layer_features = self.model.extract_features(tokens[:511]).detach().cpu().numpy() 44 | else: 45 | last_layer_features = self.model.extract_features(tokens[:511]).detach().numpy() 46 | avg_feature = np.mean(last_layer_features[0], axis=0) 47 | if normalize is True: 48 | avg_feature = avg_feature / np.linalg.norm(avg_feature) 49 | outmap = {} 50 | outmap['encoded_tokens'] = tokens 51 | outmap['decoded_tokens'] = token_outmap['decoded_tokens'] 52 | outmap['normalize'] = normalize 53 | outmap['embedding'] = avg_feature 54 | return outmap 55 | 56 | def getEmbeddings_last_n_layers(self, text, last_n_layers=1): 57 | """ 58 | Returns vector of size 512*n for last n layers .. concatenated 59 | Here layers mean .. embedding layer after each transformer/encoder block 60 | In many SOTA Glue tasks Google has used last 4 layers concatenated. 
61 | """ 62 | token_outmap = self.getTokens(text) 63 | tokens = token_outmap['encoded_tokens'] 64 | all_layers = self.model.extract_features(tokens, return_all_hiddens=True) 65 | nlayer_vector_map = {} 66 | concatenated_vector = [] 67 | for i in range(0, last_n_layers*-1, -1): 68 | if self.use_gpu == True: 69 | avg_feature = np.mean(all_layers[i-1].detach().cpu().numpy()[0], axis=0) 70 | else: 71 | avg_feature = np.mean(all_layers[i-1].detach().numpy()[0], axis=0) 72 | nlayer_vector_map[i-1] = avg_feature 73 | concatenated_vector = concatenated_vector + list(avg_feature) 74 | outmap = {} 75 | outmap['encoded_tokens'] = tokens 76 | outmap['decoded_tokens'] = token_outmap['decoded_tokens'] 77 | outmap['nlayer_vector_map'] = nlayer_vector_map 78 | outmap['concatenated_vector'] = concatenated_vector 79 | return outmap 80 | 81 | def getTokens(self, text): 82 | """ 83 | Tokenize given text and also decode individually to return text token slices 84 | """ 85 | text = re.sub("\s+", " ", text.lower()).strip() 86 | etokens = self.model.encode(text) 87 | dtokens = [self.model.decode(torch.as_tensor(np.array([tkn]))) for tkn in etokens] 88 | outmap = {"encoded_tokens": etokens, "decoded_tokens": dtokens} 89 | return outmap 90 | 91 | def roberta_fill_in_the_blank_task(self, text, topk=10): 92 | """ 93 | Given a to fill in .. returns output array with possible replacable words and their scores. 94 | Please add only one entry of 95 | """ 96 | text = re.sub("\s+", " ", text.lower()).strip() 97 | if "" in text: 98 | return self.model.fill_mask(text, topk=topk) 99 | return [] 100 | 101 | -------------------------------------------------------------------------------- /yubiai/vision/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yubi2Community/YubiAI/2f3ea321b8cd525ee9f0c6fff2fa3495b4e216c8/yubiai/vision/__init__.py -------------------------------------------------------------------------------- /yubiai/vision/document_image_detection/README.md: -------------------------------------------------------------------------------- 1 | # Document Image Class Detection 2 | 3 | ## Document vs Non-Documents Classification 4 | * Model to detect if given image is document or not 5 | * We used ~30k document images and ~50k different kinds of images 6 | * Various models used and currently two models are provided 7 | * `doc-vs-nondoc_Xception_block_12-14` 8 | * We finetuned Xception model last 3 block 9 | * `doc-vs-nondoc_vit-b16_layer15` 10 | * We finetuned vit model last 2 transformer blocks 11 | * Performance of both models was good on validation set. 12 | 13 | | model | Validation Accuracy | 14 | | ----- | ------------------- | 15 | | Xception_12-14 | 99.8% | 16 | | VIT-b16_layer16 | 99.6% | 17 | 18 | 19 | ## NSFW Image Classification 20 | * Model to detect if given image is document or Safe vs Non-Safe image 21 | * We used >1million images totally with classes `'anime', 'docimage', 'explicit', 'gore_violence', 'non-explicit', 'sexy'` 22 | * Various models used and currently two models are provided 23 | * `nsfw_detection_ResNet101V2` 24 | * We finetuned with all layers open for training 25 | * `nsfw_detection_Xception_block_12-14` 26 | * We finetuned Xception model last 3 block 27 | * `nsfw_detection_Xception_block_13-14` 28 | * We finetuned Xception model last 2 block 29 | * `nsfw_detection_vit-b16_layer16` 30 | * We finetuned vit model last 2 transformer blocks 31 | * Performance of all models captured on validation set. 
32 | 33 | | model | Validation Accuracy | 34 | | ----- | ------------------- | 35 | | ResNet101V2 | 79% | 36 | | Xception_block_12-14 | 85% | 37 | | Xception_block_13-14 | 86% | 38 | | VIT-b16_layer16 | 83% | 39 | 40 | 41 | ## How to run 42 | 43 | ```python 44 | 45 | from yubiai.vision.document_image_detection.image_classification import DocDetection, NSFWDetection 46 | 47 | docimgpath = "/path/of/an/test/docimage.jpg" 48 | nondocimgpath = "/path/of/an/test/non-docimage.jpg" 49 | 50 | docmodel = DocDetection(model_type="doc-vs-nondoc_Xception_block_12-14", use_gpu=False) 51 | nsfwmodel = NSFWDetection(model_type="nsfw_detection_Xception_block_12-14", use_gpu=False) 52 | 53 | docmodel.classify(docimgpath) 54 | ### Output : {'document': 100.0, 'nondocument': 0.0} 55 | nsfwmodel.classify(docimgpath) 56 | ### Output : {'anime': 0.0, 'docimage': 99.98, 'explicit': 0.0, 'gore_violence': 0.0, 'non-explicit': 0.01, 'sexy': 0.0} 57 | 58 | docmodel.classify(nondocimgpath) 59 | ### Output : {'document': 0.0, 'nondocument': 100.0} 60 | nsfwmodel.classify(nondocimgpath) 61 | ### Output : {'anime': 0.0, 'docimage': 0.0, 'explicit': 0.06, 'gore_violence': 0.0, 'non-explicit': 62.18, 'sexy': 37.75} 62 | 63 | ``` -------------------------------------------------------------------------------- /yubiai/vision/document_image_detection/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yubi2Community/YubiAI/2f3ea321b8cd525ee9f0c6fff2fa3495b4e216c8/yubiai/vision/document_image_detection/__init__.py -------------------------------------------------------------------------------- /yubiai/vision/document_image_detection/image_classification.py: -------------------------------------------------------------------------------- 1 | ### 2 | ### Author : Swapnil Ashok Jadhav 3 | ### Created Date : 23 April 2023 4 | ### Project Information 5 | ### Two variations of models are prepared 6 | ### Type 1 : Document vs NonDocument classifier 7 | ### Type 2 : NSFW classifiers with 6 class support 8 | ### 'anime', 'docimage', 'explicit', 'gore_violence', 'non-explicit', 'sexy' 9 | ### 10 | 11 | from yubiai import set_model_info, verify_model_path 12 | import os, random 13 | from tensorflow.keras.models import load_model 14 | from yubiai.vision.utility.preprocess import image_preprocessing 15 | import numpy as np 16 | import tensorflow as tf 17 | import PIL.Image 18 | from scipy import ndimage 19 | from PIL import Image 20 | from PIL import ImageOps 21 | 22 | class DocDetection: 23 | def __init__(self, model_type="doc-vs-nondoc_Xception_block_12-14", use_gpu=False): 24 | ### Mention all default model variables 25 | self.use_gpu = use_gpu 26 | self.model_folder_path, self.model_folder_name, self.model_zip_path, self.model_zip_name = set_model_info(model_type) 27 | verify_model_path(self.model_folder_path, self.model_folder_name, self.model_zip_path, self.model_zip_name) 28 | 29 | self.model_folder_name = model_type 30 | if model_type == "doc-vs-nondoc_Xception_block_12-14": 31 | self.imgsize = 299 32 | elif model_type == "doc-vs-nondoc_vit-b16_layer15": 33 | self.imgsize = 384 34 | else: 35 | self.imgsize = 299 36 | 37 | self.model = self.load_model(self.model_folder_path) 38 | self.classes = ['document', 'nondocument'] 39 | 40 | def load_model(self, model_path): 41 | """ 42 | Load model from given path 43 | """ 44 | model = load_model(model_path) 45 | return model 46 | 47 | def classify(self, image_path): 48 | """ 49 | Classify images in documents vs 
non-documents 50 | """ 51 | image = np.array(Image.open(image_path)) 52 | val_dataset = tf.data.Dataset.from_tensor_slices([image]).batch(1) 53 | reshape_layer = tf.keras.layers.experimental.preprocessing.Resizing(self.imgsize, self.imgsize) 54 | norm_layer = tf.keras.layers.experimental.preprocessing.Rescaling(1/255.) 55 | norm_val_dataset = val_dataset.map(lambda x: (norm_layer(reshape_layer(x)))) 56 | output = [float("%4.2f"% v) for v in self.model.predict(norm_val_dataset)[0]*100.0] 57 | result = dict(zip(self.classes, output)) 58 | return result 59 | 60 | 61 | class NSFWDetection: 62 | def __init__(self, model_type="nsfw_detection_Xception_block_12-14", use_gpu=False): 63 | ### Mention all default model variables 64 | self.use_gpu = use_gpu 65 | self.model_folder_path, self.model_folder_name, self.model_zip_path, self.model_zip_name = set_model_info(model_type) 66 | verify_model_path(self.model_folder_path, self.model_folder_name, self.model_zip_path, self.model_zip_name) 67 | 68 | self.model_folder_name = model_type 69 | self.imgsize = 299 70 | if model_type == "nsfw_detection_ResNet101V2": 71 | self.imgsize = 224 72 | elif model_type == "nsfw_detection_Xception_block_12-14": 73 | self.imgsize == 299 74 | elif model_type == "nsfw_detection_Xception_block_13-14": 75 | self.imgsize = 299 76 | elif model_type == "nsfw_detection_vit-b16_layer16": 77 | self.imgsize = 384 78 | self.model = self.load_model(self.model_folder_path) 79 | self.classes = ['anime', 'docimage', 'explicit', 'gore_violence', 'non-explicit', 'sexy'] 80 | 81 | def load_model(self, model_path): 82 | """ 83 | Load model from given path 84 | """ 85 | model = load_model(model_path) 86 | return model 87 | 88 | def classify(self, image_path): 89 | """ 90 | Classify images in documents vs non-documents 91 | """ 92 | image = np.array(Image.open(image_path)) 93 | val_dataset = tf.data.Dataset.from_tensor_slices([image]).batch(1) 94 | reshape_layer = tf.keras.layers.experimental.preprocessing.Resizing(self.imgsize, self.imgsize) 95 | norm_layer = tf.keras.layers.experimental.preprocessing.Rescaling(1/255.) 96 | norm_val_dataset = val_dataset.map(lambda x: (norm_layer(reshape_layer(x)))) 97 | output = [float("%4.2f"% v) for v in self.model.predict(norm_val_dataset)[0]*100.0] 98 | result = dict(zip(self.classes, output)) 99 | return result 100 | -------------------------------------------------------------------------------- /yubiai/vision/document_segmentation/README.md: -------------------------------------------------------------------------------- 1 | # Yubi's Fintech Document Segmentation Models 2 | We want to segment or find segments in given document image so that we can understand and process document better. 3 | 4 | ## About 5 | * We used tagged ~700 images in v1 and ~6k images in v1_aug version with augmentations 6 | * In v2 we used >12k images with augmentation. 7 | * Random Augmentations used are - `bright`, `daylight`, `detailenhance1`, `detailenhance2`, `sepia`, `summer`, `winter`, `invert` 8 | * Used Faster-RCNNversion using simple library `detecto` 9 | * Classes used are - `'page layout', 'content box', 'table', 'handwritten', 'company logo', 'watermark', 'signature', 'stamp', 'stamp-text', 'stamp-circular', 'stamp-date', 'stamp-sign', 'profile image', 'bar code', 'qr code', 'clickable', 'Thumb print', 'clipart', 'visually different objects', 'virtual table', 'charts', 'advertisement'` 10 | * `prob_threshold` of `0.3` seems to be better from initial observations. 
11 | * Three versions of models are available for now 12 | * `yubi_document_segmentation_v1` : Trained with ~700 images, 50 epochs, 0.001 learning rate 13 | * `yubi_document_segmentation_with_aug_v1` : Trained with ~6000 images, 50 epochs, 0.001 learning rate 14 | * `yubi_document_segmentation_v2` : Trained with ~1600 images (12k augmentations), 25 epochs, 0.002 learning rate 15 | * Runs on both GPU and CPU. On CPU, inference takes ~0.5-0.7 seconds depending on image size. 16 | * No image preprocessing or resizing is needed. It works with all kinds of image colors, sizes and noise. 17 | * Data labels are skewed. The TextContent label occurs >8k times while some labels occur <10 times; mAPE did not improve for those. More tags are needed for such classes to improve further. 18 | * Results for the `yubi_document_segmentation_v2` model with a `0.3` threshold on a holdout set of ~700 images.
19 | * Result plots: `mAP.png`, `lamr.png`, `ground-truth-info.png`, `detection-results-info.png` (in this folder).
23 | 24 | ## How to run 25 | 26 | ```python 27 | 28 | from yubiai.vision.document_segmentation.segment_doc import YubiDocumentSegmentDetection 29 | model = YubiDocumentSegmentDetection(segment_model="yubi_document_segmentation_v1") 30 | 31 | imgpath = 'input_image_path' 32 | model.detect_segments(imgpath, prob_threshold=0.3, export_image_with_tags=True, export_image_path="/some/output/folder/tmp.jpg") 33 | 34 | #### Output Sample #### 35 | # {'seg1': {'label_id': '0', 36 | # 'label_name': 'page layout', 37 | # 'box': [22.94194221496582, 38 | # 29.284299850463867, 39 | # 1690.063232421875, 40 | # 2325.850830078125], 41 | # 'probability': 0.9987373948097229}, 42 | # 'seg2': {'label_id': '9', 43 | # 'label_name': 'stamp-circular', 44 | # 'box': [492.64739990234375, 45 | # 830.1820068359375, 46 | # 908.0197143554688, 47 | # 1249.171142578125], 48 | # 'probability': 0.9969298243522644}, 49 | # ..... 50 | # ..... 51 | # } 52 | 53 | ``` -------------------------------------------------------------------------------- /yubiai/vision/document_segmentation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yubi2Community/YubiAI/2f3ea321b8cd525ee9f0c6fff2fa3495b4e216c8/yubiai/vision/document_segmentation/__init__.py -------------------------------------------------------------------------------- /yubiai/vision/document_segmentation/detection-results-info.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yubi2Community/YubiAI/2f3ea321b8cd525ee9f0c6fff2fa3495b4e216c8/yubiai/vision/document_segmentation/detection-results-info.png -------------------------------------------------------------------------------- /yubiai/vision/document_segmentation/ground-truth-info.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yubi2Community/YubiAI/2f3ea321b8cd525ee9f0c6fff2fa3495b4e216c8/yubiai/vision/document_segmentation/ground-truth-info.png -------------------------------------------------------------------------------- /yubiai/vision/document_segmentation/lamr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yubi2Community/YubiAI/2f3ea321b8cd525ee9f0c6fff2fa3495b4e216c8/yubiai/vision/document_segmentation/lamr.png -------------------------------------------------------------------------------- /yubiai/vision/document_segmentation/mAP.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yubi2Community/YubiAI/2f3ea321b8cd525ee9f0c6fff2fa3495b4e216c8/yubiai/vision/document_segmentation/mAP.png -------------------------------------------------------------------------------- /yubiai/vision/document_segmentation/segment_doc.py: -------------------------------------------------------------------------------- 1 | ### 2 | ### Author : Swapnil Ashok Jadhav 3 | ### Created Date : 28 March 2023 4 | ### Project Information 5 | ### Find segments with probable tags associated. 6 | ### Used faster-rcnn version of implementation for quick exploration using detecto. 7 | ### Currently trained with ~700 original images and ~5k augmented images. 
8 | ### 9 | 10 | 11 | from yubiai import set_model_info, verify_model_path 12 | from detecto import core, utils 13 | import os, json, cv2 14 | 15 | 16 | class YubiDocumentSegmentDetection: 17 | def __init__(self, segment_model="yubi_document_segmentation_v1", use_gpu=False): 18 | self.use_gpu = use_gpu 19 | self.model_folder_path, self.model_folder_name, self.model_zip_path, self.model_zip_name = set_model_info(segment_model) 20 | verify_model_path(self.model_folder_path, self.model_folder_name, self.model_zip_path, self.model_zip_name) 21 | self.model, self.label_id_map = self.load_model(self.model_folder_path) 22 | 23 | def load_model(self, model_path): 24 | model_checkpoint_path = "%s/model.pth" % model_path 25 | label_id_map_path = "%s/label_id_map.json" % model_path 26 | label_id_map = json.load(open(label_id_map_path,'r')) 27 | model = core.Model.load(model_checkpoint_path, list(label_id_map.keys())) 28 | return model, label_id_map 29 | 30 | def detect_segments(self, imgpath, prob_threshold=0.0, export_image_with_tags=False, export_image_path=""): 31 | preds = {} 32 | image = utils.read_image(imgpath) 33 | predictions = self.model.predict(image) 34 | labels, boxes, scores = predictions 35 | boxes = boxes.tolist() 36 | scores = scores.tolist() 37 | for i in range(len(labels)): 38 | if scores[i] >= prob_threshold: 39 | preds['seg%d'%(i+1)] = {"label_id":labels[i], "label_name":self.label_id_map[labels[i]], 40 | "box": boxes[i], "probability": scores[i]} 41 | if export_image_with_tags is True: 42 | image = cv2.imread(imgpath) 43 | for k,v in preds.items(): 44 | label = v['label_name'] 45 | xmin,ymin,xmax,ymax = v['box'] 46 | prob = v['probability'] 47 | if prob > prob_threshold: 48 | color = (31,31,255) 49 | image = cv2.rectangle(image, (int(xmin), int(ymin)), (int(xmax), int(ymax)), color, 2) 50 | boxtext = "%s : %s (%.2f)" % (k, label, prob) 51 | cv2.putText(image, boxtext, (int(xmin), int(ymin)), cv2.FONT_HERSHEY_COMPLEX, 0.75, color, 2) 52 | cv2.imwrite(export_image_path, cv2.cvtColor(image, cv2.COLOR_RGB2BGR)) 53 | return preds 54 | 55 | -------------------------------------------------------------------------------- /yubiai/vision/skew_detection/README.md: -------------------------------------------------------------------------------- 1 | # Document Skew Detection 2 | 3 | ## Quadrant Detection 4 | * We trained model to detect if skewed angle lies in `Q1` or `Q2` or `Q3` or `Q4` 5 | * Millions of skewed images used to train the model 6 | * `ResNet101v2` was used with some extra layers at the end. Fine-tuned with all layers open. 7 | * Took 3+ days on `g5dn.12xlarge` instance for each run. 
8 | * Two models were trained:
9 |     * `Quad4Detection_ResNet101V2_0-90`
10 |         * Q1 -> 0-90 degree angles
11 |         * Q2 -> 90-180 degree angles
12 |         * Q3 -> 180-270 degree angles
13 |         * Q4 -> 270-360 degree angles
14 |     * `Quad4Detection_ResNet101V2_45-135`
15 |         * Q1 -> 45-135 degree angles
16 |         * Q2 -> 135-225 degree angles
17 |         * Q3 -> 225-315 degree angles
18 |         * Q4 -> 315-360 & 0-45 degree angles
19 | * Accuracy on the validation set was ~96-97%
20 | * Accuracy improves when we take >5 random patches of the original image and vote across them
21 | * `classification report` on the 23k-image test set
22 | 
23 | | Quadrant | precision | recall | f1-score | support |
24 | | -------- | --------- | ------ | -------- | ------- |
25 | | Q1 | 0.99 | 0.97 | 0.98 | 5524 |
26 | | Q2 | 0.98 | 0.98 | 0.98 | 5408 |
27 | | Q3 | 0.98 | 0.98 | 0.98 | 5459 |
28 | | Q4 | 0.97 | 0.99 | 0.98 | 5465 |
29 | | accuracy | | | 0.98 | 21856 |
30 | | macro avg | 0.98 | 0.98 | 0.98 | 21856 |
31 | | weighted avg | 0.98 | 0.98 | 0.98 | 21856 |
32 | 
33 | ## Angle Detection
34 | * We trained a model to detect the skew angle within `Q1`, i.e. it returns a value between 0 and 90 degrees
35 | * The quadrant should be corrected before this step.
36 | * Millions of skewed images were used to train the model
37 | * `ResNet101v2` was used with some extra layers at the end. Fine-tuned with all layers open.
38 | * Took 3+ days on a `g5dn.12xlarge` instance for each run.
39 | * MAE was ~1 degree when we stopped training.
40 | * Two models were trained:
41 |     * `SkewDetection_ResNet101V2_0-90` : detects angles between 0 and 90 degrees
42 |     * `SkewDetection_ResNet101V2_45-135` : detects angles between -45 and 45 degrees
43 | * For better and easier inference, the `0-90` quadrant and skew models should be run together.
44 | * Similarly, the `45-135` models should be run together.
45 | * Our model's performance
46 |     * On the 23k test set our model reaches `MAE - 5.35` and `RMSE - 7.47`
47 |     * Skew from 0 to 360 degrees is first reduced to 0-90 degrees with the `Quadrant Detection` model
48 | * Comparison with the [jdeskew model](https://github.com/phamquiluan/jdeskew)
49 |     * `jdeskew` works for skew from -45 to 45 degrees, so the test set was reduced to ~7.4k images under this restriction
50 |     * Our model works better far away from 0 degrees, while `jdeskew` works well near 0 degrees and worse far from it. This is strange behaviour that we hope to explore further.
51 |     * That said, our model covers 0-90 degrees and, combined with quadrant detection, effectively handles 0-360 degrees; a short sketch of this composition is given below, followed by the comparison numbers.
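The quadrant label and the in-quadrant angle compose into a single 0-360 degree correction. A minimal sketch of that arithmetic (plain Python, not part of the library API, assuming the `0-90` pair of models is used):

```python
# Counter-clockwise base rotation applied for each quadrant when bringing an image back to Q1.
QUADRANT_BASE = {"Q1": 0, "Q2": 90, "Q3": 180, "Q4": 270}

def total_correction_angle(detected_quadrant, in_quadrant_angle):
    """Single counter-clockwise rotation equivalent to quadrant correction plus angle correction."""
    return QUADRANT_BASE[detected_quadrant] + in_quadrant_angle

# e.g. quadrant "Q3" with an in-quadrant estimate of 12.5 degrees -> rotate by 192.5 degrees
print(total_correction_angle("Q3", 12.5))
```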
52 | 53 | | model | RMSE | MAE | 54 | | ----- | ---- | --- | 55 | | jdeskew | 23.91 | 16.72 | 56 | | yubi-skew-detection | 22.16 | 13.31 | 57 | 58 | ## How to run 59 | 60 | ```python 61 | 62 | from yubiai.vision.skew_detection.document_skew_detection import YubiDocSkewDetector 63 | 64 | skew_detector = YubiDocSkewDetector(qudrant_model="Quad4Detection_ResNet101V2_0-90", 65 | skew_model="SkewDetection_ResNet101V2_0-90", 66 | use_gpu=False) 67 | imagepath = "/home/ubuntu/some_image.jpeg" 68 | 69 | ### To detect the Quadrant 70 | skew_detector.predict_qudrant(imagepath) 71 | # 1/1 [==============================] - 1s 1s/step 72 | # {'detected_quadrant': 'Q4', 'quadrant_prob': 100.0} 73 | 74 | ### To correct the image quadrant to Q1 75 | skew_detector.rotate_to_first_qudrant(imagepath, 'Q4', saveimage=True) 76 | # array([[[192, 27, 34], 77 | # [192, 27, 34], 78 | # [187, 29, 34], 79 | # ..., 80 | # [138, 84, 52], 81 | # [150, 96, 64], 82 | # [158, 104, 72]]], dtype=uint8) 83 | 84 | ### Detect the angle/skewness of given image 85 | skew_detector.predict_angle(imagepath) 86 | # 1/1 [==============================] - 1s 1s/step 87 | # {'average_angle': 84.52632, 'angle_preds': [84.57919, 84.667244, 84.79881, 84.62079, 84.37254, 84.31932, 84.61099, 83.88075, 84.6366, 84.77704]} 88 | 89 | ### Correct the angle of given image 90 | skew_detector.rotate_to_correct_angle(imagepath, 84.52632, saveimage=True) 91 | # array([[[255, 255, 255], 92 | # [255, 255, 255], 93 | # [255, 255, 255], 94 | # ..., 95 | # [255, 255, 255], 96 | # [255, 255, 255], 97 | # [255, 255, 255]]], dtype=uint8) 98 | 99 | 100 | ### Sample Code Given as end-2-end solution 101 | skew_detector.correct_image_skew_sample_code(imagepath) 102 | 103 | ``` -------------------------------------------------------------------------------- /yubiai/vision/skew_detection/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yubi2Community/YubiAI/2f3ea321b8cd525ee9f0c6fff2fa3495b4e216c8/yubiai/vision/skew_detection/__init__.py -------------------------------------------------------------------------------- /yubiai/vision/skew_detection/document_skew_detection.py: -------------------------------------------------------------------------------- 1 | ### 2 | ### Author : Swapnil Ashok Jadhav 3 | ### Created Date : 1 Feb 2023 4 | ### Project Information 5 | ### Two variations of models are prepared 6 | ### Type 1 : Qudrant Detection (0-90-180-270-0 and 45-135-225-315-45) 7 | ### Type 2 : Skew Angle Detection (0 to 90 float numbers) 8 | ### 9 | 10 | from yubiai import set_model_info, verify_model_path 11 | import os, random 12 | import PIL.Image 13 | from tensorflow.keras.models import load_model 14 | from yubiai.vision.utility.preprocess import image_preprocessing 15 | import numpy as np 16 | import tensorflow as tf 17 | from scipy import ndimage 18 | from PIL import Image 19 | from PIL import ImageOps 20 | 21 | 22 | class YubiDocSkewDetector: 23 | def __init__(self, qudrant_model="Quad4Detection_ResNet101V2_0-90", skew_model="SkewDetection_ResNet101V2_0-90", use_gpu=False): 24 | ### Mention all default model variables 25 | self.use_gpu = use_gpu 26 | self.preprocessing_obj = image_preprocessing() 27 | 28 | ### Quadrant Detection Model 29 | self.qudrant_model_folder_path, self.qudrant_model_folder_name, self.qudrant_model_zip_path, self.qudrant_model_zip_name = set_model_info(qudrant_model) 30 | verify_model_path(self.qudrant_model_folder_path, self.qudrant_model_folder_name, 
self.qudrant_model_zip_path, self.qudrant_model_zip_name) 31 | self.qudrant_model = self.load_yubi_model(self.qudrant_model_folder_path) 32 | 33 | ### Skew/Rotation Detection Model 34 | self.skew_model_folder_path, self.skew_model_folder_name, self.skew_model_zip_path, self.skew_model_zip_name = set_model_info(skew_model) 35 | verify_model_path(self.skew_model_folder_path, self.skew_model_folder_name, self.skew_model_zip_path, self.skew_model_zip_name) 36 | self.skew_model = self.load_yubi_model(self.skew_model_folder_path) 37 | 38 | def load_yubi_model(self, model_path): 39 | """ 40 | Load model from given path 41 | """ 42 | model = load_model(model_path) 43 | return model 44 | 45 | def correctH(self, img, min_size): 46 | """ 47 | Keeping the aspect ratio same change height 48 | """ 49 | x, y = img.size 50 | newx = max(x, min_size) 51 | wpercent = newx/float(x) 52 | newy = int(float(y)*float(wpercent)) 53 | img = img.resize((newx,newy)) 54 | return img 55 | 56 | def correctW(self, img, min_size): 57 | """ 58 | Keeping the aspect ratio same change width 59 | """ 60 | x, y = img.size 61 | newy = max(y, min_size) 62 | hpercent = newy/float(y) 63 | newx = int(float(x)*float(hpercent)) 64 | img = img.resize((newx,newy)) 65 | return img 66 | 67 | def correct_image_size(self, imagepath, resize=True): 68 | """ 69 | Makes every image slightly bigger than 512x512 that is 520x520. 70 | So that crops can be taken out of it. 71 | """ 72 | min_size = 515 73 | new_image = PIL.Image.open(imagepath) 74 | new_image = new_image.convert("RGB") 75 | if resize is True: 76 | new_image = self.correctH(new_image, min_size) 77 | new_image = self.correctW(new_image, min_size) 78 | new_image.save(imagepath) 79 | else: 80 | fill_color = (255, 255, 255, 0) 81 | x, y = new_image.size 82 | size = max(min_size, x, y) 83 | new_image = PIL.Image.new('RGB', (size, size), fill_color) 84 | new_image.paste(new_image, (int((size - x) / 2), int((size - y) / 2))) 85 | new_image.save(imagepath) 86 | 87 | def remove_extra_white_paddings(self, imagepath): 88 | """ 89 | When we rotate the image we see that extra white space keeps getting added. 90 | Following code will remove that extra white padding. 
91 | """ 92 | image = Image.open(imagepath) 93 | image.load() 94 | invert_im = image.convert("RGB") 95 | invert_im = ImageOps.invert(invert_im) 96 | imageBox = invert_im.getbbox() 97 | cropped = image.crop(imageBox) 98 | cropped.save(imagepath) 99 | 100 | def generate_dataset(self, imagepath, num_crops=10, batch_size=32): 101 | """ 102 | From given image generate random crops and tensor dataset with batch-size 103 | """ 104 | self.remove_extra_white_paddings(imagepath) 105 | self.correct_image_size(imagepath) 106 | rseed = random.randint(0,9999) 107 | 108 | ### Generated "num_crops" images with random crop strategy 109 | output_image_gen = self.preprocessing_obj.image_generator(image_path=imagepath, multiple_rotated_images = False, 110 | multiple_random_crops = True, cropped_image_height=512, 111 | cropped_image_width=512, random_crop_white_percent = 70, 112 | random_crop_white_increment = True, n_random = num_crops, 113 | n_rotated = 0, rotate_random = False, seed = rseed) 114 | crop_img_arr = [] 115 | for i in range(num_crops): 116 | img, _ = next(output_image_gen) 117 | crop_img_arr.append(img) 118 | crop_img_arr = np.array(crop_img_arr) 119 | 120 | ### Create dataset with all above images to run in batches 121 | val_dataset = tf.data.Dataset.from_tensor_slices(crop_img_arr).batch(batch_size) 122 | reshape_layer = tf.keras.layers.experimental.preprocessing.Resizing(224, 224) 123 | norm_layer = tf.keras.layers.experimental.preprocessing.Rescaling(1/255.) 124 | norm_val_dataset = val_dataset.map(lambda x: (norm_layer(reshape_layer(x)))) 125 | return norm_val_dataset 126 | 127 | def predict_qudrant(self, imagepath, num_crops=10, batch_size=32): 128 | """ 129 | Given an image predicts the Quadrant it belongs to. 130 | """ 131 | norm_val_dataset = self.generate_dataset(imagepath, num_crops=num_crops, batch_size=batch_size) 132 | quad_predictions = self.qudrant_model.predict(norm_val_dataset) 133 | 134 | ### Get map of quadrants vs counts 135 | outmap = {"Q1":0, "Q2":0, "Q3":0, "Q4":0} 136 | total = 0.0 137 | for outarr in quad_predictions: 138 | classindex = np.argmax(outarr) 139 | classprob = outarr[classindex] 140 | outmap["Q%d" % (classindex+1)] += classprob 141 | total += classprob 142 | outmap = [(k,v) for k, v in sorted(outmap.items(), key=lambda item: item[1], reverse=True)] 143 | detected_quadrant = outmap[0][0] 144 | quadrant_prob = outmap[0][1]*100.0/total 145 | 146 | results = {"detected_quadrant":detected_quadrant, "quadrant_prob":quadrant_prob} 147 | return results 148 | 149 | def rotate_to_first_qudrant(self, imagepath, detected_quadrant="Q1", saveimage=True): 150 | """ 151 | Correct the quadrant of given image to Q1 152 | """ 153 | imgarr = np.asarray(PIL.Image.open(imagepath)) 154 | if detected_quadrant == "Q4": 155 | imgarr = ndimage.rotate(imgarr, 270, reshape=True, cval=255) 156 | elif detected_quadrant == "Q3": 157 | imgarr = ndimage.rotate(imgarr, 180, reshape=True, cval=255) 158 | elif detected_quadrant == "Q2": 159 | imgarr = ndimage.rotate(imgarr, 90, reshape=True, cval=255) 160 | if saveimage is True: 161 | imgobj = PIL.Image.fromarray(imgarr) 162 | imgobj.save(imagepath) 163 | return imgarr 164 | 165 | def predict_angle(self, imagepath, num_crops=10, batch_size=32): 166 | """ 167 | Given an image predicts the Angle it is rotated. 168 | Either in -45 to 45 OR 0 to 90 depending upon the model chosen. 
169 |         The quadrant has to be corrected before calling this method.
170 |         """
171 |         norm_val_dataset = self.generate_dataset(imagepath, num_crops=num_crops, batch_size=batch_size)
172 |         angle_preds = [x[0] for x in self.skew_model.predict(norm_val_dataset)]
173 |         average_angle = np.mean(angle_preds)
174 |         median_angle = np.median(angle_preds)
175 |         results = {"average_angle": average_angle, "angle_preds": angle_preds, "median_angle":median_angle}
176 |         return results
177 | 
178 |     def rotate_to_correct_angle(self, imagepath, angle, saveimage=True):
179 |         """
180 |         Rotate the given image by the detected skew angle and save it if required
181 |         """
182 |         imgarr = np.asarray(PIL.Image.open(imagepath))
183 |         if self.skew_model_folder_name == "SkewDetection_ResNet101V2_45-135":
184 |             angle = angle - 45
185 |         imgarr = ndimage.rotate(imgarr, angle, reshape=True, cval=255)
186 |         if saveimage is True:
187 |             imgobj = PIL.Image.fromarray(imgarr)
188 |             imgobj.save(imagepath)
189 |         return imgarr
190 | 
191 |     def correct_image_skew_sample_code(self, imagepath, num_crops=10, batch_size=32):
192 |         """
193 |         Sample method that works end-to-end:
194 |         detects quadrant -> corrects quadrant -> predicts angle -> corrects angle.
195 |         Users should analyse and adapt the code to their needs.
196 |         """
197 |         quad_results = self.predict_qudrant(imagepath, num_crops, batch_size)
198 |         print("quad_results : ", quad_results)
199 |         detected_quadrant = quad_results['detected_quadrant']
200 |         quad_correct_img = self.rotate_to_first_qudrant(imagepath, detected_quadrant, True)
201 |         skew_results = self.predict_angle(imagepath, num_crops, batch_size)
202 |         print("skew_results : ", skew_results)
203 |         skew_correct_img = self.rotate_to_correct_angle(imagepath, skew_results['median_angle'], True)
204 |         self.remove_extra_white_paddings(imagepath)
205 |         return detected_quadrant, skew_results['median_angle']
206 | 
--------------------------------------------------------------------------------
/yubiai/vision/utility/README.md:
--------------------------------------------------------------------------------
1 | # Vision Utility Functions
2 | 
3 | ## Preprocessing
4 | * This module collects functions that are used repeatedly in image preprocessing, with many custom controls,
5 | for example -
6 |     * image rotation
7 |     * image flip
8 |     * image resize
9 |     * image random crop
10 | * This will be a single wrapper for all the preprocessing functions
11 | 
12 | ## Example code
13 | 
14 | The output is a NumPy array, which can be loaded to view the image.
15 | 16 | ```python 17 | 18 | from yubiai.vision.utility.preprocess import image_preprocessing 19 | 20 | preprocessing_object = image_preprocessing() 21 | image_path = "/home/ubuntu/some_image.jpeg" 22 | 23 | output_image = preprocessing_object.preprocess( image_path=image_path, 24 | random_crop=False, 25 | resize_image=False, 26 | flip_image=False, 27 | rotate_image = False, 28 | print_white_pix_precent = False, 29 | flip_horizontal=False, 30 | flip_vertical=False, 31 | resize_image_height=256, 32 | resize_image_width=256, 33 | rotate_skew = 0, 34 | rotate_random = False, 35 | rotate_reshape = True, 36 | cropped_image_height=256, 37 | cropped_image_width=256, 38 | random_crop_white_percent = 98, 39 | random_crop_white_increment = False) 40 | 41 | ``` 42 | 43 | ## Image Generator 44 | 45 | Added functions which are used very repetetively in Generating ramdom images with lot of custom controls will be there in this module 46 | for example - 47 | * multiple image rotate 48 | * multiple random crops 49 | 50 | * This will be a single wrapper for all the image generator functions 51 | 52 | ## Example code 53 | 54 | Output will be a generator .. which can be iterated to load the image and information on the image. 55 | 56 | 57 | ```python 58 | 59 | from yubiai.vision.utility.preprocess import image_preprocessing 60 | 61 | preprocessing_object = image_preprocessing() 62 | 63 | image_path = "/home/ubuntu/some_image.jpeg" 64 | 65 | output_image_gen = preprocessing_object.image_generator(image_path=image_path, 66 | cropped_image_height=256, 67 | cropped_image_width=256, 68 | random_crop_white_percent = 70, 69 | random_crop_white_increment = True, 70 | rotate_random = False, 71 | multiple_rotated_images = False, 72 | multiple_random_crops = True, 73 | rotate_reshape = True, 74 | n_random = 30, 75 | n_rotated = 10, 76 | rotate_skew = 30, 77 | seed = 5) 78 | for i in range(10): 79 | img, img_info = next(output_image_gen) 80 | print(img_info) 81 | plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB)) 82 | plt.show() 83 | 84 | ``` 85 | -------------------------------------------------------------------------------- /yubiai/vision/utility/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yubi2Community/YubiAI/2f3ea321b8cd525ee9f0c6fff2fa3495b4e216c8/yubiai/vision/utility/__init__.py -------------------------------------------------------------------------------- /yubiai/vision/utility/preprocess.py: -------------------------------------------------------------------------------- 1 | ### 2 | ### Author : Sanprit Nayan (github:sanprit) 3 | ### Created Date : 27 Sept 2022 4 | ### Modification Date : 2 Feb 2022 (github:navin-kumar-k) 5 | ### 6 | 7 | import numpy as np 8 | import cv2 9 | from PIL import Image 10 | from scipy import ndimage 11 | 12 | class image_preprocessing(): 13 | 14 | def get_multiple_rotated_images(self): 15 | self.rotate_random = True 16 | np.random.seed(self.seed) 17 | if not self.n_rotated: 18 | print("Enter No. of rotated_images to genrate") 19 | for i in range(self.n_rotated): 20 | yield self.get_rotated_image(), self.img_info 21 | 22 | def get_multiple_random_crop(self, img): 23 | np.random.seed(self.seed) 24 | if not self.n_random: 25 | print("Enter No. 
of random_images to genrate") 26 | for i in range(self.n_random): 27 | yield self.get_random_crop(img), self.img_info 28 | 29 | def get_multiple_rotated_random_crop(self): 30 | np.random.seed(self.seed) 31 | self.rotate_random = True 32 | for i in range(self.n_rotated): 33 | self.new_img = self.get_rotated_image() 34 | for i in range(self.n_random): 35 | yield self.get_random_crop(self.new_img), self.img_info 36 | 37 | def get_resized_image(self): 38 | ''' 39 | @input image : array of image eg: image= numpy.array(Image.open()) 40 | @input cimage_height : height of desired crop image 41 | @input image_width : width of desired crop image= 42 | @process : selects only mentioned portion of image 43 | @output : returns array of final image 44 | ''' 45 | if(self.resize_image_width <=0) or (self.resize_image_width > self.image.shape[0]): 46 | print('select desired resize width in range(0, {image_width})'.format(image_width=str(self.image.shape[1]))) 47 | return 48 | if(self.resize_image_height <=0) or (self.resize_image_height > self.image.shape[1]): 49 | print('select desired resize height in range(0, {image_height})'.format(image_height=str(self.image.shape[0]))) 50 | return 51 | self.image = cv2.resize(self.image, (self.resize_image_width, self.resize_image_height),interpolation = cv2.INTER_AREA) 52 | 53 | return self.image 54 | 55 | def get_flipped_image(self): 56 | ''' 57 | @input image : array of image eg: image= numpy.array(Image.open()) 58 | @input horizontal : whether image to flipped horizontally or not 59 | @input vertical : whether image to flipped vertically or not 60 | @output : returns array of flipped image 61 | ''' 62 | if self.flip_vertical and self.flip_horizontal: 63 | return cv2.flip(self.image,-1) 64 | elif self.flip_horizontal: 65 | return cv2.flip(self.image,1) 66 | elif self.flip_vertical: 67 | return cv2.flip(self.image,0) 68 | else: 69 | return self.image 70 | 71 | def get_rotated_image(self): 72 | ''' 73 | @input image : array of image eg: image= numpy.array(Image.open()) 74 | @rotate skew : degree of clockwise rotation 75 | @rotate random : whether to take random rotate skew or not 76 | @process: rotate the image in clockwise direction with respect to orignal skew of image and genrate a genral frame to fit 77 | all types of angle 78 | @output: return array of rotated image 79 | ''' 80 | skew = self.rotate_skew 81 | if self.rotate_random: 82 | skew = np.random.randint(0, 360) 83 | rotate_img = ndimage.rotate(self.image, -skew, reshape=self.rotate_reshape, cval=255) 84 | self.img_info['skew'] = skew 85 | 86 | return rotate_img 87 | 88 | def check_white_pixels(self,im): 89 | ''' 90 | @input image : array of image eg: image= numpy.array(Image.open()) 91 | @process : calculate the white pixels percentage in image 92 | @output : return float value of white pixel percentage 93 | ''' 94 | ret, im = cv2.threshold(im,120,255,cv2.THRESH_BINARY) 95 | n_pixels = np.sum(im >= 0) 96 | n_white_pix = np.sum(im == 255) 97 | percentage = (n_white_pix/n_pixels) * 100 98 | return percentage 99 | 100 | def get_random_crop(self,img): 101 | ''' 102 | @input image : array of image eg: image= numpy.array(Image.open()) 103 | @input cimage_height : height of desired crop image 104 | @input image_width : width of desired crop image 105 | @process : selects only mentioned portion of image 106 | @output : returns array of final image 107 | ''' 108 | max_x = img.shape[1] - self.cropped_image_width 109 | max_y = img.shape[0] - self.cropped_image_height 110 | if max_x <= 0 or max_y <= 0: 111 | 
print("Original Image is smaller than desired crop size") 112 | return 0 113 | threshold = self.random_crop_white_percent 114 | count = 0 115 | percentage = 100 116 | while percentage > threshold and threshold < 99: 117 | x = np.random.randint(0, max_x) 118 | y = np.random.randint(0, max_y) 119 | crop = img[y: y + self.cropped_image_height, x: x + self.cropped_image_width] 120 | percentage = self.check_white_pixels(crop) 121 | count +=1 122 | self.img_info['random_crop_white_pix'] = percentage 123 | self.img_info['random_crop_coordinates'] = (x,y) 124 | 125 | if count == 15 and self.random_crop_white_increment: 126 | threshold += 3 127 | count = 0 128 | elif count == 15: 129 | print("Generated random_crop of more than specified percentage, Max Tries reached") 130 | return crop 131 | 132 | if threshold >= 99: 133 | print("Generated random_crop of More than 99% white pixels") 134 | return crop 135 | else: 136 | return crop 137 | 138 | def preprocess(self, 139 | image_path, 140 | seed = 0, 141 | random_crop = False, 142 | resize_image = False, 143 | flip_image = False, 144 | rotate_image = False, 145 | print_white_pix_precent = False, 146 | cropped_image_height=256, 147 | cropped_image_width=256, 148 | flip_horizontal=False, 149 | flip_vertical= False, 150 | resize_image_height=256, 151 | resize_image_width=256, 152 | rotate_skew = 0, 153 | rotate_random = False, 154 | random_crop_white_percent = 98, 155 | random_crop_white_increment = False, 156 | rotate_reshape = True 157 | ): 158 | ''' 159 | image_path: Path of image (Type=str) 160 | random_crop: Create a random crop of desired height and width (Type=Boolean) 161 | cropped_image_height: height of desired image height, defalut=256 (Type=Int) 162 | cropped_image_width: width of desired image width, default = 256 (Type=Int) 163 | random_crop_white_percent: threshold value of white percent in cropped image (Type=Int) 164 | random_crop_white_increment: increase the value of threshold by 3 after 15 unsuccessful tries max 98 (Type=Boolean) 165 | resize_image: Create a resized image of desired height and width (Type=Boolean) 166 | resize_image_height: height of desired image height, defalut=256 (Type=Int) 167 | resize_image_width: width of desired image width, default = 256 (Type=Int) 168 | flip_image: Create a fliped image based on desired axis 169 | flip_horizontal: whether image to flipped horizontally or not 170 | flip_vertical: whether image to flipped vertically or not 171 | rotate_image: Create a rotated image on the desired skew from original (Type=Boolean) 172 | rotate_skew: angle to be rotated in clockwise direction (Type=Int) 173 | rotate_random: randomly rotate the image (Type=Boolean) 174 | print_white_pix_precent: Print the white pixel percentage in image (Type=Boolean) 175 | ''' 176 | self.image = np.array(Image.open(image_path)) 177 | self.rotate_reshape = rotate_reshape 178 | self.resize_image = resize_image 179 | self.flip_image = flip_image 180 | self.rotate_image = rotate_image 181 | self.random_crop = random_crop 182 | self.flip_horizontal = flip_horizontal 183 | self.flip_vertical = flip_vertical 184 | self.resize_image_height = resize_image_height 185 | self.resize_image_width = resize_image_width 186 | self.rotate_skew = rotate_skew 187 | self.rotate_random = rotate_random 188 | self.cropped_image_height = cropped_image_height 189 | self.cropped_image_width = cropped_image_width 190 | self.random_crop_white_percent = random_crop_white_percent 191 | self.print_white_pix_precent = print_white_pix_precent 192 | 
self.random_crop_white_increment = random_crop_white_increment 193 | self.random_crop_white_percent = random_crop_white_percent 194 | self.seed = seed 195 | self.img_info = {'random_crop_white_pix': 0, 'random_crop_coordinates':(0,0), 'skew':0} 196 | 197 | if self.flip_image: 198 | self.image = self.get_flipped_image() 199 | if self.rotate_image: 200 | np.random.seed(self.seed) 201 | self.image = self.get_rotated_image() 202 | if self.random_crop: 203 | np.random.seed(self.seed) 204 | self.image = self.get_random_crop(self.image) 205 | if self.resize_image: 206 | self.image = self.get_resized_image() 207 | if self.print_white_pix_precent: 208 | print(self.check_white_pixels(self.image)) 209 | 210 | return self.image 211 | 212 | def image_generator(self,image_path, 213 | seed = 0, 214 | multiple_rotated_images = False, 215 | multiple_random_crops = False, 216 | cropped_image_height=256, 217 | cropped_image_width=256, 218 | random_crop_white_percent = 98, 219 | random_crop_white_increment = False, 220 | rotate_random = True, 221 | n_random = 50, 222 | n_rotated = 50, 223 | rotate_skew = 0, 224 | rotate_reshape = True 225 | ): 226 | ''' 227 | multiple_rotate: Genrate multiple random rotation of the image (Type=Boolean) 228 | n_rotated: No. of rotated images to genrate, defalut=50 (Type=Int) 229 | multiple_random: Genrate multiple random crops of the image (Type=Boolean) 230 | cropped_image_height: height of desired image height, defalut=256 (Type=Int) 231 | cropped_image_width: width of desired image width, default = 256 (Type=Int) 232 | n_random: No. of random crop of images to genrate, defalut=50 (Type=Int) 233 | random_crop_white_percent: threshold value of white percent in cropped image (Type=Int) 234 | random_crop_white_increment: increase the value of threshold by 3 after 15 unsuccessful tries max 98 (Type=Boolean) 235 | seed: seed value for random generators to replicate the results 236 | @Note 237 | multiple_rotate and multiple_random will output genrator instead of image which can iterated using next(output_gen) 238 | ''' 239 | self.image = np.array(Image.open(image_path)) 240 | self.seed = seed 241 | self.rotate_skew = rotate_skew 242 | self.rotate_reshape = rotate_reshape 243 | self.rotate_random = rotate_random 244 | self.cropped_image_height = cropped_image_height 245 | self.cropped_image_width = cropped_image_width 246 | self.random_crop_white_percent = random_crop_white_percent 247 | self.random_crop_white_increment = random_crop_white_increment 248 | self.n_random = n_random 249 | self.n_rotated = n_rotated 250 | self.multiple_rotated_images = multiple_rotated_images 251 | self.multiple_random_crops = multiple_random_crops 252 | self.new_img = None 253 | self.img_info = {'random_crop_white_pix': 0, 'random_crop_coordinates':(0,0), 'skew':0} 254 | 255 | if self.multiple_rotated_images and self.multiple_random_crops: 256 | return self.get_multiple_rotated_random_crop() 257 | if self.multiple_rotated_images: 258 | return self.get_multiple_rotated_images() 259 | self.image = self.get_rotated_image() 260 | 261 | if self.multiple_random_crops: 262 | return self.get_multiple_random_crop(self.image) 263 | 264 | --------------------------------------------------------------------------------
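The utility README above only demonstrates `multiple_random_crops`; `image_generator` also supports combining random rotations with random crops (both flags set to `True`), in which case it yields `n_rotated * n_random` (image, info) pairs. A minimal sketch, assuming a local image larger than the requested crop size (the path below is only a placeholder):

```python
from yubiai.vision.utility.preprocess import image_preprocessing

preprocessing_object = image_preprocessing()
image_path = "/home/ubuntu/some_image.jpeg"   # placeholder path

# For each of n_rotated random rotations, n_random random crops are produced.
output_image_gen = preprocessing_object.image_generator(image_path=image_path,
                                                        multiple_rotated_images=True,
                                                        multiple_random_crops=True,
                                                        cropped_image_height=256,
                                                        cropped_image_width=256,
                                                        n_rotated=3,
                                                        n_random=5,
                                                        seed=5)

for img, img_info in output_image_gen:
    # img_info carries the random rotation angle and the crop coordinates used for this sample
    print(img_info['skew'], img_info['random_crop_coordinates'])
```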