├── .dockerignore ├── .gitignore ├── Dockerfile ├── LICENSE.txt ├── MANIFEST.in ├── Makefile ├── README.rst ├── conda └── environments │ └── wordbatch_dev.yml ├── data └── Tweets.csv ├── scripts ├── backends_benchmark.py ├── classify_airline_sentiment.py ├── decorator_test.py ├── wordbag_regressor.py ├── wordbag_regressor_spark.py ├── wordhash_regressor.py ├── wordseq_regressor.py └── wordvec_regressor.py ├── setup.cfg ├── setup.py └── wordbatch ├── __init__.py ├── batcher.py ├── data_utils.py ├── extractors ├── MurmurHash3.cpp ├── MurmurHash3.h ├── __init__.py └── extractors.pyx ├── models ├── __init__.py ├── avx_ext.c ├── avx_ext.h ├── fm_ftrl.pyx ├── ftrl.pyx ├── ftrl32.pyx ├── nn_relu_h1.pyx └── nn_relu_h2.pyx ├── pipelines ├── __init__.py ├── apply.py ├── apply_batch.py ├── apply_groupby.py ├── batch_transformer.py ├── feature_union.py └── wordbatch.py └── transformers ├── __init__.py ├── dictionary.py ├── text_normalizer.py └── tokenizer.py /.dockerignore: -------------------------------------------------------------------------------- 1 | data/ 2 | scripts/ 3 | .git 4 | .gitignore 5 | Dockerfile* 6 | Makefile 7 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | db.sqlite3-journal 62 | 63 | # Flask stuff: 64 | instance/ 65 | .webassets-cache 66 | 67 | # Scrapy stuff: 68 | .scrapy 69 | 70 | # Sphinx documentation 71 | docs/_build/ 72 | 73 | # PyBuilder 74 | target/ 75 | 76 | # Jupyter Notebook 77 | .ipynb_checkpoints 78 | 79 | # IPython 80 | profile_default/ 81 | ipython_config.py 82 | 83 | # pyenv 84 | .python-version 85 | 86 | # pipenv 87 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 88 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 89 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 90 | # install all needed dependencies. 
91 | #Pipfile.lock 92 | 93 | # celery beat schedule file 94 | celerybeat-schedule 95 | 96 | # SageMath parsed files 97 | *.sage.py 98 | 99 | # Environments 100 | .env 101 | .venv 102 | env/ 103 | venv/ 104 | ENV/ 105 | env.bak/ 106 | venv.bak/ 107 | 108 | # Spyder project settings 109 | .spyderproject 110 | .spyproject 111 | 112 | # Rope project settings 113 | .ropeproject 114 | 115 | # mkdocs documentation 116 | /site 117 | 118 | # mypy 119 | .mypy_cache/ 120 | .dmypy.json 121 | dmypy.json 122 | 123 | # Pyre type checker 124 | .pyre/ 125 | 126 | # Other 127 | data/tripadvisor/json/* 128 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM continuumio/miniconda3:4.7.12 as builder 2 | 3 | WORKDIR /app 4 | 5 | ENV PATH /opt/conda/bin:$PATH 6 | ENV CONDA_PREFIX /opt/conda 7 | 8 | COPY conda/environments/wordbatch_dev.yml /app 9 | 10 | RUN apt-get --allow-releaseinfo-change update && \ 11 | apt-get install -y --no-install-recommends build-essential gcc && \ 12 | conda update -n base conda && \ 13 | conda env update -f /app/wordbatch_dev.yml 14 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. (Some other Free Software Foundation software is covered by 18 | the GNU Lesser General Public License instead.) You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 
42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 
102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 113 | 114 | These requirements apply to the modified work as a whole. If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. (This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. 
However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. These actions are 183 | prohibited by law if you do not accept this License. Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 
214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 279 | 280 | END OF TERMS AND CONDITIONS 281 | 282 | How to Apply These Terms to Your New Programs 283 | 284 | If you develop a new program, and you want it to be of the greatest 285 | possible use to the public, the best way to achieve this is to make it 286 | free software which everyone can redistribute and change under these terms. 287 | 288 | To do so, attach the following notices to the program. It is safest 289 | to attach them to the start of each source file to most effectively 290 | convey the exclusion of warranty; and each file should have at least 291 | the "copyright" line and a pointer to where the full notice is found. 292 | 293 | 294 | Copyright (C) 295 | 296 | This program is free software; you can redistribute it and/or modify 297 | it under the terms of the GNU General Public License as published by 298 | the Free Software Foundation; either version 2 of the License, or 299 | (at your option) any later version. 300 | 301 | This program is distributed in the hope that it will be useful, 302 | but WITHOUT ANY WARRANTY; without even the implied warranty of 303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 304 | GNU General Public License for more details. 305 | 306 | You should have received a copy of the GNU General Public License along 307 | with this program; if not, write to the Free Software Foundation, Inc., 308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 309 | 310 | Also add information on how to contact you by electronic and paper mail. 311 | 312 | If the program is interactive, make it output a short notice like this 313 | when it starts in an interactive mode: 314 | 315 | Gnomovision version 69, Copyright (C) year name of author 316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 317 | This is free software, and you are welcome to redistribute it 318 | under certain conditions; type `show c' for details. 319 | 320 | The hypothetical commands `show w' and `show c' should show the appropriate 321 | parts of the General Public License. Of course, the commands you use may 322 | be called something other than `show w' and `show c'; they could even be 323 | mouse-clicks or menu items--whatever suits your program. 324 | 325 | You should also get your employer (if you work as a programmer) or your 326 | school, if any, to sign a "copyright disclaimer" for the program, if 327 | necessary. Here is a sample; alter the names: 328 | 329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 330 | `Gnomovision' (which makes passes at compilers) written by James Hacker. 331 | 332 | , 1 April 1989 333 | Ty Coon, President of Vice 334 | 335 | This General Public License does not permit incorporating your program into 336 | proprietary programs. 
If your program is a subroutine library, you may 337 | consider it more useful to permit linking proprietary applications with the 338 | library. If this is what you want to do, use the GNU Lesser General 339 | Public License instead of this License. 340 | 341 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE.txt 2 | 3 | include wordbatch/__init__.py 4 | include wordbatch/wordbatch.py 5 | 6 | include wordbatch/extractors/__init__.py 7 | include wordbatch/extractors/extractors.pyx 8 | include wordbatch/extractors/MurmurHash3.cpp 9 | include wordbatch/extractors/MurmurHash3.h 10 | 11 | include wordbatch/models/__init__.py 12 | include wordbatch/models/ftrl.pyx 13 | include wordbatch/models/fm_ftrl.pyx 14 | include wordbatch/models/nn_relu_h1.pyx 15 | include wordbatch/models/nn_relu_h2.pyx 16 | include wordbatch/models/avx_ext.c 17 | include wordbatch/models/avx_ext.h 18 | 19 | include scripts/wordbag_regressor.py 20 | include scripts/wordhash_regressor.py 21 | include scripts/wordseq_regressor.py 22 | include scripts/wordvec_regressor.py 23 | include scripts/classify_airline_sentiment.py 24 | include scripts/backends_benchmark.py 25 | 26 | include data/Tweets.csv -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | IMAGE_NAME=wordbatch 2 | CONTAINER_NAME=wordbatch_dev 3 | 4 | build: ## Build the image 5 | docker build -t $(IMAGE_NAME) . 6 | 7 | run-dev: ## Run container for development 8 | docker run \ 9 | -it \ 10 | --name=$(CONTAINER_NAME) \ 11 | -v $(shell pwd):/wordbatch $(IMAGE_NAME) bash 12 | 13 | attach: ## Run a bash in a running container 14 | docker start $(CONTAINER_NAME) && docker attach $(CONTAINER_NAME) 15 | 16 | stop: ## Stop and remove a running container 17 | docker stop $(CONTAINER_NAME); docker rm $(CONTAINER_NAME) 18 | 19 | test: 20 | docker start $(CONTAINER_NAME) 21 | docker exec -it $(CONTAINER_NAME) env | grep PATH 22 | docker exec -it $(CONTAINER_NAME) which python 23 | docker exec -it $(CONTAINER_NAME) python -c '\ 24 | import wordbatch;\ 25 | from wordbatch import models;\ 26 | print(wordbatch.__version__)' 27 | #docker exec -it $(CONTAINER_NAME) python -c 'print("hello")' 28 | #docker exec -it $(CONTAINER_NAME) echo "Hello from container!" -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | =============== 2 | Wordbatch 1.4.9 3 | =============== 4 | 5 | Overview 6 | ======== 7 | 8 | Python library for distributed AI processing pipelines, using swappable scheduler backends. 9 | 10 | Wordbatch parallelizes task pipelines as minibatches processed by a chosen scheduler backend. This allows 11 | the user to develop AI programs on a local workstation or laptop, and scale the same 12 | solution on a cluster or the cloud, simply by changing the pipeline backend to a distributed scheduler such as Spark, 13 | Dask or Ray. A backend can be chosen based on performance characteristics on a particular task, and swapped for 14 | different situations. For example, an AI model can be trained using a distributed backend, and then debugged or 15 | deployed using a single serial process. 
16 | 
17 | The library is organized around the orchestrator class Batcher, and Scikit-learn compatible components, 
18 | split into Pipelines, Transformers, Extractors and Models. These extend the Scikit-learn API with a 
19 | fit_partial() method that enables transformers and models to be used in a streaming fashion. 
20 | The current set of components has been developed mostly for text processing tasks, but components for other domains 
21 | can be developed based on the available classes. 
22 | 
23 | Requirements 
24 | ============ 
25 | Linux / Windows / macOS. Python 3.6 / 3.7 
26 | 
27 | Installation 
28 | ============ 
29 | pip install wordbatch 
30 | 
31 | macOS: compile using GCC-7 (https://github.com/anttttti/Wordbatch/issues/1) 
32 | 
33 | Linux: make sure GCC and its required libraries are installed before installing Wordbatch 
34 | | sudo apt install gcc 
35 | | sudo apt-get update 
36 | | sudo apt-get install --reinstall build-essential 
37 | 
38 | Getting started 
39 | =============== 
40 | 
41 | | from wordbatch.models import FTRL 
42 | | from wordbatch.extractors import WordBag 
43 | | from wordbatch.pipelines import WordBatch 
44 | | from wordbatch.batcher import Batcher 
45 | | 
46 | | wb= WordBatch(extractor=WordBag(hash_ngrams=0, norm= 'l2', tf= 'binary', idf= 50.0), 
47 | |               batcher=Batcher(backend="multiprocessing")) 
48 | | 
49 | | clf= FTRL(alpha=1.0, beta=1.0, L1=0.00001, L2=1.0, D=2 ** 25, iters=1) 
50 | | 
51 | | train_texts= ["Cut down a tree with a herring? It can't be done.", 
52 | |               "Don't say that word.", 
53 | |               "How can we not say the word if you don't tell us what it is?"] 
54 | | train_labels= [1, 0, 1] 
55 | | test_texts= ["Wait! I said it! I said it! Ooh! I said it again!"] 
56 | | 
57 | | clf.fit(wb.fit_transform(train_texts), train_labels) 
58 | | print(clf.predict(wb.transform(test_texts))) 
59 | | 
60 | | import ray 
61 | | ray.init() 
62 | | wb.batcher.backend= "ray" 
63 | | wb.batcher.backend_handle= ray 
64 | | 
65 | | clf.fit(wb.fit_transform(train_texts), train_labels) 
66 | | print(clf.predict(wb.transform(test_texts))) 
67 | 
68 | 
69 | Components 
70 | ========== 
71 | 
72 | Batcher 
73 | ------- 
74 | Batcher orchestrates MapReduce processing of tasks using a backend, by splitting input data into separately processed 
75 | minibatches. Currently three local (serial, multiprocessing, Loky) and three distributed backends (Spark, Dask, 
76 | Ray) are supported. Some distributed backends will process the tasks concurrently as a graph of lazily evaluated 
77 | futures, with Batcher dynamically sending the graph for the backend to process. All three supported distributed 
78 | backends allow real-time monitoring of the processing pipeline using the backend's own GUI. 
79 | 
80 | 
81 | Pipelines 
82 | --------- 
83 | Pipelines are classes that send functions, methods and classes to Batcher for processing. Unlike other components in 
84 | Wordbatch, pipelines contain a reference to Batcher, and are never referenced themselves in the calls sent to Batcher. 
85 | This prevents trying to serialize and send the backend handle itself. The simplest pipeline is Apply, 
86 | which processes a function or method over the input data row-by-row. WordBatch is a full, complex pipeline for text 
87 | processing, with optional steps such as text normalization, spelling correction, stemming, feature extraction, and 
88 | LZ4-caching of results. 
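
A minimal sketch of Apply and its minibatch-level counterpart ApplyBatch, assuming both follow the calling
pattern used in scripts/backends_benchmark.py (function first, Batcher passed as the batcher argument); the
backend and minibatch_size below are placeholder choices:

| from sklearn.feature_extraction.text import HashingVectorizer
| from wordbatch.pipelines import Apply, ApplyBatch
| from wordbatch.batcher import Batcher
|
| batcher= Batcher(backend="multiprocessing", minibatch_size=10000)
| texts= ["Cut down a tree with a herring? It can't be done.", "Don't say that word."]
|
| # Apply distributes a plain function over the input rows
| lengths= Apply(len, batcher=batcher).transform(texts)
|
| # ApplyBatch distributes a function that consumes a whole minibatch at once
| hv= HashingVectorizer(n_features=2 ** 25, ngram_range=(1, 2), norm='l2')
| features= ApplyBatch(hv.transform, batcher=batcher).transform(texts)

ApplyBatch is the better fit for vectorized functions such as the HashingVectorizer call above, since the
per-call overhead is paid once per minibatch rather than once per row.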
89 | 
90 | 
91 | Transformers 
92 | ------------ 
93 | Transformers are transformer classes extending the Scikit-learn API, by accepting a Batcher instance as argument 
94 | of fit and transform methods. Transformers won't store Batcher references, allowing the transformer objects to be sent 
95 | to distributed workers. This allows transformers to do MapReduce operations as part of their methods, for example 
96 | gathering a dictionary of words from data when fitting a Dictionary. The current set of transformers consists of 
97 | text-specific classes, such as Dictionary, Tokenizer and TextNormalizer. 
98 | 
99 | 
100 | Extractors 
101 | ---------- 
102 | Extractors are transformer classes which don't directly call Batcher. Since extractors can't call Batcher directly, 
103 | they are mostly immutable and used through their transform() method calls, which are distributed using a pipeline. 
104 | The current set of extractors is Cython-optimized and, aside from PandasHash, intended for text feature extraction. These are: 
105 | 
106 | - WordHash is a wrapper for the Scikit-learn HashingVectorizer, extended with an option for LZ4-caching 
107 | - WordBag is a flexible alternative to WordHash, with options such as IDF and per n-gram order weighting 
108 | - WordSeq provides sequences of word integers, as used by deep learning language models 
109 | - WordVec embeds words into word vector representations 
110 | - PandasHash extracts hashed features from a Pandas DataFrame, similar to VowpalWabbit's feature extraction 
111 | 
112 | 
113 | Models 
114 | ------ 
115 | Models are predictive models such as classifiers. Similar to extractors, they don't directly call Batcher, but are 
116 | Scikit-learn compatible and distributed using a pipeline if needed. Currently four 
117 | OpenMP-multithreaded L1&L2-regularized online learning models are provided, for single-label regression and 
118 | classification: 
119 | 
120 | - FTRL : Linear Proximal-FTRL model that has become the most popular algorithm for online learning of linear models in Kaggle competitions. The Cython-optimized implementation should be the fastest available version of FTRL. 
121 | - FM_FTRL : Factorization Machines. Linear effects estimated with FTRL and factor effects estimated with adaptive SGD. Prediction and estimation multithreaded across factors. 
122 | - NN_Relu_H1 : Neural Network with 1 hidden layer and Rectified Linear Unit activations, estimated with adaptive SGD. Prediction and estimation multithreaded across hidden layer. 
123 | - NN_Relu_H2 : Neural Network with 2 hidden layers and Rectified Linear Unit activations, estimated with adaptive SGD. Prediction multithreaded across 2nd hidden layer, estimation across 1st hidden layer outputs. 
124 | 
125 | The adaptive SGD optimizer works like Adagrad, but pools the adaptive learning rates across hidden nodes using the same 
126 | feature. This makes learning more robust and requires less memory. FM_FTRL uses AVX2-optimization, so that processors 
127 | supporting AVX2 will run the factorization model up to four times faster. 
128 | 
129 | Example scripts 
130 | =============== 
131 | 
132 | The directory /scripts/ contains scripts for demonstrating and testing basic uses of the toolkit. To run the scripts 
133 | one should first install the dependencies: Keras, NLTK, TextBlob, Pandas, Ray, Dask Distributed and PySpark. 
134 | The scripts also use the TripAdvisor dataset (http://times.cs.uiuc.edu/~wang296/Data/), and the 
135 | precomputed word embeddings glove.twitter.27B.100d and glove.6B.50d (http://nlp.stanford.edu/projects/glove/). 
Test 136 | data from Crowdflower Open data & Kaggle is provided in the /data directory. 137 | 138 | Airline Classification Example 139 | ------------------------------ 140 | classify_airline_sentiment.py shows training and combining predictions with four classifier scripts that use the 141 | Wordbatch extractors and models: wordhash_regressor.py, wordbag_regressor.py, wordseq_regressor.py and 142 | wordvec_regressor.py. The header part of the script can be modified to choose the backend. By default Ray is used and 143 | passed to the other scripts. 144 | 145 | Backends Benchmark Example 146 | -------------------------- 147 | backends_benchmark.py shows how to benchmark different backends on two simple pipeline tasks: 148 | using ApplyBatch with Scikit-learn HashingVectorizer, and running WordBatch Pipeline with most of its possible 149 | processing steps. Dask and Spark are commented out by default, as these need command-line configuration. 150 | All three distributed backends can be configured to run across a distributed cluster, as done in the 151 | commented-out code. 152 | 153 | 154 | Contributors 155 | ============ 156 | Antti Puurula 157 | 158 | Anders Topper 159 | 160 | Cheng-Tsung Liu 161 | -------------------------------------------------------------------------------- /conda/environments/wordbatch_dev.yml: -------------------------------------------------------------------------------- 1 | name: base 2 | channels: 3 | - conda-forge 4 | - defaults 5 | dependencies: 6 | - python>=3.7.3 7 | - numpy==1.20.3 8 | - cython==0.29.10 9 | - pip 10 | # - nltk>=3.4.3 11 | # - textblob 12 | - pip: 13 | - wordbatch==1.4.8 14 | # - keras==2.3.1 15 | # - pyspark==2.4.0 16 | # - dask>=2.1.0 17 | # - distributed>=2.1.0 18 | prefix: /opt/conda -------------------------------------------------------------------------------- /scripts/backends_benchmark.py: -------------------------------------------------------------------------------- 1 | import re 2 | from contextlib import closing, contextmanager 3 | import time 4 | from wordbatch.pipelines import WordBatch, Apply, ApplyBatch 5 | from wordbatch.extractors import WordHash, WordBag 6 | from wordbatch.transformers import Tokenizer, Dictionary 7 | from wordbatch.batcher import Batcher 8 | import os 9 | import json 10 | from sklearn.feature_extraction.text import HashingVectorizer 11 | import warnings 12 | import pandas as pd 13 | import multiprocessing 14 | 15 | # http://sifaka.cs.uiuc.edu/~wang296/Data/LARA/TripAdvisor/ 16 | tripadvisor_dir= "../data/tripadvisor/json" 17 | 18 | # Configure below to allow Dask / Spark 19 | # scheduler_ip= "169.254.93.14" 20 | # from dask.distributed import Client 21 | # #dask-scheduler --host 169.254.93.14 22 | # #dask-worker 169.254.93.14:8786 --nprocs 16 23 | # dask_client = Client(scheduler_ip+":8786") 24 | # 25 | # from pyspark import SparkContext, SparkConf 26 | # # conf= SparkConf().setAll([('spark.executor.memory', '4g'), ('spark.driver.memory', '30g'), 27 | # # ('spark.driver.maxResultSize', '10g')]) 28 | # import os 29 | # os.environ['PYSPARK_PYTHON'] = '/home/USERNAME/anaconda3/envs/ENV_NAME/bin/python' 30 | # conf= SparkConf().setAll([('spark.executor.memory', '4g'), ('spark.driver.memory', '30g'), 31 | # ('spark.driver.maxResultSize', '10g')]).setMaster("spark://169.254.93.14:7077") 32 | # spark_context = SparkContext(conf=conf) 33 | 34 | import ray 35 | #ray start --head --node-ip-address 169.254.93.14 36 | #ray.init(redis_address=scheduler_ip+":57113") #Change port accordingly 37 | ray.init() 38 | 39 | 
@contextmanager 40 | def timer(name): 41 | t0 = time.time() 42 | yield 43 | print(name + " done in " + str(time.time() - t0) + "s") 44 | 45 | if 1==1: 46 | texts= [] 47 | for jsonfile in os.listdir(tripadvisor_dir): 48 | with open(tripadvisor_dir + "/" + jsonfile, 'r') as inputfile: 49 | for line in inputfile: 50 | try: 51 | line = json.loads(line.strip()) 52 | except: 53 | continue 54 | for review in line["Reviews"]: 55 | texts.append(review["Content"]) 56 | # pd.to_pickle(texts, "tripadvisor_data.pkl") 57 | # else: 58 | # texts= pd.read_pickle("tripadvisor_data.pkl") 59 | 60 | non_alphanums = re.compile('[\W+]') 61 | nums_re= re.compile("\W*[0-9]+\W*") 62 | triples_re= re.compile(r"(\w)\1{2,}") 63 | trash_re= [re.compile("<[^>]*>"), re.compile("[^a-z0-9' -]+"), re.compile(" [.0-9'-]+ "), re.compile("[-']{2,}"), 64 | re.compile(" '"),re.compile(" +")] 65 | from nltk.stem.porter import PorterStemmer 66 | stemmer= PorterStemmer() 67 | 68 | def normalize_text(text): 69 | text= text.lower() 70 | text= nums_re.sub(" NUM ", text) 71 | text= " ".join([word for word in non_alphanums.sub(" ",text).strip().split() if len(word)>1]) 72 | return text 73 | 74 | print(len(texts)) 75 | backends= [ 76 | ['serial', ""], 77 | ['multiprocessing', ""], 78 | ['loky', ""], 79 | # ['dask', dask_client], # Uncomment once configured 80 | # ['spark', spark_context], # Uncomment once configured 81 | ['ray', ray] 82 | ] 83 | 84 | tasks= [ 85 | "ApplyBatch", 86 | "WordBag", 87 | ] 88 | 89 | data_sizes= [40000, 80000, 160000, 320000, 640000, 1280000] 90 | 91 | for task in tasks: 92 | for data_size in data_sizes: 93 | texts_chunk = texts[:data_size] 94 | print("Task:", task, "Data size:", data_size) 95 | for backend in backends: 96 | batcher = Batcher(procs=multiprocessing.cpu_count(), minibatch_size=5000, backend=backend[0], backend_handle=backend[1]) 97 | try: 98 | with timer("Completed: ["+task+","+str(len(texts_chunk))+","+backend[0]+"]"), warnings.catch_warnings(): 99 | warnings.simplefilter("ignore") 100 | if task=="ApplyBatch": 101 | hv = HashingVectorizer(decode_error='ignore', n_features=2 ** 25, preprocessor=normalize_text, 102 | ngram_range=(1, 2), norm='l2') 103 | t= ApplyBatch(hv.transform, batcher=batcher).transform(texts_chunk) 104 | print(t.shape, t.data[:5]) 105 | 106 | if task=="WordBag": 107 | wb = WordBatch(normalize_text=normalize_text, 108 | dictionary=Dictionary(min_df=10, max_words=1000000, verbose=0), 109 | tokenizer= Tokenizer(spellcor_count=2, spellcor_dist=2, stemmer= stemmer), 110 | extractor=WordBag(hash_ngrams=0, norm= 'l2', tf= 'binary', idf= 50.0), 111 | batcher= batcher, 112 | verbose= 0) 113 | t = wb.fit_transform(texts_chunk) 114 | print(t.shape, t.data[:5]) 115 | except: 116 | print("Failed: ["+task+","+str(len(texts_chunk))+","+backend[0]+"]") 117 | print("") -------------------------------------------------------------------------------- /scripts/classify_airline_sentiment.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import scipy as sp 4 | import re 5 | import sklearn 6 | from sklearn.model_selection import * 7 | from sklearn.ensemble import RandomForestRegressor 8 | import textblob 9 | from math import * 10 | import time, datetime 11 | import multiprocessing 12 | from wordbatch.batcher import Batcher 13 | import wordbatch 14 | 15 | print("Wordbatch version:", wordbatch.__version__) 16 | pd.set_option('display.max_rows', 500) 17 | pd.set_option('display.max_columns', 500) 18 | 
pd.set_option('display.max_colwidth', 1000) 19 | 20 | backend= "ray" 21 | #backend= "multiprocessing" 22 | minibatch_size= 10000 23 | if backend == "ray": 24 | import ray 25 | ray.init() 26 | b = Batcher(backend="ray", backend_handle=ray, minibatch_size=minibatch_size) 27 | if backend == "multiprocessing": 28 | b = Batcher(backend="multiprocessing", minibatch_size=minibatch_size) 29 | 30 | # http://sifaka.cs.uiuc.edu/~wang296/Data/LARA/TripAdvisor/ 31 | tripadvisor_dir= "../data/tripadvisor/json" 32 | if __name__ == "__main__": 33 | start_time= time.time() 34 | print(datetime.datetime.now()) 35 | 36 | #df= pd.DataFrame.from_csv("../data/Tweets.csv", encoding="utf8") 37 | df = pd.read_csv("../data/Tweets.csv", encoding="utf8") 38 | def sentiment_to_label(sentiment): 39 | if sentiment=="neutral": return 0 40 | if sentiment=="negative": return -1 41 | return 1 42 | d_sentiment_to_label= {"neutral":0, "negative":-1, "positive":1} 43 | df['airline_sentiment_confidence']= df['airline_sentiment_confidence'].astype('str') 44 | df['sentiment'] = (df['airline_sentiment']).map(d_sentiment_to_label) 45 | df= df[['text','sentiment']] 46 | 47 | re_attags= re.compile(" @[^ ]* ") 48 | re_spaces= re.compile("\w+]") 49 | df['text']= df['text'].apply(lambda x: re_spaces.sub(" ",re_attags.sub(" ", " "+x+" "))[1:-1]) 50 | df= df.drop_duplicates(subset=['text']) 51 | df.index= df['id']= range(df.shape[0]) 52 | 53 | non_alphanums=re.compile('[^A-Za-z]+') 54 | def normalize_text(text): return non_alphanums.sub(' ', text).lower().strip() 55 | df['text_normalized']= df['text'].map(lambda x: normalize_text(x)) 56 | df['textblob_score']= df['text_normalized'].map(lambda x: textblob.TextBlob(x).polarity) 57 | 58 | import wordbag_regressor 59 | print("Train wordbag regressor") 60 | wb_regressor= wordbag_regressor.WordbagRegressor("../models/wordbag_model.pkl.gz", tripadvisor_dir, b) 61 | #wb_regressor= wordbag_regressor.WordbagRegressor("../models/wordbag_model.pkl.gz") 62 | df['wordbag_score']= wb_regressor.predict(df['text'].values) 63 | print(("%s minutes ---" % round(((time.time() - start_time) / 60), 2))) 64 | 65 | import wordhash_regressor 66 | print("Train wordhash regressor") 67 | wh_regressor= wordhash_regressor.WordhashRegressor("../models/wordhash_model.pkl.gz", tripadvisor_dir, b) 68 | #wh_regressor= wordhash_regressor.WordhashRegressor("../models/wordhash_model.pkl.gz") 69 | df['wordhash_score']= wh_regressor.predict(df['text'].values) 70 | print(("%s minutes ---" % round(((time.time() - start_time) / 60), 2))) 71 | 72 | import wordseq_regressor 73 | print("Train wordseq regressor") 74 | ws_regressor = wordseq_regressor.WordseqRegressor("../models/wordseq_model.pkl.gz", tripadvisor_dir, b) 75 | #ws_regressor = wordseq_regressor.WordseqRegressor("../models/wordseq_model.pkl.gz") 76 | df['wordseq_score']= ws_regressor.predict_batch(df['text'].values) 77 | print(("%s minutes ---" % round(((time.time() - start_time) / 60), 2))) 78 | 79 | import wordvec_regressor 80 | print("Train wordvec regressor") 81 | wv_regressor= wordvec_regressor.WordvecRegressor("../models/wordvec_model.pkl.gz", tripadvisor_dir, b) 82 | #wv_regressor= wordvec_regressor.WordvecRegressor("../models/wordvec_model.pkl.gz") 83 | df['wordvec_score'] = wv_regressor.predict(df['text'].values) 84 | print(df['wordvec_score']) 85 | print(("%s minutes ---" % round(((time.time() - start_time) / 60), 2))) 86 | 87 | df['tweet_len']= df['text'].map(lambda x: log(1+len(x))) 88 | df['tweet_wordcount']= df['text'].map(lambda x: log(1+len(x.split()))) 89 | 
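	# The per-model scores (wordbag, wordhash, wordseq, wordvec, textblob) and the simple length
	# features computed above are combined below by a RandomForest meta-regressor. Predictions are
	# made out-of-fold with a 10-fold split, so each tweet is scored by a model that did not see it
	# in training, then clipped to [-1, 1] and evaluated against the sentiment labels with MSE.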
90 | print(df) 91 | full_preds= np.zeros(df.shape[0]) 92 | columns_pick= ['tweet_len', 'tweet_wordcount', 'wordbag_score', 'wordhash_score', 'wordseq_score', 'wordvec_score', 'textblob_score'] #Mean Squared Error: 0.28730889581 93 | #columns_pick= ['tweet_len', 'tweet_wordcount', 'wordhash_score', 'wordseq_score', 'wordvec_score', 'textblob_score'] # 94 | #columns_pick= ['tweet_len', 'tweet_wordcount', 'wordbag_score', 'wordseq_score', 'wordvec_score', 'textblob_score'] # 95 | #columns_pick= ['tweet_len', 'tweet_wordcount', 'wordbag_score', 'wordhash_score', 'wordvec_score', 'textblob_score'] # 96 | #columns_pick= ['tweet_len', 'tweet_wordcount', 'wordbag_score', 'wordhash_score', 'wordseq_score', 'textblob_score'] # 97 | 98 | kf= KFold(n_splits=10, shuffle=True, random_state=0) 99 | for train_index, dev_index in kf.split(range(df.shape[0])): 100 | df_train= df.iloc[train_index] 101 | df_dev= df.iloc[dev_index] 102 | clf= RandomForestRegressor(n_estimators=200, criterion='mse', max_depth=None, min_samples_split=5, 103 | min_samples_leaf=2, min_weight_fraction_leaf=0.0, max_features='auto', 104 | max_leaf_nodes=None, bootstrap=True, oob_score=False, 105 | n_jobs=multiprocessing.cpu_count(), random_state=0, 106 | verbose=0, warm_start=False) 107 | 108 | clf.fit(df_train[columns_pick], df_train['sentiment']) 109 | preds= clf.predict(df_dev[columns_pick]) 110 | for x in range(len(preds)): full_preds[df_dev['id'].iloc[x]]= preds[x] 111 | 112 | df['preds']= sp.clip(full_preds, -1.0, 1.0) 113 | 114 | print(datetime.datetime.now()) 115 | print(("%s minutes ---" % round(((time.time() - start_time)/60),2))) 116 | 117 | c_mse= sklearn.metrics.mean_squared_error(df['sentiment'], df['preds'], sample_weight=None, 118 | multioutput='uniform_average') 119 | print("Mean Squared Error:", c_mse) 120 | -------------------------------------------------------------------------------- /scripts/decorator_test.py: -------------------------------------------------------------------------------- 1 | import re 2 | from contextlib import contextmanager 3 | import time 4 | from wordbatch.pipelines import decorator_apply as apply 5 | from wordbatch.batcher import Batcher 6 | import warnings 7 | import pandas as pd 8 | from nltk.stem.porter import PorterStemmer 9 | from numba import int64, float64 10 | import os 11 | import json 12 | 13 | tripadvisor_dir= "../data/tripadvisor/json" 14 | 15 | import ray 16 | #ray start --head --node-ip-address 169.254.93.14 17 | #ray.init(redis_address=scheduler_ip+":57113") #Change port accordingly 18 | ray.init() 19 | 20 | @contextmanager 21 | def timer(name): 22 | t0 = time.time() 23 | yield 24 | print(name + " done in " + str(time.time() - t0) + "s") 25 | 26 | if 1==1: 27 | texts= [] 28 | for jsonfile in os.listdir(tripadvisor_dir): 29 | with open(tripadvisor_dir + "/" + jsonfile, 'r') as inputfile: 30 | for line in inputfile: 31 | try: 32 | line = json.loads(line.strip()) 33 | except: 34 | continue 35 | for review in line["Reviews"]: 36 | texts.append(review["Content"]) 37 | # pd.to_pickle(texts, "tripadvisor_data.pkl") 38 | # else: 39 | # texts= pd.read_pickle("tripadvisor_data.pkl") 40 | 41 | non_alphanums = re.compile('[\W+]') 42 | stemmer= PorterStemmer() 43 | 44 | def normalize_text(text): 45 | text= " ".join([word for word in non_alphanums.sub(" ",text.lower()).strip().split() if len(word)>1]) 46 | return text 47 | 48 | print(len(texts)) 49 | backends= [ 50 | ###['multiprocessing', ""], #doesn't serialize lambda functions 51 | ['ray', ray], 52 | ['loky', ""], 53 | ['serial', 
""], 54 | ] 55 | 56 | #data_size= 200000 57 | #data_size= 500000 58 | data_size= 1280000 59 | 60 | def test_backend(texts, backend): 61 | df = pd.DataFrame(texts, columns=['text']) 62 | df['text']= df['text'].fillna("") 63 | batcher = Batcher(minibatch_size=5000, backend=backend[0], backend_handle=backend[1]) 64 | #batcher = Batcher(minibatch_size=data_size//8, backend=backend[0], backend_handle=backend[1]) 65 | if backend[0]=="ray": 66 | backend[1].shutdown() 67 | backend[1].init() 68 | 69 | try: 70 | with timer("Text normalization: " + str(len(df)) + "," + backend[0]), warnings.catch_warnings(): 71 | warnings.simplefilter("ignore") 72 | df['text_normalized'] = apply(normalize_text, batcher)(df['text']) 73 | with timer("Text normalization without Wordbatch: " + str(len(df)) + "," + backend[0]) \ 74 | , warnings.catch_warnings(): 75 | warnings.simplefilter("ignore") 76 | df['text_normalized'] = [normalize_text(x) for x in df['text']] 77 | except Exception as e: 78 | print("Failed text normalization: " +"," + str(len(df)) + "," + backend[0]) 79 | # #"Exception:", e.split("\n")[0]) 80 | 81 | try: 82 | def div(x, y): 83 | return 0 if y==0 else x / y 84 | df['len_text'] = df['text'].str.len().astype(int) 85 | df['len_text_normalized'] = df['text_normalized'].str.len().astype(int) 86 | # list(zip(df['text_normalized'], df['text_normalized'])) 87 | # np.vstack([df['text_normalized'], df['text_normalized']]).T 88 | with timer("Text length ratio vectorized: " + str(len(df)) + "," + backend[0]), warnings.catch_warnings(): 89 | df['len_ratio'] = apply(div, batcher, vectorize=[float64(int64, int64)])( 90 | df[['len_text', 'len_text_normalized']].values) 91 | with timer("Text length ratio without vectorization: " + str(len(df)) + "," + backend[0]), \ 92 | warnings.catch_warnings(): 93 | df['len_ratio'] = apply(lambda x:div(*x), batcher)(df[['len_text', 'len_text_normalized']].values) 94 | with timer("Text length ratio without Wordbatch: " + str(len(df)) + "," + backend[0]), warnings.catch_warnings(): 95 | df['len_ratio'] = [div(x, y) for x, y in zip(df['len_text'], df['len_text_normalized'])] 96 | except Exception as e: 97 | print("Failed text length ratios: " +"," + str(len(df)) + "," + backend[0]) 98 | # #return 99 | 100 | try: 101 | with timer("Splitting first word: " + str(len(df)) + "," + backend[0]) \ 102 | , warnings.catch_warnings(): 103 | warnings.simplefilter("ignore") 104 | df['first_word'] = apply(lambda x: x.split(" ")[0], batcher)(df['text_normalized']) 105 | with timer("Splitting first word without Wordbatch: " + str(len(df)) + "," + backend[0]), \ 106 | warnings.catch_warnings(): 107 | warnings.simplefilter("ignore") 108 | df['first_word'] = [x.split(" ")[0] for x in df['text_normalized']] 109 | except Exception as e: 110 | print("Failed splitting first word: " + str(len(df)) + "," + backend[0]) 111 | # "Exception:", e.split("\n")[0]) 112 | 113 | try: 114 | with timer("Stemming first word: " + str(len(df)) + "," + backend[0]), \ 115 | warnings.catch_warnings(): 116 | warnings.simplefilter("ignore") 117 | df['first_word_stemmed'] = apply(stemmer.stem, batcher)(df['first_word']) 118 | with timer("Stemming first word without Wordbatch: " + str(len(df)) + "," + backend[0]), \ 119 | warnings.catch_warnings(): 120 | warnings.simplefilter("ignore") 121 | df['first_word_stemmed'] = [stemmer.stem(x) for x in df['first_word']] 122 | except Exception as e: 123 | print("Failed stemming first word: " + str(len(df)) + "," + backend[0]) 124 | # # "Exception:", e.split("\n")[0]) 125 | # 126 | try: 127 
| with timer("Stemming first word (cache=1000): " + str(len(df)) + "," + backend[0]), \ 128 | warnings.catch_warnings(): 129 | warnings.simplefilter("ignore") 130 | df['first_word_stemmed'] = apply(stemmer.stem, batcher, cache=1000)(df['first_word']) 131 | 132 | with timer("Stemming first word (cache=1000) without Wordbatch: " + str(len(df)) + "," + backend[0]), \ 133 | warnings.catch_warnings(): 134 | warnings.simplefilter("ignore") 135 | from functools import lru_cache 136 | cache_stem= lru_cache(maxsize=1000)(stemmer.stem) 137 | df['first_word_stemmed'] = [cache_stem(x) for x in df['first_word']] 138 | except Exception as e: 139 | print("Failed stemming first word: " + str(len(df)) + "," + backend[0]) 140 | # "Exception:", e.split("\n")[0]) 141 | 142 | try: 143 | batcher.minibatch_size = 200 144 | with timer("Groupby aggregation: " + str(len(df)) + "," + backend[0]), \ 145 | warnings.catch_warnings(): 146 | warnings.simplefilter("ignore") 147 | group_ids, groups = zip(*df[['first_word_stemmed', 'text']].groupby('first_word_stemmed')) 148 | res = apply(lambda x: x['text'].str.len().agg('mean'), batcher)(groups) 149 | df['first_word_stemmed_mean_text_len'] = df['first_word_stemmed'].map( 150 | {x: y for x, y in zip(group_ids, res)}) 151 | 152 | batcher.minibatch_size = 10 153 | df['first_word_stemmed_hashbin'] = [hash(x) % 500 for x in df['first_word_stemmed']] 154 | with timer("Groupby aggregation hashbin: " + str(len(df)) + "," + backend[0]), \ 155 | warnings.catch_warnings(): 156 | warnings.simplefilter("ignore") 157 | group_ids, groups = zip(*df[['first_word_stemmed', 'text', 'first_word_stemmed_hashbin']] 158 | .groupby('first_word_stemmed_hashbin')) 159 | res = pd.concat(apply(lambda x: x.groupby('first_word_stemmed').apply( 160 | lambda z: z['text'].str.len().agg('mean')), batcher)(groups)) 161 | df['first_word_stemmed_mean_text_len'] = df['first_word_stemmed'].map(res) 162 | 163 | with timer("Groupby aggregation without Wordbatch: " + str(len(df)) + "," + backend[0]), \ 164 | warnings.catch_warnings(): 165 | warnings.simplefilter("ignore") 166 | group_ids, groups = zip(*df[['first_word_stemmed', 'text']].groupby('first_word_stemmed')) 167 | res = [x['text'].str.len().agg('mean') for x in groups] 168 | df['first_word_stemmed_mean_text_len'] = df['first_word_stemmed'].map( 169 | {x: y for x, y in zip(group_ids, res)}) 170 | del (res, group_ids, groups, df, batcher) 171 | except Exception as e: 172 | print("Failed groupby aggregation: " + str(len(df)) + "," + backend[0]) 173 | # "Exception:", e.split("\n")[0]) 174 | 175 | texts= texts[:data_size] 176 | if __name__ == '__main__': 177 | for backend in backends: 178 | test_backend(texts, backend) 179 | -------------------------------------------------------------------------------- /scripts/wordbag_regressor.py: -------------------------------------------------------------------------------- 1 | from __future__ import with_statement 2 | from __future__ import division 3 | from __future__ import absolute_import 4 | from __future__ import print_function 5 | import os 6 | import re 7 | import json 8 | import gzip 9 | from wordbatch.pipelines import WordBatch 10 | from wordbatch.models import FTRL 11 | from wordbatch.extractors import WordBag 12 | import threading 13 | import sys 14 | if sys.version_info.major == 3: 15 | import pickle as pkl 16 | else: 17 | import cPickle as pkl 18 | 19 | 20 | non_alphanums = re.compile('[\W+]') 21 | nums_re= re.compile("\W*[0-9]+\W*") 22 | triples_re= re.compile(r"(\w)\1{2,}") 23 | trash_re= 
[re.compile("<[^>]*>"), re.compile("[^a-z0-9' -]+"), re.compile(" [.0-9'-]+ "), re.compile("[-']{2,}"), 24 | re.compile(" '"),re.compile(" +")] 25 | from nltk.stem.porter import PorterStemmer 26 | stemmer= PorterStemmer() 27 | def normalize_text(text): 28 | text= text.lower() 29 | text= nums_re.sub(" NUM ", text) 30 | text= " ".join([word for word in non_alphanums.sub(" ",text).strip().split() if len(word)>1]) 31 | return text 32 | 33 | class WordbagRegressor(object): 34 | def __init__(self, pickle_model="", datadir=None, batcher= None): 35 | self.wb = WordBatch(normalize_text=normalize_text, 36 | extractor=WordBag(hash_ngrams=3, hash_ngrams_weights=[-1.0, -1.0, 1.0], 37 | hash_size=2**23, norm='l2', tf='binary', idf=50.0), 38 | batcher= batcher) 39 | 40 | self.clf= FTRL(alpha=1.0, beta=1.0, L1=0.00001, L2=1.0, D=2 ** 23, iters=1, inv_link="identity") 41 | if datadir==None: (self.wb, self.clf)= pkl.load(gzip.open(pickle_model, 'rb')) 42 | else: self.train(datadir, pickle_model) 43 | 44 | def fit_batch(self, texts, labels, rcount): 45 | texts, labels= self.wb.batcher.shuffle_batch(texts, labels, rcount) 46 | print("Transforming", rcount) 47 | texts= self.wb.fit_transform(texts, reset= False) 48 | print("Training", rcount) 49 | self.clf.fit(texts, labels, reset= False) 50 | 51 | def train(self, datadir, pickle_model=""): 52 | texts= [] 53 | labels= [] 54 | training_data= os.listdir(datadir) 55 | rcount= 0 56 | batchsize= 100000 57 | 58 | p = None 59 | for jsonfile in training_data: 60 | with open(datadir + "/" + jsonfile, 'r') as inputfile: 61 | for line in inputfile: 62 | #if rcount > 1000000: break 63 | try: line = json.loads(line.strip()) 64 | except: continue 65 | for review in line["Reviews"]: 66 | rcount+= 1 67 | if rcount % 100000 == 0: print(rcount) 68 | if rcount % 7 != 0: continue 69 | if "Overall" not in review["Ratings"]: continue 70 | texts.append(review["Content"]) 71 | labels.append((float(review["Ratings"]["Overall"]) - 3) * 0.5) 72 | if len(texts) % batchsize == 0: 73 | if p != None: p.join() 74 | p= threading.Thread(target=self.fit_batch, args=(texts, labels, rcount)) 75 | p.start() 76 | texts= [] 77 | labels= [] 78 | if p != None: p.join() 79 | self.fit_batch(texts, labels, rcount) 80 | 81 | self.wb.dictionary_freeze= True 82 | 83 | if pickle_model!="": 84 | with gzip.open(pickle_model, 'wb') as model_file: 85 | backend= self.wb.batcher.backend 86 | backend_handle= self.wb.batcher.backend_handle 87 | self.wb.batcher.backend= "serial" 88 | self.wb.batcher.backend_handle = None 89 | pkl.dump((self.wb, self.clf), model_file, protocol=2) 90 | self.wb.batcher.backend = backend 91 | self.wb.batcher.backend_handle = backend_handle 92 | 93 | def predict(self, texts): 94 | counts= self.wb.transform(texts) 95 | return self.clf.predict(counts) -------------------------------------------------------------------------------- /scripts/wordbag_regressor_spark.py: -------------------------------------------------------------------------------- 1 | from __future__ import with_statement 2 | from __future__ import division 3 | from __future__ import absolute_import 4 | from __future__ import print_function 5 | import os 6 | import re 7 | import json 8 | import gzip 9 | import wordbatch 10 | from wordbatch.models import FTRL 11 | from wordbatch.extractors import WordBag 12 | import threading 13 | import pandas as pd 14 | import sys 15 | if sys.version_info.major == 3: 16 | import pickle as pkl 17 | else: 18 | import cPickle as pkl 19 | 20 | non_alphanums = re.compile('[\W]') 21 | nums_re= 
re.compile("\W*[0-9]+\W*") 22 | triples_re= re.compile(r"(\w)\1{2,}") 23 | trash_re= [re.compile("<[^>]*>"), re.compile("[^a-z0-9' -]+"), re.compile(" [.0-9'-]+ "), re.compile("[-']{2,}"), 24 | re.compile(" '"),re.compile(" +")] 25 | from nltk.stem.porter import PorterStemmer 26 | stemmer= PorterStemmer() 27 | def normalize_text(text): 28 | text= text.lower() 29 | text= nums_re.sub(" NUM ", text) 30 | text= " ".join([word for word in non_alphanums.sub(" ",text).split() if len(word)>1]) 31 | return text 32 | 33 | class WordbagRegressor(object): 34 | def __init__(self, pickle_model="", datadir=None): 35 | from pyspark import SparkContext 36 | self.sc= SparkContext() 37 | self.wordbatch = wordbatch.WordBatch(normalize_text, backend="spark", backend_handle=self.sc, 38 | extractor=(WordBag, {"hash_ngrams":3, 39 | "hash_ngrams_weights":[-1.0, -1.0, 1.0], 40 | "hash_size":2**23, "norm":'l2', 41 | "tf":'binary', "idf":50.0})) 42 | self.clf= FTRL(alpha=1.0, beta=1.0, L1=0.00001, L2=1.0, D=2 ** 23, iters=1, inv_link="identity") 43 | if datadir==None: (self.wordbatch, self.clf)= pkl.load(gzip.open(pickle_model, 'rb')) 44 | else: self.train(datadir, pickle_model) 45 | 46 | def fit_batch(self, texts, labels, rcount): 47 | print("Transforming", rcount) 48 | # if self.sc != None: 49 | # data_rdd= self.wordbatch.lists2rddbatches([texts, labels], self.sc) 50 | # data_rdd= self.wordbatch.transform(data_rdd) 51 | # [texts, labels]= self.wordbatch.rddbatches2lists(data_rdd) 52 | # else: 53 | # print(texts[:2]) 54 | # print(pd.Series(labels).value_counts()) 55 | texts= self.wordbatch.partial_fit_transform(texts) 56 | print("Training", rcount) 57 | self.clf.partial_fit(texts, labels) 58 | 59 | def train(self, datadir, pickle_model=""): 60 | texts= [] 61 | labels= [] 62 | training_data= os.listdir(datadir) 63 | rcount= 0 64 | batchsize= 20000 65 | 66 | p = None 67 | for jsonfile in training_data: 68 | with open(datadir + "/" + jsonfile, 'r') as inputfile: 69 | for line in inputfile: 70 | #if rcount > 1000000: break 71 | try: line = json.loads(line.strip()) 72 | except: continue 73 | for review in line["Reviews"]: 74 | rcount+= 1 75 | if rcount % 100000 == 0: print(rcount) 76 | if rcount % 7 != 0: continue 77 | if "Overall" not in review["Ratings"]: continue 78 | texts.append(review["Content"]) 79 | labels.append((float(review["Ratings"]["Overall"]) - 3) *0.5) 80 | if len(texts) % batchsize == 0: 81 | if p != None: p.join() 82 | p= threading.Thread(target=self.fit_batch, args=(texts, labels, rcount)) 83 | p.start() 84 | texts= [] 85 | labels= [] 86 | if p != None: p.join() 87 | self.fit_batch(texts, labels, rcount) 88 | 89 | self.wordbatch.dictionary_freeze= True 90 | 91 | if pickle_model!="": 92 | with gzip.open(pickle_model, 'wb') as model_file: 93 | pkl.dump((self.wordbatch, self.clf), model_file, protocol=2) 94 | 95 | def predict(self, texts): 96 | # if self.sc != None: 97 | # data_rdd= self.wordbatch.lists2rddbatches([texts, []], self.sc) 98 | # data_rdd= self.wordbatch.transform(data_rdd) 99 | # [counts, labels]= self.wordbatch.rddbatches2lists(data_rdd) 100 | # else: 101 | counts= self.wordbatch.transform(texts) 102 | return self.clf.predict(counts) 103 | 104 | def predict_parallel(self, texts): 105 | # if self.sc != None: 106 | # data_rdd= self.wordbatch.lists2rddbatches([texts, []] , self.sc) 107 | # counts_rdd= self.wordbatch.transform(data_rdd) 108 | # return self.wordbatch.rddbatches2lists(self.wordbatch.predict_parallel(counts_rdd, self.clf))[0] 109 | counts= self.wordbatch.transform(texts) 110 | 
return self.wordbatch.predict_parallel(counts, self.clf) 111 | 112 | if __name__ == "__main__": 113 | df= pd.DataFrame.from_csv("../data/Tweets.csv", encoding="utf8") 114 | def sentiment_to_label(sentiment): 115 | if sentiment=="neutral": return 0 116 | if sentiment=="negative": return -1 117 | return 1 118 | 119 | df['airline_sentiment_confidence']= df['airline_sentiment_confidence'].astype('str') 120 | df['sentiment']= (df['airline_sentiment']).apply(lambda x: sentiment_to_label(x)) 121 | df= df[['text','sentiment']] 122 | 123 | re_attags= re.compile(" @[^ ]* ") 124 | re_spaces= re.compile("\w+]") 125 | df['text']= df['text'].apply(lambda x: re_spaces.sub(" ",re_attags.sub(" ", " "+x+" "))[1:-1]) 126 | df= df.drop_duplicates(subset=['text']) 127 | df.index= df['id']= range(df.shape[0]) 128 | 129 | non_alphanums=re.compile('[^A-Za-z]+') 130 | def normalize_text(text): return non_alphanums.sub(' ', text).lower().strip() 131 | df['text_normalized']= df['text'].map(lambda x: normalize_text(x)) 132 | 133 | import wordbag_regressor 134 | print("Train wordbag regressor") 135 | wb_regressor= WordbagRegressor("", "../../../data/tripadvisor/json") 136 | df['wordbag_score']= wb_regressor.predict(df['text'].values) 137 | print(df['wordbag_score'].value_counts()) -------------------------------------------------------------------------------- /scripts/wordhash_regressor.py: -------------------------------------------------------------------------------- 1 | from __future__ import with_statement 2 | from __future__ import division 3 | from __future__ import absolute_import 4 | from __future__ import print_function 5 | import os 6 | import re 7 | import json 8 | import gzip 9 | import scipy.sparse as ssp 10 | from wordbatch.pipelines import WordBatch 11 | from wordbatch.extractors import WordHash 12 | from wordbatch.models import FM_FTRL 13 | from wordbatch.transformers import Tokenizer 14 | import threading 15 | import multiprocessing 16 | import sys 17 | if sys.version_info.major == 3: 18 | import pickle as pkl 19 | else: 20 | import cPickle as pkl 21 | 22 | non_alphanums = re.compile('[\W+]') 23 | nums_re= re.compile("\W*[0-9]+\W*") 24 | triples_re= re.compile(r"(\w)\1{2,}") 25 | trash_re= [re.compile("<[^>]*>"), re.compile("[^a-z0-9' -]+"), re.compile(" [.0-9'-]+ "), re.compile("[-']{2,}"), 26 | re.compile(" '"),re.compile(" +")] 27 | from nltk.stem.porter import PorterStemmer 28 | stemmer= PorterStemmer() 29 | def normalize_text(text): 30 | text= text.lower() 31 | text= nums_re.sub(" NUM ", text) 32 | text= " ".join([word for word in non_alphanums.sub(" ",text).strip().split() if len(word)>1]) 33 | return text 34 | 35 | class BatchData(object): 36 | def __init__(self): 37 | self.texts= None 38 | 39 | class WordhashRegressor(object): 40 | def __init__(self, pickle_model="", datadir=None, batcher= None): 41 | self.wb= WordBatch(normalize_text, tokenizer= Tokenizer(stemmer=stemmer), extractor=WordHash( 42 | decode_error='ignore', n_features=2 ** 25, ngram_range=(1,2), norm='l2'), batcher= batcher) 43 | self.clf = FM_FTRL(D=2 ** 25, D_fm= 4, iters=1, inv_link="identity", threads= multiprocessing.cpu_count()//2) 44 | if datadir==None: (self.wb, self.clf)= pkl.load(gzip.open(pickle_model, 'rb')) 45 | else: self.train(datadir, pickle_model) 46 | 47 | def transform_batch(self, texts, batch_data): 48 | batch_data.texts= self.wb.fit_transform(texts, reset= False) 49 | 50 | def train(self, datadir, pickle_model=""): 51 | texts= [] 52 | labels= [] 53 | training_data= os.listdir(datadir) 54 | rcount= 0 55 | 
texts2= [] 56 | batchsize= 100000 57 | 58 | batch_data = BatchData() 59 | p_input= None 60 | for jsonfile in training_data: 61 | with open(datadir + "/" + jsonfile, 'r') as inputfile: 62 | for line in inputfile: 63 | # if rcount > 1000000: break 64 | try: line = json.loads(line.strip()) 65 | except: continue 66 | for review in line["Reviews"]: 67 | rcount+= 1 68 | if rcount % 100000 == 0: print(rcount) 69 | if rcount % 9 != 0: continue 70 | if "Overall" not in review["Ratings"]: continue 71 | texts.append(review["Content"]) 72 | labels.append((float(review["Ratings"]["Overall"]) - 3) * 0.5) 73 | if len(texts) % batchsize == 0: 74 | if p_input != None: 75 | p_input.join() 76 | texts2.append(batch_data.texts) 77 | p_input = threading.Thread(target=self.transform_batch, args=(texts, batch_data)) 78 | p_input.start() 79 | texts= [] 80 | if p_input != None: 81 | p_input.join() 82 | texts2.append(batch_data.texts) 83 | texts2.append(self.wb.fit_transform(texts, reset= False)) 84 | del (texts) 85 | if len(texts2) == 1: texts= texts2[0] 86 | else: texts= ssp.vstack(texts2) 87 | 88 | self.wb.dictionary_freeze = True 89 | 90 | self.clf.fit(texts, labels) 91 | if pickle_model != "": 92 | with gzip.open(pickle_model, 'wb') as model_file: 93 | backend= self.wb.batcher.backend 94 | backend_handle= self.wb.batcher.backend_handle 95 | self.wb.batcher.backend= "serial" 96 | self.wb.batcher.backend_handle = None 97 | pkl.dump((self.wb, self.clf), model_file, protocol=2) 98 | self.wb.batcher.backend = backend 99 | self.wb.batcher.backend_handle = backend_handle 100 | 101 | def predict(self, texts): 102 | counts= self.wb.transform(texts) 103 | return self.clf.predict(counts) -------------------------------------------------------------------------------- /scripts/wordseq_regressor.py: -------------------------------------------------------------------------------- 1 | from __future__ import with_statement 2 | from __future__ import division 3 | from __future__ import absolute_import 4 | from __future__ import print_function 5 | import pickle as pkl 6 | import gzip 7 | import re 8 | import os 9 | import json 10 | import scipy as sp 11 | import numpy as np 12 | from tensorflow.keras.layers import * 13 | from tensorflow.keras.models import Sequential 14 | from wordbatch.pipelines import WordBatch 15 | from wordbatch.extractors import WordSeq 16 | from wordbatch.transformers import Dictionary 17 | import random 18 | import threading 19 | from tensorflow.keras.models import load_model 20 | import tensorflow as tf 21 | import sys 22 | 23 | non_alphas = re.compile('[^A-Za-z\'-]+') 24 | trash_re= [re.compile("<[^>]*>"), re.compile("[^a-z0-9' -]+"), re.compile(" [.0-9'-]+ "), 25 | re.compile("[-']{2,}"),re.compile(" '"),re.compile(" +")] 26 | 27 | def normalize_text(text): 28 | text= text.lower() 29 | for x in trash_re: 30 | while x.search(text) != None: text = x.sub(" ", text) 31 | return non_alphas.sub(' ', text).strip() 32 | 33 | class BatchData(object): 34 | def __init__(self): 35 | self.texts= None 36 | self.labels= None 37 | 38 | class WordseqRegressor(): 39 | def __init__(self, pickle_model="", datadir=None, batcher= None): 40 | seed = 10002 41 | os.environ['PYTHONHASHSEED'] = str(seed) 42 | np.random.seed(seed + 1) 43 | random.seed(seed + 2) 44 | tf.random.set_seed(seed+3) 45 | 46 | self.maxlen = 200 47 | self.max_words = 20000 48 | self.wb= WordBatch(normalize_text, dictionary=Dictionary(max_words=self.max_words), 49 | extractor=WordSeq(seq_maxlen=self.maxlen), batcher=batcher) 50 | self.model = Sequential() 
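# Model architecture built below: token-id sequences of length maxlen are embedded
# into 20-dimensional vectors, passed through a ReLU 1D convolution, regularized with
# dropout and batch normalization, max-pooled over the sequence dimension, and
# projected to a single regression output trained with mean squared error.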
51 | self.model.add(Embedding(self.max_words+2, 20, input_length=self.maxlen)) 52 | self.model.add(Conv1D(activation="relu", padding="same", strides=1, filters=10, kernel_size=3)) 53 | self.model.add(Dropout(0.5)) 54 | self.model.add(BatchNormalization()) 55 | self.model.add(GlobalMaxPooling1D()) 56 | self.model.add(Dense(1)) 57 | self.model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mean_squared_error']) 58 | if datadir == None: 59 | self.model= load_model(pickle_model) 60 | self.wb= pkl.load(gzip.open(pickle_model + ".wb", 'rb')) 61 | else: self.train(datadir, pickle_model) 62 | 63 | def transform_batch(self, texts, batch_data): 64 | batch_data.texts= self.wb.fit_transform(texts, reset= False) 65 | 66 | def train(self, datadir, pickle_model=""): 67 | texts= [] 68 | labels= [] 69 | training_data = os.listdir(datadir) 70 | rcount= 0 71 | texts2= [] 72 | batchsize= 100000 73 | 74 | batch_data = BatchData() 75 | p_input= None 76 | for jsonfile in training_data: 77 | with open(datadir + "/" + jsonfile, 'r') as inputfile: 78 | for line in inputfile: 79 | #if rcount > 1000000: break 80 | try: line= json.loads(line.strip()) 81 | except: continue 82 | for review in line["Reviews"]: 83 | rcount+= 1 84 | if rcount % 100000 == 0: print(rcount) 85 | if rcount % 8 != 0: continue 86 | if "Overall" not in review["Ratings"]: continue 87 | texts.append(review["Content"]) 88 | labels.append((float(review["Ratings"]["Overall"]) - 3) *0.5) 89 | if len(texts) % batchsize == 0: 90 | if p_input != None: 91 | p_input.join() 92 | texts2.append(batch_data.texts) 93 | p_input = threading.Thread(target=self.transform_batch, args=(texts, batch_data)) 94 | p_input.start() 95 | texts= [] 96 | if p_input != None: 97 | p_input.join() 98 | texts2.append(batch_data.texts) 99 | texts2.append(self.wb.partial_fit_transform(texts)) 100 | del(texts) 101 | texts= sp.vstack(texts2) 102 | self.wb.dictionary_freeze = True 103 | test= (np.array(texts[-1000:]), np.array(labels[-1000:])) 104 | train = (np.array(texts[:-1000]), np.array(labels[:-1000])) 105 | 106 | self.model.fit(train[0], train[1], batch_size=2048, epochs=2, validation_data=(test[0], test[1])) 107 | if pickle_model != "": 108 | self.model.save_weights(pickle_model) 109 | backend = self.wb.batcher.backend 110 | backend_handle = self.wb.batcher.backend_handle 111 | self.wb.batcher.backend = "serial" 112 | self.wb.batcher.backend_handle = None 113 | with gzip.open(pickle_model + ".wb", 'wb') as model_file: pkl.dump(self.wb, model_file, protocol=2) 114 | self.wb.batcher.backend = backend 115 | self.wb.batcher.backend_handle = backend_handle 116 | 117 | def predict_batch(self, texts): 118 | results= [x[0] for x in self.model.predict(np.array(self.wb.transform(texts)))] 119 | return results 120 | -------------------------------------------------------------------------------- /scripts/wordvec_regressor.py: -------------------------------------------------------------------------------- 1 | from __future__ import with_statement 2 | from __future__ import division 3 | from __future__ import absolute_import 4 | from __future__ import print_function 5 | import os 6 | import re 7 | import json 8 | import gzip 9 | from wordbatch.pipelines import WordBatch 10 | from wordbatch.models import FTRL 11 | from wordbatch.extractors import WordVec, Hstack 12 | from sklearn.utils import shuffle 13 | import threading 14 | import sys 15 | if sys.version_info.major == 3: 16 | import pickle as pkl 17 | else: 18 | import cPickle as pkl 19 | 20 | non_alphanums = 
re.compile('[\W+]') 21 | nums_re= re.compile("\W*[0-9]+\W*") 22 | trash_re= [re.compile("<[^>]*>"), re.compile("[^a-z0-9' -]+"), re.compile(" [.0-9'-]+ "), re.compile("[-']{2,}"), 23 | re.compile(" '"),re.compile(" +")] 24 | 25 | def normalize_text(text): 26 | text= text.lower() 27 | text= nums_re.sub(" NUM ", text) 28 | text= " ".join([word for word in non_alphanums.sub(" ",text).strip().split() if len(word)>1]) 29 | return text 30 | 31 | class WordvecRegressor(object): 32 | def __init__(self, pickle_model="", datadir=None, batcher=None): 33 | self.wb= WordBatch(normalize_text, extractor=Hstack([ 34 | WordVec(wordvec_file="../../../data/word2vec/glove.twitter.27B.100d.txt.gz", normalize_text=normalize_text, 35 | encoding="utf8"), 36 | WordVec(wordvec_file="../../../data/word2vec/glove.6B.50d.txt.gz", normalize_text=normalize_text, 37 | encoding="utf8")])) 38 | # from wordbatch.pipelines import FeatureUnion 39 | # from wordbatch.transformers import Dictionary, TextNormalizer 40 | # from sklearn.pipeline import Pipeline 41 | # tn= TextNormalizer(normalize_text=normalize_text) 42 | # dct= Dictionary() 43 | # vec1= WordVec(wordvec_file="../../../data/word2vec/glove.twitter.27B.100d.txt.gz", 44 | # normalize_text=normalize_text, encoding="utf8", dictionary= dct) 45 | # vec2= WordVec(wordvec_file="../../../data/word2vec/glove.6B.50d.txt.gz", 46 | # normalize_text=normalize_text, encoding="utf8", dictionary= dct) 47 | # self.wb = Pipeline(steps= [("tn", tn), ("dct", dct), ("vecs", FeatureUnion([("vec1", vec1), ("vec2", vec2)]))]) 48 | self.batcher= batcher 49 | 50 | self.clf= FTRL(alpha=1.0, beta=1.0, L1=0.00001, L2=1.0, D=100+50, iters=1, inv_link= "identity") 51 | 52 | if datadir==None: (self.wb, self.clf)= pkl.load(gzip.open(pickle_model, 'rb')) 53 | else: self.train(datadir, pickle_model) 54 | 55 | def fit_batch(self, texts, labels, rcount): 56 | texts, labels = shuffle(texts, labels) 57 | print("Transforming", rcount) 58 | #texts= self.wb.fit_transform(texts, tn__batcher=self.batcher, dct__reset= False, dct__batcher= self.batcher) 59 | texts = self.wb.fit_transform(texts) 60 | print("Training", rcount) 61 | self.clf.fit(texts, labels, reset= False) 62 | 63 | def train(self, datadir, pickle_model=""): 64 | texts= [] 65 | labels= [] 66 | training_data= os.listdir(datadir) 67 | rcount= 0 68 | batchsize= 80000 69 | 70 | p= None 71 | for jsonfile in training_data: 72 | with open(datadir + "/" + jsonfile, 'r') as inputfile: 73 | for line in inputfile: 74 | #if rcount > 1000000: break 75 | try: line= json.loads(line.strip()) 76 | except: continue 77 | for review in line["Reviews"]: 78 | rcount+= 1 79 | if rcount % 100000 == 0: print(rcount) 80 | if rcount % 6 != 0: continue 81 | if "Overall" not in review["Ratings"]: continue 82 | texts.append(review["Content"]) 83 | labels.append((float(review["Ratings"]["Overall"]) - 3) *0.5) 84 | if len(texts) % batchsize == 0: 85 | if p != None: p.join() 86 | p= threading.Thread(target=self.fit_batch, args=(texts, labels, rcount)) 87 | p.start() 88 | texts= [] 89 | labels= [] 90 | if p != None: p.join() 91 | self.fit_batch(texts, labels, rcount) 92 | 93 | # if pickle_model!="": 94 | # with gzip.open(pickle_model, 'wb') as model_file: 95 | # backend = self.wb.batcher.backend 96 | # backend_handle = self.wb.batcher.backend_handle 97 | # self.wb.batcher.backend = "serial" 98 | # self.wb.batcher.backend_handle = None 99 | # pkl.dump((self.wb, self.clf), model_file, protocol=2) 100 | # self.wb.batcher.backend = backend 101 | # self.wb.batcher.backend_handle = 
backend_handle 102 | 103 | def predict(self, texts): 104 | vecs= self.wb.transform(texts) 105 | return self.clf.predict(vecs) -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [egg_info] 2 | tag_build = 3 | tag_date = 0 4 | tag_svn_revision = 0 5 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from setuptools import setup, Extension 3 | from Cython.Distutils import build_ext 4 | import numpy 5 | import os 6 | 7 | if os.name == 'nt': 8 | extra_compile_args = ["/openmp", "/Ox", "/arch:AVX2", "/fp:fast"] 9 | extra_link_args = [] 10 | else: 11 | extra_compile_args = ["-O3", "-fopenmp", "-ffast-math", "-mavx2", "-ftree-vectorize", "-std=gnu11"] 12 | extra_link_args = ["-fopenmp"] 13 | 14 | setup( 15 | name='Wordbatch', 16 | version='1.4.9', 17 | description='Python library for distributed AI processing pipelines, using swappable scheduler backends', 18 | url='https://github.com/anttttti/Wordbatch', 19 | author='Antti Puurula', 20 | author_email='antti.puurula@yahoo.com', 21 | packages=['wordbatch', 22 | 'wordbatch.pipelines', 23 | 'wordbatch.extractors', 24 | 'wordbatch.models', 25 | 'wordbatch.transformers' 26 | ], 27 | license='GNU GPL 2.0', 28 | classifiers=[ 29 | "Development Status :: 4 - Beta", 30 | "Intended Audience :: Developers", 31 | "License :: OSI Approved :: GNU General Public License v2 (GPLv2)", 32 | "Programming Language :: Python :: 3.6", 33 | "Programming Language :: Python :: 3.7", 34 | "Programming Language :: Python :: 3.8", 35 | "Programming Language :: Python :: 3.9", 36 | "Programming Language :: Python :: 3.10", 37 | "Programming Language :: Cython", 38 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 39 | "Topic :: Software Development :: Libraries :: Python Modules", 40 | ], 41 | install_requires=['Cython', 'scikit-learn', 'python-Levenshtein', 'py-lz4framed', 'randomgen==1.21.2', 'numpy', 42 | 'scipy', 'pandas', 'wheel>=0.33.4'], 43 | extras_require={'dev': ['nltk', 'textblob', 'keras', 'pyspark', 'dask', 'distributed', 'ray']}, 44 | 45 | cmdclass= {'build_ext': build_ext}, 46 | ext_modules= [ 47 | Extension("wordbatch.extractors.extractors", 48 | ["wordbatch/extractors/extractors.pyx", "wordbatch/extractors/MurmurHash3.cpp"], 49 | libraries= [], 50 | include_dirs=[numpy.get_include(), '.'], 51 | extra_compile_args = extra_compile_args, 52 | extra_link_args=extra_link_args), 53 | Extension("wordbatch.models.ftrl", 54 | ["wordbatch/models/ftrl.pyx"], 55 | libraries=[], 56 | include_dirs=[numpy.get_include(), '.'], 57 | extra_compile_args=extra_compile_args, 58 | extra_link_args=extra_link_args), 59 | Extension("wordbatch.models.ftrl32", 60 | ["wordbatch/models/ftrl32.pyx"], 61 | libraries=[], 62 | include_dirs=[numpy.get_include(), '.'], 63 | extra_compile_args=extra_compile_args, 64 | extra_link_args=extra_link_args), 65 | Extension("wordbatch.models.fm_ftrl", 66 | ["wordbatch/models/fm_ftrl.pyx", "wordbatch/models/avx_ext.c"], 67 | libraries= [], 68 | include_dirs=[numpy.get_include(), '.'], 69 | extra_compile_args = extra_compile_args, 70 | extra_link_args=extra_link_args), 71 | Extension("wordbatch.models.nn_relu_h1", 72 | ["wordbatch/models/nn_relu_h1.pyx"], 73 | libraries= [], 74 | include_dirs=[numpy.get_include(), '.'], 75 | extra_compile_args = extra_compile_args, 
76 | extra_link_args=extra_link_args), 77 | Extension("wordbatch.models.nn_relu_h2", 78 | ["wordbatch/models/nn_relu_h2.pyx"], 79 | libraries= [], 80 | include_dirs=[numpy.get_include(), '.'], 81 | extra_compile_args = extra_compile_args, 82 | extra_link_args=extra_link_args), 83 | ] 84 | ) 85 | -------------------------------------------------------------------------------- /wordbatch/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | PACKAGE_DIR = os.path.dirname(os.path.abspath(__file__)) 3 | __version__ = '1.4.9' 4 | 5 | -------------------------------------------------------------------------------- /wordbatch/batcher.py: -------------------------------------------------------------------------------- 1 | #!python 2 | from __future__ import with_statement 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from __future__ import print_function 6 | import multiprocessing 7 | from contextlib import closing 8 | import scipy.sparse as ssp 9 | import random 10 | import pandas as pd 11 | from math import ceil 12 | 13 | 14 | class Batcher(object): 15 | """Scheduler to handle parallel jobs on minibatches 16 | 17 | Parameters 18 | ---------- 19 | procs: int 20 | Number of process(es)/thread(s) for executing task in parallel. Used for multiprocessing, threading and Loky 21 | 22 | minibatch_size: int 23 | Expected size of each minibatch 24 | 25 | backend: {'serial', 'multiprocessing', 'threading', 'loky', 'spark', 'dask', 'ray'} 26 | Backend for computing the tasks 27 | 28 | - 'serial' sequential execution without a backend scheduler 29 | 30 | - 'multiprocessing' Python standard multiprocessing library 31 | 32 | - 'threading' Python standard threading library 33 | 34 | - 'loky' Loky fork of multiprocessing library 35 | 36 | - 'spark' PySpark local or distributed execution 37 | 38 | - 'dask' Dask Distributed local or distributed execution 39 | 40 | - 'ray' Ray local or distributed execution 41 | 42 | task_num_cpus: int 43 | Number of CPUs to reserve per minibatch task for Ray 44 | 45 | task_num_gpus: int 46 | Number of GPUs to reserve per minibatch task for Ray 47 | 48 | backend_handle: object 49 | Backend handle for sending tasks 50 | 51 | verbose: int 52 | Verbosity level. Setting verbose > 0 will display additional information depending on the specific level set. 
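	Example
	-------
	A minimal usage sketch (parameter values here are illustrative, not defaults)::

		from wordbatch.batcher import Batcher

		batcher = Batcher(procs=4, minibatch_size=5000, backend="multiprocessing")
		batches = batcher.split_batches(list(range(100000)), minibatch_size=5000)
		merged = batcher.merge_batches(batches)  # flatten the minibatches back into one list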
53 | """ 54 | def __init__(self, procs= 0, minibatch_size= 20000, backend_handle= None, backend= "multiprocessing", 55 | task_num_cpus= 1, task_num_gpus= 0, verbose= 0): 56 | if procs == 0 or procs is None: procs= multiprocessing.cpu_count() 57 | self.procs= procs 58 | self.verbose= verbose 59 | self.minibatch_size= minibatch_size 60 | self.backend_handle= backend_handle 61 | self.backend= backend 62 | self.task_num_cpus = task_num_cpus 63 | self.task_num_gpus = task_num_gpus 64 | 65 | def list2indexedrdd(self, lst, minibatch_size=0): 66 | if minibatch_size==0: minibatch_size= self.minibatch_size 67 | start= 0; len_data= len(lst); batch_count= 0 68 | batches= [] 69 | while start < len_data: 70 | batches.append([batch_count]+[lst[start:start + minibatch_size]]) 71 | start+= minibatch_size 72 | batch_count+= 1 73 | return self.backend_handle.parallelize(batches) 74 | 75 | def indexedrdd2list(self, indexedrdd, sort= True): 76 | batches= indexedrdd.collect() 77 | if sort: batches= sorted(batches) 78 | return [batch[1] for batch in batches] 79 | 80 | def split_batches(self, data, minibatch_size= None, backend= None): 81 | """Split data into minibatches with a specified size 82 | 83 | Parameters 84 | ---------- 85 | data: iterable and indexable 86 | List-like data to be split into batches. Includes backend_handleipy matrices and Pandas DataFrames. 87 | 88 | minibatch_size: int 89 | Expected sizes of minibatches split from the data. 90 | 91 | backend: object 92 | Backend to use, instead of the Batcher backend attribute 93 | 94 | Returns 95 | ------- 96 | data_split: list 97 | List of minibatches, each entry is a list-like object representing the data subset in a batch. 98 | """ 99 | if minibatch_size is None: minibatch_size= self.minibatch_size 100 | if backend is None: backend= self.backend 101 | if isinstance(data, list) or isinstance(data, tuple) or isinstance(data, dict): len_data= len(data) 102 | else: len_data= data.shape[0] 103 | if backend=="spark": return self.list2indexedrdd(data, minibatch_size) 104 | if isinstance(data,pd.DataFrame): 105 | data= [data.iloc[x * minibatch_size:(x + 1) * minibatch_size] for x in 106 | range(int(ceil(len_data / minibatch_size)))] 107 | elif isinstance(data, dict): 108 | data = [dict(list(data.items())[x * minibatch_size:min(len_data, (x + 1) * minibatch_size)]) 109 | for x in range(int(ceil(len_data / minibatch_size)))] 110 | else: 111 | data= [data[x* minibatch_size:min(len_data, (x+1)*minibatch_size)] 112 | for x in range(int(ceil(len_data/minibatch_size)))] 113 | ###if backend=="dask": return self.backend_handle.scatter(data) 114 | return data 115 | 116 | def collect_batches(self, data, backend= None, sort= True): 117 | if backend is None: backend= self.backend 118 | if backend == "spark": data= self.indexedrdd2list(data, sort) 119 | if backend == "dask": data = self.backend_handle.gather(data) 120 | return data 121 | 122 | def merge_batches(self, data): 123 | """Merge a list of data minibatches into one single instance representing the data 124 | 125 | Parameters 126 | ---------- 127 | data: list 128 | List of minibatches to merge 129 | 130 | Returns 131 | ------- 132 | (anonymous): sparse matrix | pd.DataFrame | list 133 | Single complete list-like data merged from given batches 134 | """ 135 | if isinstance(data[0], ssp.csr_matrix): return ssp.vstack(data) 136 | if isinstance(data[0], pd.DataFrame) or isinstance(data[0], pd.Series): return pd.concat(data) 137 | return [item for sublist in data for item in sublist] 138 | 139 | def 
process_batches(self, task, data, args, backend=None, backend_handle=None, input_split=False, 140 | merge_output= True, minibatch_size= None, procs=None, task_num_cpus= None, 141 | task_num_gpus= None, verbose= None): 142 | """ 143 | 144 | Parameters 145 | ---------- 146 | task: function 147 | Function to apply on each minibatch with other specified arguments 148 | 149 | data: list-like 150 | Samples to split into minibatches and apply the specified function on 151 | 152 | args: list 153 | Arguments to pass to the specified function following the mini-batch 154 | 155 | input_split: boolean, default False 156 | If True, input data is already mapped into minibatches, otherwise data will be split on call. 157 | 158 | merge_output: boolean, default True 159 | If True, results from minibatches will be reduced into one single instance before return. 160 | 161 | procs: int 162 | Number of process(es)/thread(s) for executing task in parallel. Used for multiprocessing, threading, 163 | Loky and Ray 164 | 165 | minibatch_size: int 166 | Expected size of each minibatch 167 | 168 | backend: {'serial', 'multiprocessing', 'threading', 'loky', 'spark', 'dask', 'ray'} 169 | Backend for computing the tasks 170 | 171 | - 'serial' sequential execution without a backend scheduler 172 | 173 | - 'multiprocessing' Python standard multiprocessing library 174 | 175 | - 'threading' Python standard threading library 176 | 177 | - 'loky' Loky fork of multiprocessing library 178 | 179 | - 'spark' PySpark local or distributed execution 180 | 181 | - 'dask' Dask Distributed local or distributed execution 182 | 183 | - 'ray' Ray local or distributed execution 184 | 185 | backend_handle: object 186 | Backend handle for sending tasks 187 | 188 | task_num_cpus: int 189 | Number of CPUs to reserve per minibatch task for Ray 190 | 191 | task_num_gpus: int 192 | Number of GPUs to reserve per minibatch task for Ray 193 | 194 | verbose: int 195 | Verbosity level. Setting verbose > 0 will display additional information depending on the specific level set. 196 | 197 | Returns 198 | ------- 199 | results: list-like | list of list-like 200 | If merge_output is specified as True, this will be a list-like object representing 201 | the dataset, with each entry as a sample. Otherwise this will be a list of list-like 202 | objects, with each entry representing the results from a minibatch. 
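	Example
	-------
	A minimal sketch of the calling convention: the task receives a single list whose
	first element is the minibatch and whose remaining elements are the extra ``args``
	(``texts`` below is assumed to be a list of strings)::

		def count_long_words(params):
			texts_batch, min_len = params
			return [len([w for w in t.split() if len(w) >= min_len]) for t in texts_batch]

		batcher = Batcher(minibatch_size=1000, backend="serial")
		counts = batcher.process_batches(count_long_words, texts, [4])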
203 | """ 204 | if procs is None: procs= self.procs 205 | if backend is None: backend= self.backend 206 | if backend_handle is None: backend_handle = self.backend_handle 207 | if task_num_cpus is None: task_num_cpus = self.task_num_cpus 208 | if task_num_gpus is None: task_num_gpus = self.task_num_gpus 209 | if verbose is None: verbose= self.verbose 210 | if verbose > 1: 211 | print("Task:", task, " backend:", backend, " backend_handle:", backend_handle, " procs:", 212 | procs, " input_split:", input_split, " merge_output:", merge_output) 213 | 214 | if verbose> 10: 215 | print("len(data):", len(data), "len(args):", len(args), "[type(x) for x in data]:", 216 | [type(x) for x in data], "[type(x) for x in args]:", [type(x) for x in args]) 217 | 218 | if not(input_split): 219 | if backend=="spark": 220 | paral_params= self.split_batches(data, minibatch_size, backend="spark") 221 | else: 222 | paral_params= [[data_batch]+ args for data_batch in self.split_batches(data, minibatch_size)] 223 | else: 224 | if backend!="spark": paral_params= [[data_batch]+ args for data_batch in data] 225 | else: paral_params= data 226 | if verbose > 1: print("Start task, len(paral_params)", len(paral_params)) 227 | if backend == "serial": 228 | results = [task(minibatch) for minibatch in paral_params] 229 | else: 230 | if backend=="multiprocessing": 231 | with closing(multiprocessing.Pool(max(1, procs), maxtasksperchild=2)) as pool: 232 | results = pool.map_async(task, paral_params) 233 | pool.close() 234 | pool.join() 235 | results= results.get() 236 | elif backend=="threading": 237 | with closing(multiprocessing.dummy.Pool(max(1,procs))) as pool: 238 | results= pool.map(task, paral_params) 239 | pool.close() 240 | pool.join() 241 | if backend=="loky": 242 | from loky import get_reusable_executor 243 | pool= get_reusable_executor(max_workers=max(1, procs)) 244 | results= list(pool.map(task, paral_params)) 245 | elif backend == "dask": 246 | ###if not (input_split): data= self.scatter(data) 247 | results = [self.backend_handle.submit(task, params) for params in paral_params] 248 | elif backend == "spark": 249 | def apply_func_to_indexedrdd(batch): 250 | return [batch[0]] + [task([batch[1]] + args)] 251 | results = paral_params.map(apply_func_to_indexedrdd) 252 | elif backend == "ray": 253 | @self.backend_handle.remote(num_cpus=task_num_cpus, num_gpus=task_num_gpus) 254 | def f_ray(f, data): 255 | return f(data) 256 | results = [f_ray.remote(task, paral_params.pop(0)) for _ in range(min(len(paral_params), self.procs))] 257 | uncompleted = results 258 | while (len(paral_params) > 0): 259 | # More tasks than available processors. Queue the task calls 260 | done, remaining = self.backend_handle.wait(uncompleted, timeout=60, fetch_local=False) 261 | if len(done) == 0: continue 262 | done= done[0] 263 | uncompleted = [x for x in uncompleted if x != done] 264 | if len(remaining) > 0: 265 | new = f_ray.remote(task, paral_params.pop(0)) 266 | uncompleted.append(new) 267 | results.append(new) 268 | results = [self.backend_handle.get(x) for x in results] 269 | #ppft currently not supported. 
Supporting arbitrary tasks requires modifications to passed arguments 270 | #elif backend == "ppft": 271 | # jobs = [self.backend_handle.submit(task, (x,), (), ()) for x in paral_params] 272 | # results = [x() for x in jobs] 273 | 274 | if merge_output: return self.merge_batches(self.collect_batches(results, backend=backend)) 275 | if verbose > 2: 276 | print("Task:", task, " backend:", backend, " backend_handle:", backend_handle, " completed") 277 | return results 278 | 279 | def shuffle_batch(self, texts, labels= None, seed= None): 280 | """Shuffle a list of samples, as well as the labels if specified 281 | 282 | Parameters 283 | ---------- 284 | texts: list-like 285 | List of samples to shuffle 286 | 287 | labels: list-like (optional) 288 | List of labels to shuffle, should be correspondent to the samples given 289 | 290 | seed: int 291 | The seed of the pseudo random number generator to use for shuffling 292 | 293 | Returns 294 | ------- 295 | texts: list 296 | List of shuffled samples (texts parameters) 297 | 298 | labels: list (optional) 299 | List of shuffled labels. This will only be returned when non-None 300 | labels is passed 301 | """ 302 | if seed != None: random.seed(seed) 303 | index_shuf= list(range(len(texts))) 304 | random.shuffle(index_shuf) 305 | texts= [texts[x] for x in index_shuf] 306 | if labels == None: return texts 307 | labels= [labels[x] for x in index_shuf] 308 | return texts, labels 309 | 310 | def __getstate__(self): 311 | return dict((k, v) for (k, v) in self.__dict__.items()) 312 | 313 | def __setstate__(self, params): 314 | for key in params: setattr(self, key, params[key]) 315 | -------------------------------------------------------------------------------- /wordbatch/data_utils.py: -------------------------------------------------------------------------------- 1 | import randomgen 2 | import numpy as np 3 | import time 4 | import multiprocessing 5 | from contextlib import contextmanager 6 | from functools import partial 7 | from multiprocessing.pool import ThreadPool 8 | import itertools 9 | import scipy.sparse as ssp 10 | 11 | # @contextmanager 12 | # def timer(name): 13 | # t0 = time.time() 14 | # yield 15 | # print(name + " done in " + str(time.time() - t0) + "s") 16 | # 17 | # def shuffle(*objects, seed=0): 18 | # #Faster than inplace, but uses more memory 19 | # if isinstance(objects[0], ssp.base.spmatrix): lenn= objects[0].shape[0] 20 | # else: lenn= len(objects[0]) 21 | # shuffled= randomgen.xoroshiro128.Xoroshiro128(seed).generator.permutation(lenn) 22 | # return [[x[z] for z in shuffled] if type(x)==list else x[shuffled] for x in objects] 23 | # 24 | # def inplace_shuffle(*objects, seed=0): 25 | # #Slower than shuffle, but uses no extra memory 26 | # rand = randomgen.xoroshiro128.Xoroshiro128(seed).generator 27 | # for x in objects: 28 | # rand.seed(seed) 29 | # rand.shuffle(x) 30 | # 31 | # def inplace_shuffle_threaded(*objects, threads= 0, seed=0): 32 | # #Faster than inplace for very large array sizes, > 10000000 33 | # if threads== 0: threads= min(len(objects), multiprocessing.cpu_count()) 34 | # with ThreadPool(processes=threads) as pool: 35 | # pool.map(partial(inplace_shuffle, seed=seed), objects) 36 | 37 | def indlist2csrmatrix(indlist, datalist= None, shape= None): 38 | #Convert a list of indicator lists to a scipy.sparse.csr_matrix 39 | indptr= [0] 40 | c= 0 41 | for x in indlist: 42 | c+= len(x) 43 | indptr.append(c) 44 | indices = list(itertools.chain.from_iterable(indlist)) 45 | if datalist is not None: 46 | data= 
list(itertools.chain.from_iterable(datalist)) 47 | else: 48 | data= np.ones(len(indices), dtype=np.float64) 49 | if shape==None: shape= (len(indlist), max(indices)+1) 50 | X= ssp.csr_matrix((data, indices, indptr), shape=shape) 51 | return X 52 | 53 | # x= np.array(range(10000000)) 54 | # y= np.array(range(10000000)) 55 | # 56 | # print(x) 57 | # print(y) 58 | # 59 | # with timer('shuffle'): 60 | # for z in range(10): 61 | # x, y= shuffle(x,y) 62 | # print(x) 63 | # print(y) 64 | # 65 | # with timer('inplace_shuffle'): 66 | # for z in range(10): 67 | # inplace_shuffle(x,y) 68 | # print(x) 69 | # print(y) 70 | # 71 | # with timer('inplace_shuffle_threaded'): 72 | # for z in range(10): 73 | # inplace_shuffle_threaded(x,y) 74 | # print(x) 75 | # print(y) 76 | # 77 | # from sklearn.utils import shuffle as shuffle2 78 | # with timer('sklearn_shuffle'): 79 | # for z in range(10): 80 | # x, y= shuffle2(x,y) 81 | # print(x) 82 | # print(y) 83 | -------------------------------------------------------------------------------- /wordbatch/extractors/MurmurHash3.cpp: -------------------------------------------------------------------------------- 1 | //----------------------------------------------------------------------------- 2 | // MurmurHash3 was written by Austin Appleby, and is placed in the public 3 | // domain. The author hereby disclaims copyright to this source code. 4 | 5 | // Note - The x86 and x64 versions do _not_ produce the same results, as the 6 | // algorithms are optimized for their respective platforms. You can still 7 | // compile and run any of them on any platform, but your performance with the 8 | // non-native version will be less than optimal. 9 | 10 | #include "MurmurHash3.h" 11 | 12 | //----------------------------------------------------------------------------- 13 | // Platform-specific functions and macros 14 | 15 | // Microsoft Visual Studio 16 | 17 | #if defined(_MSC_VER) 18 | 19 | #define FORCE_INLINE __forceinline 20 | 21 | #include 22 | 23 | #define ROTL32(x,y) _rotl(x,y) 24 | #define ROTL64(x,y) _rotl64(x,y) 25 | 26 | #define BIG_CONSTANT(x) (x) 27 | 28 | // Other compilers 29 | 30 | #else // defined(_MSC_VER) 31 | 32 | #if defined(GNUC) && ((GNUC > 4) || (GNUC == 4 && GNUC_MINOR >= 4)) 33 | 34 | /* gcc version >= 4.4 4.1 = RHEL 5, 4.4 = RHEL 6. 
35 | * Don't inline for RHEL 5 gcc which is 4.1 */ 36 | #define FORCE_INLINE attribute((always_inline)) 37 | 38 | #else 39 | 40 | #define FORCE_INLINE 41 | 42 | #endif 43 | 44 | 45 | inline uint32_t rotl32 ( uint32_t x, int8_t r ) 46 | { 47 | return (x << r) | (x >> (32 - r)); 48 | } 49 | 50 | inline uint64_t rotl64 ( uint64_t x, int8_t r ) 51 | { 52 | return (x << r) | (x >> (64 - r)); 53 | } 54 | 55 | #define ROTL32(x,y) rotl32(x,y) 56 | #define ROTL64(x,y) rotl64(x,y) 57 | 58 | #define BIG_CONSTANT(x) (x##LLU) 59 | 60 | #endif // !defined(_MSC_VER) 61 | 62 | //----------------------------------------------------------------------------- 63 | // Block read - if your platform needs to do endian-swapping or can only 64 | // handle aligned reads, do the conversion here 65 | 66 | FORCE_INLINE uint32_t getblock ( const uint32_t * p, int i ) 67 | { 68 | return p[i]; 69 | } 70 | 71 | FORCE_INLINE uint64_t getblock ( const uint64_t * p, int i ) 72 | { 73 | return p[i]; 74 | } 75 | 76 | //----------------------------------------------------------------------------- 77 | // Finalization mix - force all bits of a hash block to avalanche 78 | 79 | FORCE_INLINE uint32_t fmix ( uint32_t h ) 80 | { 81 | h ^= h >> 16; 82 | h *= 0x85ebca6b; 83 | h ^= h >> 13; 84 | h *= 0xc2b2ae35; 85 | h ^= h >> 16; 86 | 87 | return h; 88 | } 89 | 90 | //---------- 91 | 92 | FORCE_INLINE uint64_t fmix ( uint64_t k ) 93 | { 94 | k ^= k >> 33; 95 | k *= BIG_CONSTANT(0xff51afd7ed558ccd); 96 | k ^= k >> 33; 97 | k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53); 98 | k ^= k >> 33; 99 | 100 | return k; 101 | } 102 | 103 | //----------------------------------------------------------------------------- 104 | 105 | void MurmurHash3_x86_32 ( const void * key, int len, 106 | uint32_t seed, void * out ) 107 | { 108 | const uint8_t * data = (const uint8_t*)key; 109 | const int nblocks = len / 4; 110 | 111 | uint32_t h1 = seed; 112 | 113 | uint32_t c1 = 0xcc9e2d51; 114 | uint32_t c2 = 0x1b873593; 115 | 116 | //---------- 117 | // body 118 | 119 | const uint32_t * blocks = (const uint32_t *)(data + nblocks*4); 120 | 121 | for(int i = -nblocks; i; i++) 122 | { 123 | uint32_t k1 = getblock(blocks,i); 124 | 125 | k1 *= c1; 126 | k1 = ROTL32(k1,15); 127 | k1 *= c2; 128 | 129 | h1 ^= k1; 130 | h1 = ROTL32(h1,13); 131 | h1 = h1*5+0xe6546b64; 132 | } 133 | 134 | //---------- 135 | // tail 136 | 137 | const uint8_t * tail = (const uint8_t*)(data + nblocks*4); 138 | 139 | uint32_t k1 = 0; 140 | 141 | switch(len & 3) 142 | { 143 | case 3: k1 ^= tail[2] << 16; 144 | case 2: k1 ^= tail[1] << 8; 145 | case 1: k1 ^= tail[0]; 146 | k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; 147 | }; 148 | 149 | //---------- 150 | // finalization 151 | 152 | h1 ^= len; 153 | 154 | h1 = fmix(h1); 155 | 156 | *(uint32_t*)out = h1; 157 | } 158 | 159 | //----------------------------------------------------------------------------- 160 | 161 | void MurmurHash3_x86_128 ( const void * key, const int len, 162 | uint32_t seed, void * out ) 163 | { 164 | const uint8_t * data = (const uint8_t*)key; 165 | const int nblocks = len / 16; 166 | 167 | uint32_t h1 = seed; 168 | uint32_t h2 = seed; 169 | uint32_t h3 = seed; 170 | uint32_t h4 = seed; 171 | 172 | uint32_t c1 = 0x239b961b; 173 | uint32_t c2 = 0xab0e9789; 174 | uint32_t c3 = 0x38b34ae5; 175 | uint32_t c4 = 0xa1e38b93; 176 | 177 | //---------- 178 | // body 179 | 180 | const uint32_t * blocks = (const uint32_t *)(data + nblocks*16); 181 | 182 | for(int i = -nblocks; i; i++) 183 | { 184 | uint32_t k1 = getblock(blocks,i*4+0); 185 
| uint32_t k2 = getblock(blocks,i*4+1); 186 | uint32_t k3 = getblock(blocks,i*4+2); 187 | uint32_t k4 = getblock(blocks,i*4+3); 188 | 189 | k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; 190 | 191 | h1 = ROTL32(h1,19); h1 += h2; h1 = h1*5+0x561ccd1b; 192 | 193 | k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; 194 | 195 | h2 = ROTL32(h2,17); h2 += h3; h2 = h2*5+0x0bcaa747; 196 | 197 | k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; 198 | 199 | h3 = ROTL32(h3,15); h3 += h4; h3 = h3*5+0x96cd1c35; 200 | 201 | k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; 202 | 203 | h4 = ROTL32(h4,13); h4 += h1; h4 = h4*5+0x32ac3b17; 204 | } 205 | 206 | //---------- 207 | // tail 208 | 209 | const uint8_t * tail = (const uint8_t*)(data + nblocks*16); 210 | 211 | uint32_t k1 = 0; 212 | uint32_t k2 = 0; 213 | uint32_t k3 = 0; 214 | uint32_t k4 = 0; 215 | 216 | switch(len & 15) 217 | { 218 | case 15: k4 ^= tail[14] << 16; 219 | case 14: k4 ^= tail[13] << 8; 220 | case 13: k4 ^= tail[12] << 0; 221 | k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; 222 | 223 | case 12: k3 ^= tail[11] << 24; 224 | case 11: k3 ^= tail[10] << 16; 225 | case 10: k3 ^= tail[ 9] << 8; 226 | case 9: k3 ^= tail[ 8] << 0; 227 | k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; 228 | 229 | case 8: k2 ^= tail[ 7] << 24; 230 | case 7: k2 ^= tail[ 6] << 16; 231 | case 6: k2 ^= tail[ 5] << 8; 232 | case 5: k2 ^= tail[ 4] << 0; 233 | k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; 234 | 235 | case 4: k1 ^= tail[ 3] << 24; 236 | case 3: k1 ^= tail[ 2] << 16; 237 | case 2: k1 ^= tail[ 1] << 8; 238 | case 1: k1 ^= tail[ 0] << 0; 239 | k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; 240 | }; 241 | 242 | //---------- 243 | // finalization 244 | 245 | h1 ^= len; h2 ^= len; h3 ^= len; h4 ^= len; 246 | 247 | h1 += h2; h1 += h3; h1 += h4; 248 | h2 += h1; h3 += h1; h4 += h1; 249 | 250 | h1 = fmix(h1); 251 | h2 = fmix(h2); 252 | h3 = fmix(h3); 253 | h4 = fmix(h4); 254 | 255 | h1 += h2; h1 += h3; h1 += h4; 256 | h2 += h1; h3 += h1; h4 += h1; 257 | 258 | ((uint32_t*)out)[0] = h1; 259 | ((uint32_t*)out)[1] = h2; 260 | ((uint32_t*)out)[2] = h3; 261 | ((uint32_t*)out)[3] = h4; 262 | } 263 | 264 | //----------------------------------------------------------------------------- 265 | 266 | void MurmurHash3_x64_128 ( const void * key, const int len, 267 | const uint32_t seed, void * out ) 268 | { 269 | const uint8_t * data = (const uint8_t*)key; 270 | const int nblocks = len / 16; 271 | 272 | uint64_t h1 = seed; 273 | uint64_t h2 = seed; 274 | 275 | uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5); 276 | uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f); 277 | 278 | //---------- 279 | // body 280 | 281 | const uint64_t * blocks = (const uint64_t *)(data); 282 | 283 | for(int i = 0; i < nblocks; i++) 284 | { 285 | uint64_t k1 = getblock(blocks,i*2+0); 286 | uint64_t k2 = getblock(blocks,i*2+1); 287 | 288 | k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; 289 | 290 | h1 = ROTL64(h1,27); h1 += h2; h1 = h1*5+0x52dce729; 291 | 292 | k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; 293 | 294 | h2 = ROTL64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5; 295 | } 296 | 297 | //---------- 298 | // tail 299 | 300 | const uint8_t * tail = (const uint8_t*)(data + nblocks*16); 301 | 302 | uint64_t k1 = 0; 303 | uint64_t k2 = 0; 304 | 305 | switch(len & 15) 306 | { 307 | case 15: k2 ^= uint64_t(tail[14]) << 48; 308 | case 14: k2 ^= uint64_t(tail[13]) << 40; 309 | case 13: k2 ^= uint64_t(tail[12]) << 32; 310 | case 12: k2 ^= uint64_t(tail[11]) << 24; 311 | case 11: k2 ^= 
uint64_t(tail[10]) << 16; 312 | case 10: k2 ^= uint64_t(tail[ 9]) << 8; 313 | case 9: k2 ^= uint64_t(tail[ 8]) << 0; 314 | k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; 315 | 316 | case 8: k1 ^= uint64_t(tail[ 7]) << 56; 317 | case 7: k1 ^= uint64_t(tail[ 6]) << 48; 318 | case 6: k1 ^= uint64_t(tail[ 5]) << 40; 319 | case 5: k1 ^= uint64_t(tail[ 4]) << 32; 320 | case 4: k1 ^= uint64_t(tail[ 3]) << 24; 321 | case 3: k1 ^= uint64_t(tail[ 2]) << 16; 322 | case 2: k1 ^= uint64_t(tail[ 1]) << 8; 323 | case 1: k1 ^= uint64_t(tail[ 0]) << 0; 324 | k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; 325 | }; 326 | 327 | //---------- 328 | // finalization 329 | 330 | h1 ^= len; h2 ^= len; 331 | 332 | h1 += h2; 333 | h2 += h1; 334 | 335 | h1 = fmix(h1); 336 | h2 = fmix(h2); 337 | 338 | h1 += h2; 339 | h2 += h1; 340 | 341 | ((uint64_t*)out)[0] = h1; 342 | ((uint64_t*)out)[1] = h2; 343 | } 344 | 345 | //----------------------------------------------------------------------------- 346 | 347 | -------------------------------------------------------------------------------- /wordbatch/extractors/MurmurHash3.h: -------------------------------------------------------------------------------- 1 | //----------------------------------------------------------------------------- 2 | // MurmurHash3 was written by Austin Appleby, and is placed in the public 3 | // domain. The author hereby disclaims copyright to this source code. 4 | 5 | #ifndef _MURMURHASH3_H_ 6 | #define _MURMURHASH3_H_ 7 | 8 | //----------------------------------------------------------------------------- 9 | // Platform-specific functions and macros 10 | 11 | // Microsoft Visual Studio 12 | 13 | #if defined(_MSC_VER) 14 | 15 | typedef unsigned char uint8_t; 16 | typedef unsigned long uint32_t; 17 | typedef unsigned __int64 uint64_t; 18 | 19 | // Other compilers 20 | 21 | #else // defined(_MSC_VER) 22 | 23 | #include 24 | 25 | #endif // !defined(_MSC_VER) 26 | 27 | //----------------------------------------------------------------------------- 28 | #ifdef __cplusplus 29 | extern "C" { 30 | #endif 31 | 32 | 33 | void MurmurHash3_x86_32 ( const void * key, int len, uint32_t seed, void * out ); 34 | 35 | void MurmurHash3_x86_128 ( const void * key, int len, uint32_t seed, void * out ); 36 | 37 | void MurmurHash3_x64_128 ( const void * key, int len, uint32_t seed, void * out ); 38 | 39 | #ifdef __cplusplus 40 | } 41 | #endif 42 | 43 | //----------------------------------------------------------------------------- 44 | 45 | #endif // _MURMURHASH3_H_ 46 | -------------------------------------------------------------------------------- /wordbatch/extractors/__init__.py: -------------------------------------------------------------------------------- 1 | from .extractors import * 2 | -------------------------------------------------------------------------------- /wordbatch/extractors/extractors.pyx: -------------------------------------------------------------------------------- 1 | #!python 2 | #cython: boundscheck=False, infer_types=True, wraparound=False, cdivision=True 3 | from __future__ import with_statement 4 | from __future__ import division 5 | from __future__ import absolute_import 6 | from __future__ import print_function 7 | from sklearn.utils.murmurhash import murmurhash3_32 8 | from sklearn.feature_extraction.text import HashingVectorizer 9 | #from nltk.metrics import edit_distance 10 | import scipy.sparse as ssp 11 | import scipy as sp 12 | import numpy as np 13 | import gzip 14 | import lz4framed 15 | import array 16 | from 
wordbatch.data_utils import indlist2csrmatrix 17 | from cpython cimport array 18 | cimport cython 19 | from libc.stdlib cimport abs 20 | from libc.math cimport log, fabs 21 | cimport numpy as np 22 | 23 | np.import_array() 24 | 25 | cdef extern: 26 | void MurmurHash3_x86_32(void *key, int len, np.uint32_t seed, void *out) 27 | 28 | cpdef np.int32_t murmurhash3_bytes_s32(bytes key, unsigned int seed= 0): 29 | cdef np.int32_t out 30 | MurmurHash3_x86_32( key, len(key), seed, &out) 31 | return out 32 | 33 | def save_to_lz4(file, input, dtype, level= 0): 34 | with open(file, 'wb') as f: f.write(lz4framed.compress(np.array(input, dtype=dtype).tostring(), level)) 35 | 36 | def load_from_lz4(file, dtype): 37 | with open(file, 'rb') as f: input= np.fromstring(lz4framed.decompress(f.read()), dtype=dtype) 38 | return input 39 | 40 | def csr_to_lz4(file, features): 41 | save_to_lz4(file, features.indptr, dtype=int) 42 | save_to_lz4(file+".i", features.indices, dtype=int) 43 | save_to_lz4(file+".d", features.data, dtype=np.float64) 44 | 45 | def lz4_to_csr(file): 46 | indptr= load_from_lz4(file, int) 47 | indices= load_from_lz4(file+".i", int) 48 | data= load_from_lz4(file+".d", np.float64) 49 | return ssp.csr_matrix((data, indices, indptr)) 50 | 51 | def batch_transform(args): 52 | return args[1].batch_transform(args[0]) 53 | 54 | cdef class TextRow: 55 | cdef list indices, data 56 | cdef dict fea_weights 57 | 58 | def __init__(self): 59 | self.indices= [] 60 | self.data= [] 61 | self.fea_weights= {} 62 | 63 | cdef append(self, int index, int value, float weight): 64 | self.indices.append(index) 65 | self.data.append(value) 66 | self.fea_weights[index]= weight 67 | 68 | class WordBag: 69 | def __init__(self, *args, **kwargs): 70 | self.dictionary= kwargs.get('dictionary', None) 71 | kwargs.setdefault("norm", 'l2') 72 | kwargs.setdefault("tf", 'log') 73 | kwargs.setdefault("idf", 0.0) 74 | kwargs.setdefault("hash_ngrams", 0) 75 | kwargs.setdefault("hash_ngrams_weights", None) 76 | kwargs.setdefault("hash_size", 10000000) 77 | kwargs.setdefault("hash_polys_window", 0) 78 | kwargs.setdefault("hash_polys_mindf", 5) 79 | kwargs.setdefault("hash_polys_maxdf", 0.5) 80 | kwargs.setdefault("hash_polys_weight", 0.1) 81 | kwargs.setdefault("seed", 0) 82 | for key, value in kwargs.items(): setattr(self, key.lower(), value) 83 | if self.hash_ngrams_weights is None: self.hash_ngrams_weights= [1.0 for _ in range(self.hash_ngrams)] 84 | 85 | def transform_single(self, text): 86 | dft= self.dictionary.dft 87 | word2id= self.dictionary.word2id 88 | cdef int fc_hash_ngrams= self.hash_ngrams, word_id, df= 1, df2, hashed, doc_count= self.dictionary.doc_count, \ 89 | use_idf= 0, seed= self.seed 90 | cdef float idf_lift= 0.0, idf= 1.0, weight, norm= 1.0, norm_idf= 1.0 91 | if self.idf is not None: 92 | use_idf= True 93 | idf_lift= self.idf 94 | norm_idf= 1.0 / log(max(1.0, idf_lift + doc_count)) 95 | cdef int fc_hash_size= self.hash_size 96 | if self.hash_ngrams == 0: hash_size = self.dictionary.max_words 97 | fc_hash_ngrams_weights= self.hash_ngrams_weights 98 | fc_tf= self.tf 99 | fc_norm= self.norm 100 | cdef int fc_hash_polys_window= self.hash_polys_window, fc_hash_polys_mindf= self.hash_polys_mindf 101 | cdef float fc_hash_polys_maxdf= self.hash_polys_maxdf, fc_hash_polys_weight= self.hash_polys_weight 102 | 103 | text= text.split(" ") 104 | cdef TextRow textrow= TextRow() 105 | for x in range(len(text)): 106 | word= text[x] 107 | if word2id is not None: 108 | word_id = word2id.get(word, -1) 109 | if word_id == -1 or 
word_id>= fc_hash_size: continue 110 | df= dft.get(word, 0) 111 | if use_idf: 112 | if df == 0: continue 113 | idf= log(max(1.0, idf_lift + doc_count / df)) * norm_idf 114 | #print(word, idf, df, log(max(1.0, idf_lift + doc_count / df)), norm_idf) 115 | if idf== 0.0: continue 116 | 117 | if fc_hash_ngrams==0: textrow.append(word_id, 1, idf) 118 | 119 | for y in range(min(fc_hash_ngrams, x+1)): 120 | hashed= murmurhash3_bytes_s32((" ".join(text[x-y:x+1])).encode("utf-8"), seed) 121 | weight= fc_hash_ngrams_weights[y] 122 | if weight < 0: weight*= -idf 123 | textrow.append(abs(hashed) % fc_hash_size, (hashed >= 0) * 2 - 1, weight) 124 | 125 | if fc_hash_polys_window!=0: 126 | if doc_count!=0: 127 | if df< fc_hash_polys_mindf or float(df)/self.dictionary.doc_count> fc_hash_polys_maxdf: continue 128 | #for y from max(1, fc_hash_ngrams) <= y < min(fc_hash_polys_window, x+1): 129 | for y in range(1, min(fc_hash_polys_window, x+1)): 130 | word2= text[x-y] 131 | if doc_count!=0: 132 | df2= dft[word2] 133 | if df2< fc_hash_polys_mindf or float(df2)/self.dictionary.doc_count> fc_hash_polys_maxdf: 134 | continue 135 | hashed= murmurhash3_bytes_s32((word+"#"+word2).encode("utf-8"), seed) if word= 0) * 2 - 1, weight 141 | textrow.append(abs(hashed) % fc_hash_size, (hashed >= 0) * 2 - 1, weight) 142 | 143 | cdef np.int32_t size= len(textrow.data) 144 | wordbag= ssp.csr_matrix((textrow.data, textrow.indices, array.array("i", ([0, size]))), 145 | shape=(1, fc_hash_size), dtype=np.float64) 146 | wordbag.sum_duplicates() 147 | 148 | if fc_tf== 'log': wordbag.data= np.log(1.0+np.abs(wordbag.data)) *np.sign(wordbag.data) 149 | elif fc_tf== 'binary': np.sign(wordbag.data, out=wordbag.data) 150 | elif type(fc_tf)== type(1.0): 151 | wordbag.data= ((fc_tf+1.0)*np.abs(wordbag.data))/(fc_tf+np.abs(wordbag.data))*np.sign(wordbag.data) 152 | 153 | size= wordbag.data.shape[0] 154 | fea_weights= textrow.fea_weights 155 | cdef int [:] indices_view= wordbag.indices 156 | cdef double [:] data_view= wordbag.data 157 | 158 | for x in range(size): data_view[x]*= fea_weights[indices_view[x]] 159 | 160 | if fc_norm== 'l0': norm= size 161 | elif fc_norm== 'l1': norm= np.sum(np.abs(data_view)) 162 | elif fc_norm== 'l2': norm= np.sqrt(np.sum([w*w for w in data_view])) 163 | if norm != 0.0: norm= 1.0 / norm 164 | if fc_norm is not None: wordbag.data*= norm 165 | return wordbag 166 | 167 | def transform(self, texts, y= None):#input_split= False, merge_output= True, batcher= None): 168 | return ssp.vstack([self.transform_single(text) for text in texts]) 169 | 170 | def fit(self, texts, y= None): 171 | return self 172 | 173 | def fit_transform(self, texts, y=None): 174 | return self.transform(texts, y=None) 175 | 176 | def save_features(self, file, features): 177 | csr_to_lz4(file, features) 178 | 179 | def load_features(self, file): 180 | return lz4_to_csr(file) 181 | 182 | class WordHash: 183 | def __init__(self, *args, **kwargs): 184 | if "dictionary" in kwargs: kwargs.pop("dictionary") 185 | self.hv = HashingVectorizer(*args, **kwargs) 186 | 187 | def transform(self, texts, y=None): 188 | return self.hv.transform(texts) 189 | 190 | def fit(self, texts, y=None): 191 | return self 192 | 193 | def fit_transform(self, texts, y=None): 194 | return self.transform(texts, y=None) 195 | 196 | def save_features(self, file, features): 197 | csr_to_lz4(file, features) 198 | 199 | def load_features(self, file): 200 | return lz4_to_csr(file) 201 | 202 | 203 | class WordSeq: 204 | def __init__(self, *args, **kwargs): 205 | self.dictionary = 
kwargs.get('dictionary', None) 206 | kwargs.setdefault("seq_maxlen", None) 207 | kwargs.setdefault("seq_padstart", True) 208 | kwargs.setdefault("seq_truncstart", True) 209 | kwargs.setdefault("remove_oovs", False) 210 | kwargs.setdefault("pad_id", 0) 211 | for key, value in kwargs.items(): setattr(self, key.lower(), value) 212 | 213 | def transform_single(self, text): 214 | word2id= self.dictionary.word2id 215 | oov_id= self.dictionary.max_words+1 216 | if self.remove_oovs: wordseq= [word2id[word] for word in text.split(" ") if word in word2id] 217 | else: wordseq= [word2id.get(word, oov_id) for word in text.split(" ")] 218 | if self.seq_maxlen is not None: 219 | if len(wordseq) > self.seq_maxlen: 220 | if self.seq_truncstart: wordseq= wordseq[-self.seq_maxlen:] 221 | else: wordseq= wordseq[:self.seq_maxlen] 222 | else: 223 | if self.seq_padstart== True: wordseq= [self.pad_id] * (self.seq_maxlen - len(wordseq)) + wordseq 224 | else: wordseq+= [self.pad_id] * (self.seq_maxlen - len(wordseq)) 225 | return wordseq 226 | 227 | def transform(self, texts, y= None): 228 | return [self.transform_single(text) for text in texts] 229 | 230 | def fit(self, texts, y=None): 231 | return self 232 | 233 | def fit_transform(self, texts, y=None): 234 | return self.transform(texts, y=None) 235 | 236 | def save_features(self, file, features): 237 | save_to_lz4(file, features, dtype=int) 238 | i= 0 239 | indices= [] 240 | for x in features: 241 | i+= len(x) 242 | indices.append(i) 243 | save_to_lz4(file + ".i", indices, dtype=int) 244 | 245 | def load_features(self, file): 246 | words= load_from_lz4(file, int).tolist() 247 | indices= [0]+load_from_lz4(file + ".i", int).tolist() 248 | return [words[indices[i]:indices[i+1]] for i in range(len(indices)-1)] 249 | 250 | 251 | class WordVec: 252 | def __init__(self, *args, **kwargs): 253 | self.dictionary = kwargs.get('dictionary', None) 254 | kwargs.setdefault("normalize_text", None) 255 | kwargs.setdefault("stemmer", None) 256 | kwargs.setdefault("merge_dict", True) 257 | kwargs.setdefault("normalize_dict", False) 258 | kwargs.setdefault("verbose", 0) 259 | kwargs.setdefault("merge_vectors", "mean") 260 | kwargs.setdefault("normalize_merged", "l2") 261 | kwargs.setdefault("encoding", "utf8") 262 | kwargs.setdefault("shrink_model_transform", True) 263 | kwargs.setdefault("w2v_dim", None) 264 | for key, value in kwargs.items(): setattr(self, key.lower(), value) 265 | if "w2v_model" in kwargs: self.w2v= kwargs["w2v_model"] 266 | else: self.w2v= self.load_w2v(kwargs["wordvec_file"], kwargs['encoding'], kwargs['w2v_dim']) 267 | self.w2v_dim= len(list(self.w2v.values())[0]) 268 | 269 | def load_w2v(self, w2v_file, encoding= "ISO-8859-1", w2v_dim= None): 270 | w2v= {} 271 | from collections import Counter 272 | w2v_counts= Counter() 273 | opn= gzip.open if w2v_file.endswith(".gz") else open 274 | for line in opn(w2v_file, 'rb'): 275 | line= line.decode(encoding).strip().split(" ", 1) 276 | vec= np.array([np.float64(x) for x in line[1].split(" ")]) 277 | if len(vec)<2: continue 278 | if w2v_dim is not None and len(vec)!=w2v_dim: 279 | print("Wrong vector length", len(vec),", should be:", w2v_dim, ":", line) 280 | continue 281 | word= line[0] 282 | if self.normalize_text is not None: word= self.normalize_text(word) 283 | if self.stemmer is not None: word= self.stemmer.stem(word) 284 | if not(self.merge_dict): w2v[word]= vec 285 | else: 286 | w2v_counts[word] += 1 287 | if word in w2v: 288 | w2v[word]+= (vec - w2v[word]) / w2v_counts[word] 289 | if self.verbose>0: 290 | 
print("Merged entry:", word, w2v_counts[word]) 291 | else: w2v[word]= vec 292 | if self.normalize_dict!=False: 293 | for word in w2v: 294 | if self.normalize_dict=="l1": 295 | norm= sum(np.abs(w2v[word])) 296 | else: 297 | norm = np.sqrt(sum(w2v[word] **2)) 298 | if norm!=0: 299 | w2v[word]/= norm 300 | return w2v 301 | 302 | def transform_single(self, text): 303 | text= text.split(" ") 304 | if len(text)==0: return np.zeros(self.w2v_dim) 305 | w2v= self.w2v 306 | vecs= [] 307 | for word in text: 308 | if word in w2v: vecs.append(w2v[word]) 309 | else: vecs.append(np.zeros(self.w2v_dim)) 310 | if self.merge_vectors is not None: #Merge word vectors to a per-document vector 311 | if self.merge_vectors=="mean": #Currently only mean vector suppported, could do max, median, etc. 312 | vec= np.mean(vecs, axis=0) 313 | if self.normalize_merged is not None: #l1 and l2 normalization supported 314 | if self.normalize_merged == "l1": 315 | norm = sum(np.abs(vec)) 316 | else: 317 | norm = np.sqrt(sum(vec ** 2)) 318 | if norm != 0: 319 | vec /= norm 320 | return vec 321 | return vecs 322 | 323 | def transform(self, texts, y=None): 324 | #if batcher is None: return batch_transform(texts) 325 | # if self.shrink_model_transform == True: 326 | # #Send only word vectors occurring in texts to parallel processes. 327 | # #Use to reduce memory footprint with big embedding files. 328 | # d= wordbatch.transformers.dictionary.Dictionary(verbose=0, encode=False).fit(texts, input_split=input_split) 329 | # w2v_model2= {x:self.w2v[x] for x in [z for z in self.w2v.keys() if z in d.dft]} 330 | # fea_cfg2= self.fea_cfg 331 | # fea_cfg2['w2v_model']= w2v_model2 332 | # self_shrunk= WordVec(dictionary=None, fea_cfg=fea_cfg2) 333 | # else: self_shrunk= self 334 | #return batcher.process_batches(batch_transform, texts, [self], input_split=input_split, 335 | # merge_output=merge_output) 336 | return [self.transform_single(text) for text in texts] 337 | 338 | def fit(self, texts, y=None): 339 | return self 340 | 341 | def fit_transform(self, texts, y=None): 342 | return self.transform(texts, y) 343 | 344 | 345 | class Hstack: 346 | def __init__(self, extractors): 347 | self.extractors= extractors 348 | 349 | def transform(self, texts, y= None): 350 | return sp.hstack([x.transform(texts) for x in self.extractors]) 351 | 352 | def fit(self, texts, y=None): 353 | return self 354 | 355 | def fit_transform(self, texts, y=None): 356 | return self.transform(texts, y) 357 | 358 | 359 | class PandasHash: 360 | def __init__(self, *args, **kwargs): 361 | self.col_salt= None 362 | self.col_weight= None 363 | self.col_pick= [] 364 | self.dtype_specific= False 365 | for key, value in kwargs.items(): setattr(self, key.lower(), value) 366 | if self.col_salt is None or len(self.col_salt) == 0: 367 | self.col_salt = ["".join([z[0] for z in x.replace(" ", "_").replace("|", "_").split("_")]) 368 | for x in self.col_pick] 369 | if self.col_weight is None or len(self.col_weight) == 0: self.col_weight = np.ones(len(self.col_pick)) 370 | 371 | def transform(self, df, y= None): 372 | D= self.n_features 373 | col_pick= self.col_pick 374 | col_salt= self.col_salt 375 | col_weight= self.col_weight 376 | if not (self.dtype_specific): 377 | return indlist2csrmatrix( 378 | #indlist=np.array([np.vectorize(lambda x: hash(x) % D)(df[col].astype(str)+y) 379 | # for col, y in zip(col_pick, col_salt)]).T, 380 | indlist= np.array([[murmurhash3_32(x + y) % D for x in df[col].astype(str)] 381 | for col, y in zip(col_pick, col_salt)]).T, 382 | datalist= 
[col_weight] * len(df), 383 | shape= (len(df), D)) 384 | return indlist2csrmatrix( 385 | indlist = np.array([ 386 | [murmurhash3_32(y) % D] * len(df) 387 | if df[col].dtypes.name == 'bool' or np.issubdtype(df[col].dtypes, np.floating) else 388 | [murmurhash3_32(x + y) % D for x in df[col].astype(str)] 389 | for col, y in zip(col_pick, col_salt)]).T, 390 | datalist= col_weight * (np.array([ 391 | df[col] if df[col].dtypes.name == 'bool' or np.issubdtype(df[col].dtypes, np.floating) else 392 | np.ones(len(df)) 393 | for col, y in zip(col_pick, col_salt)]).T), 394 | shape = (len(df), D)) 395 | 396 | def fit(self, texts, y=None): 397 | return self 398 | 399 | def fit_transform(self, texts, y=None): 400 | return self.transform(texts, y) 401 | 402 | 403 | class CategoricalEncoder: 404 | def __init__(self, *args, **kwargs): 405 | self.dictionary= kwargs.get('dictionary', None) 406 | 407 | def transform(self, data, y= None): 408 | return [self.dictionary.word2id.get(x, self.dictionary.max_words+1) for x in data] 409 | 410 | def fit(self, data, y=None): 411 | self.dictionary.prune_dictionary(re_encode=True, prune_dfs=False) 412 | return self 413 | 414 | def fit_transform(self, data, y=None): 415 | self.fit(data) 416 | return self.transform(data, y) -------------------------------------------------------------------------------- /wordbatch/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .ftrl import FTRL 2 | from .ftrl32 import FTRL32 3 | from .fm_ftrl import FM_FTRL 4 | from .nn_relu_h1 import NN_ReLU_H1 5 | from .nn_relu_h2 import NN_ReLU_H2 6 | -------------------------------------------------------------------------------- /wordbatch/models/avx_ext.c: -------------------------------------------------------------------------------- 1 | #include "avx_ext.h" 2 | 3 | #define USE_AVX2 4 | #define USE_OMP 5 | 6 | #ifdef USE_OMP 7 | #include 8 | #endif 9 | 10 | #ifdef USE_AVX2 11 | 12 | #include 13 | 14 | #endif 15 | 16 | //#include //Deprecated 17 | #include 18 | #include 19 | 20 | double predict_fm_ftrl_avx(const int* inds, double* vals, int lenn, double L1, double baL2, double ialpha, double beta, 21 | double* w, double* z, double* n, double* w_fm, double* z_fm, double* n_fm, double weight_fm, int D_fm, 22 | int bias_term, int n_threads) { 23 | double e = 0.0; 24 | double e2 = 0.0; 25 | if (bias_term) 26 | if (*z!=0.0) e += *w = -*z / ((beta + sqrt(*n)) * ialpha); 27 | else *w = 0.0; 28 | int k, ii; 29 | 30 | /* 31 | #ifdef USE_OMP 32 | #pragma omp parallel for 33 | #endif 34 | */ 35 | for (ii = 0; ii < lenn; ii++) { 36 | const int i = inds[ii]+1; 37 | const double zi = z[i]; 38 | const double sign = (zi < 0) ? 
-1.0 : 1.0; 39 | if (sign * zi > L1) { 40 | const double wi = (sign * L1 - zi) / (sqrt(n[i]) * ialpha + baL2); 41 | w[i] = wi; 42 | e += wi * vals[ii]; 43 | } else w[i] = 0.0; 44 | } 45 | 46 | int num_thread = 1; 47 | #ifdef USE_OMP 48 | if (n_threads <= 0) num_thread = omp_get_max_threads(); 49 | else num_thread = n_threads; 50 | #endif 51 | 52 | double* acwfmk = (double*)malloc(sizeof(double) * D_fm * num_thread); 53 | #ifdef USE_OMP 54 | #pragma omp parallel for num_threads(n_threads) private(k) 55 | #endif 56 | for (k = 0; k < D_fm * num_thread; k++) acwfmk[k] = 0.0; 57 | 58 | double* wi2_acc = (double*)malloc(sizeof(double) * num_thread * 4); 59 | 60 | double wi2 = 0.0; 61 | #ifdef USE_OMP 62 | #pragma omp parallel for num_threads(num_thread) private(k) 63 | #endif 64 | for (k = 0; k < num_thread * 4; k++) wi2_acc[k] = 0.0; 65 | 66 | #ifdef USE_OMP 67 | #pragma omp parallel for num_threads(num_thread) private(ii) 68 | #endif 69 | for (ii = 0; ii < lenn; ii++) { 70 | 71 | #ifdef USE_OMP 72 | const int i_thread = omp_get_thread_num(); 73 | #else 74 | const int i_thread = 0; 75 | #endif 76 | 77 | double* pAcwfmk = acwfmk + i_thread * D_fm; 78 | double* wi2_acck = wi2_acc + i_thread * 4; 79 | const int i = inds[ii]+1; 80 | double v = vals[ii]; 81 | int k = 0; 82 | double* z_fmik = z_fm + (i-1) * D_fm; 83 | double* w_fmk = pAcwfmk; 84 | 85 | #ifdef USE_AVX2 86 | __m256d v256 = _mm256_set1_pd(v); 87 | __m256d w2_256 = _mm256_loadu_pd(wi2_acck); 88 | while (k + 3 < D_fm) { 89 | __m256d d = _mm256_mul_pd(_mm256_loadu_pd(z_fmik), v256); 90 | _mm256_storeu_pd(w_fmk, _mm256_add_pd(_mm256_loadu_pd(w_fmk), d)); 91 | w2_256 = _mm256_add_pd(w2_256, _mm256_mul_pd(d, d)); 92 | k += 4; 93 | z_fmik += 4; 94 | w_fmk += 4; 95 | } 96 | _mm256_storeu_pd(wi2_acck, w2_256); 97 | #endif 98 | 99 | // Tail end 100 | double d; 101 | while(k < D_fm) { 102 | pAcwfmk[k++] += d = *z_fmik++ * v; 103 | wi2 += d*d; 104 | } 105 | } 106 | 107 | for (k = 0; k < D_fm; k++) { 108 | double wfmk = 0.0; 109 | for (int i_thread = 0; i_thread < num_thread;) wfmk += acwfmk[i_thread++ * D_fm + k]; 110 | *w_fm++ = wfmk; 111 | e2 += wfmk* wfmk; 112 | } 113 | 114 | for (k = 0; k < num_thread * 4;) wi2 += wi2_acc[k++]; 115 | 116 | free(acwfmk); 117 | free(wi2_acc); 118 | e2 = (e2 - wi2) * 0.5 * weight_fm; 119 | return e + e2; 120 | } 121 | 122 | void update_fm_ftrl_avx(const int* inds, double* vals, int lenn, const double e, double ialpha, double* w, double* z, 123 | double* n, double alpha_fm, const double L2_fm, double* w_fm, double* z_fm, double* n_fm, int D_fm, int bias_term, 124 | int n_threads) { 125 | 126 | #ifdef USE_OMP 127 | int num_thread; 128 | if (n_threads <= 0) num_thread = omp_get_max_threads(); 129 | else num_thread = n_threads; 130 | #endif 131 | 132 | const double e_sq = e * e; 133 | 134 | if (bias_term) { 135 | *z += e - ((sqrt(*n + e_sq) - sqrt(*n)) * ialpha) * *w; 136 | *n += e_sq; 137 | } 138 | const double L2_fme = L2_fm / e; 139 | 140 | int ii; 141 | #ifdef USE_OMP 142 | #pragma omp parallel for num_threads(num_thread) private(ii) 143 | #endif 144 | for (ii = 0; ii < lenn; ii++) { 145 | const int i = inds[ii]+1; 146 | const double v = vals[ii]; 147 | const double g = e * v; 148 | const double g2 = g * g; 149 | const double ni = n[i]; 150 | 151 | z[i] += g - ((sqrt(ni + g2) - sqrt(ni)) * ialpha) * w[i]; 152 | n[i] += g2; 153 | 154 | double* z_fmik = z_fm + (i-1) * D_fm; 155 | double* w_fmk = w_fm; 156 | const double lr = g* alpha_fm / (sqrt(n_fm[i]) + 1.0); 157 | const double reg = v - L2_fme; 158 | 159 | int k 
= 0; 160 | #ifdef USE_AVX2 161 | __m256d reg2 = _mm256_set1_pd(reg); 162 | __m256d lr2 = _mm256_set1_pd(lr); 163 | while (k + 3 < D_fm) { 164 | __m256d z0 = _mm256_loadu_pd(z_fmik); 165 | _mm256_storeu_pd(z_fmik, 166 | _mm256_sub_pd(z0, _mm256_mul_pd(lr2, 167 | _mm256_sub_pd(_mm256_loadu_pd(w_fmk), 168 | _mm256_mul_pd(z0, reg2))))); 169 | w_fmk+= 4; 170 | z_fmik+= 4; 171 | k+= 4; 172 | } 173 | #endif 174 | while (k++ < D_fm) *z_fmik++ -= lr * (*w_fmk++ - *z_fmik * reg); // Tail end 175 | 176 | n_fm[i] += e_sq; 177 | } 178 | } 179 | -------------------------------------------------------------------------------- /wordbatch/models/avx_ext.h: -------------------------------------------------------------------------------- 1 | #ifndef _AVX_EXT_H 2 | #define _AVX_EXT_H 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | double predict_fm_ftrl_avx(const int* inds, double* vals, int lenn, double L1, double baL2, double ialpha, double beta, 9 | double* w, double* z, double* n, double* w_fm, double* z_fm, double* n_fm, double weight_fm, int D_fm, 10 | int bias_term, int nThreads); 11 | 12 | int halfer_EXT_INT(int d); 13 | 14 | void update_fm_ftrl_avx(const int* inds, double* vals, int lenn, const double e, double ialpha, double* w, double* z, 15 | double* n, double alpha_fm, const double L2_fm, double* w_fm, double* z_fm, double* n_fm, int D_fm, int bias_term, 16 | int nThreads); 17 | 18 | int doubler_EXT(int d); 19 | 20 | #ifdef __cplusplus 21 | } 22 | #endif 23 | 24 | #endif // _AVX_EXT_H 25 | -------------------------------------------------------------------------------- /wordbatch/models/fm_ftrl.pyx: -------------------------------------------------------------------------------- 1 | # cython: boundscheck=False, wraparound=False, cdivision=True 2 | import numpy as np 3 | import gzip 4 | cimport cython 5 | from cpython cimport array 6 | import scipy.sparse as ssp 7 | cimport numpy as np 8 | from cython.parallel import prange 9 | from libc.math cimport exp, log, fmax, fmin, sqrt, fabs 10 | import multiprocessing 11 | import sys 12 | import randomgen 13 | 14 | np.import_array() 15 | 16 | cdef extern from "avx_ext.h":# nogil: 17 | void update_fm_ftrl_avx(const int* inds, double* vals, int lenn, const double e, double ialpha, double* w, 18 | double* z, double* n, double alpha_fm, const double L2_fm, double* w_fm, double* z_fm, 19 | double* n_fm, int D_fm, int bias_term, int nThreads); 20 | double predict_fm_ftrl_avx(const int* inds, double* vals, int lenn, double L1, double baL2, double ialpha, 21 | double beta, double* w, double* z, double* n, double* w_fm, double* z_fm, double* n_fm, 22 | double weight_fm, int D_fm, int bias_term, int nThreads); 23 | 24 | 25 | cdef double inv_link_f(double e, int inv_link) nogil: 26 | if inv_link==1: return 1.0 / (1.0 + exp(-fmax(fmin(e, 35.0), -35.0))) #Sigmoid + logloss 27 | return e 28 | 29 | cdef double predict_single(int* inds, double* vals, int lenn, double L1, double baL2, double ialpha, double beta, 30 | double* w, double* z, double* n, double* w_fm, double* z_fm, double* n_fm, double weight_fm, 31 | int D_fm, bint bias_term, int threads) nogil: 32 | cdef int i, ii, k 33 | cdef double sign, zi, d, wi, wi2, wfmk, e= 0.0, e2= 0.0 34 | 35 | if bias_term: 36 | if z[0] != 0: 37 | wi = w[0] = -z[0] / ((beta + sqrt(n[0])) * ialpha) 38 | e += wi 39 | else: w[0] = 0.0 40 | 41 | for ii in prange(lenn, nogil=True, num_threads= threads): 42 | i= inds[ii]+1 43 | zi= z[i] 44 | sign= -1.0 if zi < 0 else 1.0 45 | if sign * zi > L1: 46 | w[i]= wi= (sign * L1 - 
zi) / (sqrt(n[i]) * ialpha + baL2) 47 | e+= wi * vals[ii] 48 | else: w[ii+1] = 0.0 49 | 50 | wi2= 0.0 51 | for k in prange(D_fm, nogil=True, num_threads=threads): 52 | wfmk= 0.0 53 | for ii in range(lenn): 54 | d= z_fm[inds[ii] * D_fm + k] * vals[ii] 55 | wfmk= wfmk+d 56 | wi2+= d **2 57 | e2+= wfmk **2 58 | w_fm[k]= wfmk 59 | e2= (e2- wi2)* 0.5 *weight_fm 60 | return e+e2 61 | 62 | cdef void update_single(int* inds, double* vals, int lenn, double e, double ialpha, double* w, double* z, double* n, 63 | double alpha_fm, double L2_fm, double* w_fm, double* z_fm, double* n_fm, 64 | int D_fm, bint bias_term, int threads) nogil: 65 | cdef int i, ii, k 66 | cdef double g, g2, ni, v, lr, e2= e**2, reg, L2_fme= L2_fm / e 67 | cdef double *z_fmi 68 | if bias_term: #Update bias with FTRL-proximal 69 | g2= e ** 2 70 | ni= n[0] 71 | z[0]+= e - ((sqrt(ni + g2) - sqrt(ni)) * ialpha) * w[0] 72 | n[0]+= g2 73 | 74 | for ii in prange(lenn, nogil=True, num_threads= threads): 75 | #for ii in range(lenn): 76 | i= inds[ii]+1 77 | v= vals[ii] 78 | #Update 1st order model with FTRL-proximal 79 | g= e * v 80 | g2= g * g 81 | ni= n[i] 82 | z[i]+= g - ((sqrt(ni + g2) - sqrt(ni)) * ialpha) * w[i] 83 | n[i]+= g2 84 | 85 | #Update FM with adaptive regularized SGD 86 | z_fmi= z_fm+ (i-1) * D_fm 87 | lr= g* alpha_fm / (sqrt(n_fm[i])+1.0) 88 | reg= v - L2_fme 89 | for k in range(D_fm): z_fmi[k]-= lr * (w_fm[k] - z_fmi[k] * reg) 90 | n_fm[i] += e2 91 | 92 | cdef class FM_FTRL: 93 | cdef const double[:] w 94 | cdef const double[:] z 95 | cdef const double[:] n 96 | cdef const double[:] w_fm 97 | cdef const double[:] z_fm 98 | cdef const double[:] n_fm 99 | 100 | cdef unsigned int threads 101 | cdef unsigned int iters 102 | cdef unsigned int D 103 | cdef unsigned int D_fm 104 | cdef double L1 105 | cdef double L2 106 | cdef double alpha 107 | cdef double beta 108 | cdef double alpha_fm 109 | cdef double L2_fm 110 | cdef double weight_fm 111 | cdef double init_fm 112 | cdef double e_noise 113 | cdef double e_clip 114 | cdef int inv_link 115 | cdef bint bias_term 116 | cdef int use_avx 117 | cdef int seed 118 | cdef int verbose 119 | 120 | def __init__(self, 121 | double alpha=0.02, 122 | double beta=0.01, # ~ alpha/2 123 | double L1=0.0001, 124 | double L2=0.1, 125 | unsigned int D=0, 126 | double alpha_fm=0.03, 127 | double L2_fm= 0.005, 128 | double init_fm= 0.01, 129 | unsigned int D_fm=20, 130 | double weight_fm= 10.0, 131 | double e_noise= 0.0001, 132 | double e_clip= 1.0, 133 | unsigned int iters=5, 134 | inv_link= "identity", 135 | bint bias_term=1, 136 | int threads= 0, 137 | int use_avx=1, 138 | int seed= 0, 139 | int verbose=1): 140 | 141 | self.alpha= alpha 142 | self.beta= beta 143 | self.L1= L1 144 | self.L2= L2 145 | self.D= D 146 | self.alpha_fm= alpha_fm 147 | self.L2_fm= L2_fm 148 | self.init_fm= init_fm 149 | self.D_fm= D_fm 150 | self.weight_fm= weight_fm 151 | self.e_noise= e_noise 152 | self.e_clip= e_clip 153 | self.iters= iters 154 | if threads==0: threads= multiprocessing.cpu_count()-1 155 | self.threads= threads 156 | if inv_link=="sigmoid": self.inv_link= 1 157 | if inv_link=="identity": self.inv_link= 0 158 | self.bias_term= bias_term 159 | self.use_avx = use_avx 160 | self.seed = seed 161 | self.verbose= verbose 162 | self.reset() 163 | 164 | def reset(self): 165 | D= self.D 166 | D_fm= self.D_fm 167 | self.w = np.ones((D+1), dtype=np.float64) 168 | self.z = np.zeros((D+1), dtype=np.float64) 169 | self.n = np.zeros((D+1), dtype=np.float64) 170 | self.w_fm = np.zeros(D_fm, dtype=np.float64) 171 | 
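# FM latent factors: z_fm holds D_fm latent weights per feature, initialized uniformly in
# [-init_fm/2, init_fm/2) from the seeded Xoroshiro128 generator below; w_fm is per-row scratch
# space reused during prediction, and n_fm accumulates squared errors for the adaptive FM learning rate.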
rand= np.random.Generator(randomgen.xoroshiro128.Xoroshiro128(seed= self.seed)) 172 | self.z_fm = (rand.random(D * D_fm) - 0.5) * self.init_fm 173 | self.n_fm = np.zeros(D+1, dtype=np.float64) 174 | 175 | def predict(self, X, int threads= 0): 176 | if threads==0: threads= self.threads 177 | if type(X) != ssp.csr.csr_matrix: X= ssp.csr_matrix(X, dtype=np.float64) 178 | if X.shape[1] != self.D: 179 | print("Dimension mismatch! self.D=", self.D, "X.shape[1]=", X.shape[1]) 180 | # return self.predict_f(np.ascontiguousarray(X.data), np.ascontiguousarray(X.indices), 181 | # np.ascontiguousarray(X.indptr), threads) 182 | return self.predict_f(X.data, X.indices, X.indptr, threads) 183 | 184 | def predict_f(self, np.ndarray[double, ndim=1, mode='c'] X_data, 185 | np.ndarray[int, ndim=1, mode='c'] X_indices, 186 | np.ndarray[int, ndim=1, mode='c'] X_indptr, int threads): 187 | cdef double ialpha= 1.0/self.alpha, L1= self.L1, beta= self.beta, baL2= beta * ialpha + self.L2, \ 188 | weight_fm= self.weight_fm 189 | cdef double *w= &self.w[0], *z= &self.z[0], *n= &self.n[0], *n_fm= &self.n_fm[0], \ 190 | *z_fm= &self.z_fm[0], *w_fm= &self.w_fm[0] 191 | cdef unsigned int D_fm= self.D_fm, k 192 | p= np.zeros(X_indptr.shape[0]-1, dtype= np.float64) 193 | cdef double[:] pp= p 194 | cdef unsigned int lenn, row_count= X_indptr.shape[0]-1, row, ptr 195 | cdef bint bias_term= self.bias_term 196 | for row in range(row_count): 197 | ptr= X_indptr[row] 198 | lenn= X_indptr[row + 1] - ptr 199 | inds= X_indices.data + ptr 200 | vals= X_data.data + ptr 201 | 202 | if self.use_avx == 1: 203 | pp[row]= inv_link_f(predict_fm_ftrl_avx(inds, vals, lenn, 204 | L1, baL2, ialpha, beta, w, z, n, 205 | w_fm, z_fm, n_fm, weight_fm, 206 | D_fm, bias_term, threads), self.inv_link) 207 | else: 208 | pp[row]= inv_link_f(predict_single(inds, vals, lenn, 209 | L1, baL2, ialpha, beta, w, z, n, 210 | w_fm, z_fm, n_fm, weight_fm, 211 | D_fm, bias_term, threads), self.inv_link) 212 | return p 213 | 214 | 215 | def partial_fit(self, X, y, sample_weight= None, int threads = 0, int seed = 0): 216 | return self.fit(X, y, sample_weight= sample_weight, threads = threads, seed = seed, reset= False) 217 | 218 | def fit(self, X, y, sample_weight= None, int threads= 0, int seed= 0, reset= True): 219 | if threads == 0: threads= self.threads 220 | if type(X) != ssp.csr.csr_matrix: X = ssp.csr_matrix(X, dtype=np.float64) 221 | if reset or self.D==0: 222 | self.D= X.shape[1] 223 | self.reset() 224 | elif X.shape[1] != self.D: 225 | print("Dimension mismatch! 
self.D=", self.D, "X.shape[1]=", X.shape[1]) 226 | #if type(y) != np.array: y = np.array(y, dtype=np.float64) 227 | y= np.ascontiguousarray(y, dtype=np.float64) 228 | if sample_weight is not None and type(sample_weight) != np.array: 229 | sample_weight= np.array(sample_weight, dtype=np.float64) 230 | return self.fit_f(X.data, X.indices, X.indptr, y, sample_weight, threads, seed) 231 | 232 | def fit_f(self, np.ndarray[double, ndim=1, mode='c'] X_data, 233 | np.ndarray[int, ndim=1, mode='c'] X_indices, 234 | np.ndarray[int, ndim=1, mode='c'] X_indptr, 235 | np.ndarray[double, ndim=1, mode='c'] y, 236 | sample_weight, 237 | int threads, int seed): 238 | cdef double ialpha= 1.0/self.alpha, L1= self.L1, beta= self.beta, baL2= beta * ialpha + self.L2, \ 239 | alpha_fm= self.alpha_fm, weight_fm= self.weight_fm, L2_fm= self.L2_fm, e, e_total= 0, zfmi, \ 240 | e_noise= self.e_noise, e_clip= self.e_clip, abs_e 241 | cdef double *w= &self.w[0], *z= &self.z[0], *n= &self.n[0], *n_fm= &self.n_fm[0], \ 242 | *z_fm= &self.z_fm[0], *w_fm= &self.w_fm[0], *ys= y.data 243 | cdef int D_fm= self.D_fm, lenn, ptr, row_count= X_indptr.shape[0]-1, row, inv_link= self.inv_link 244 | cdef bint bias_term= self.bias_term 245 | cdef int* inds, indptr 246 | cdef double* vals 247 | rand= np.random.Generator(randomgen.xoroshiro128.Xoroshiro128(seed= self.seed)) 248 | for iter in range(self.iters): 249 | e_total= 0.0 250 | for row in range(row_count): 251 | ptr= X_indptr[row] 252 | lenn= X_indptr[row+1]-ptr 253 | inds= X_indices.data+ptr 254 | vals= X_data.data+ptr 255 | 256 | if self.use_avx == 1: 257 | e = inv_link_f(predict_fm_ftrl_avx(inds, vals, lenn, 258 | L1, baL2, ialpha, beta, w, z, n, 259 | w_fm, z_fm, n_fm, weight_fm, 260 | D_fm, bias_term, threads), inv_link) - ys[row] 261 | else: 262 | e= inv_link_f(predict_single(inds, vals, lenn, 263 | L1, baL2, ialpha, beta, w, z, n, 264 | w_fm, z_fm, n_fm, weight_fm, 265 | D_fm, bias_term, threads), inv_link) -ys[row] 266 | 267 | abs_e= fabs(e) 268 | e_total+= abs_e 269 | e += (rand.random() - 0.5) * e_noise 270 | if abs_e> e_clip: 271 | if e>0: e= e_clip 272 | else: e= -e_clip 273 | if sample_weight is not None: 274 | e*= sample_weight[row] 275 | 276 | if self.use_avx == 1: 277 | update_fm_ftrl_avx(inds, vals, lenn, e, ialpha, w, z, n, alpha_fm, L2_fm, w_fm, z_fm, n_fm, D_fm, 278 | bias_term, threads) 279 | else: 280 | update_single(inds, vals, lenn, e, ialpha, w, z, n, alpha_fm, L2_fm, w_fm, z_fm, n_fm, D_fm, 281 | bias_term, threads) 282 | 283 | if self.verbose>0: print "Total e:", e_total 284 | return self 285 | 286 | def __getstate__(self): 287 | return (self.alpha, 288 | self.beta, 289 | self.L1, 290 | self.L2, 291 | self.alpha_fm, 292 | self.L2_fm, 293 | self.e_noise, 294 | self.e_clip, 295 | self.weight_fm, 296 | self.init_fm, 297 | self.D, 298 | self.D_fm, 299 | self.iters, 300 | np.asarray(self.w), 301 | np.asarray(self.z), 302 | np.asarray(self.n), 303 | np.asarray(self.w_fm), 304 | np.asarray(self.z_fm), 305 | np.asarray(self.n_fm), 306 | self.inv_link, 307 | self.seed, 308 | self.use_avx, 309 | self.bias_term, 310 | self.threads, 311 | self.verbose) 312 | 313 | def __setstate__(self, params): 314 | (self.alpha, 315 | self.beta, 316 | self.L1, 317 | self.L2, 318 | self.alpha_fm, 319 | self.L2_fm, 320 | self.e_noise, 321 | self.e_clip, 322 | self.weight_fm, 323 | self.init_fm, 324 | self.D, 325 | self.D_fm, 326 | self.iters, 327 | self.w, 328 | self.z, 329 | self.n, 330 | self.w_fm, 331 | self.z_fm, 332 | self.n_fm, 333 | self.inv_link, 334 | self.seed, 335 | 
self.use_avx, 336 | self.bias_term, 337 | self.threads, 338 | self.verbose)= params 339 | -------------------------------------------------------------------------------- /wordbatch/models/ftrl.pyx: -------------------------------------------------------------------------------- 1 | # cython: boundscheck=False, wraparound=False, cdivision=True 2 | import numpy as np 3 | cimport cython 4 | from cpython cimport array 5 | import scipy.sparse as ssp 6 | cimport numpy as np 7 | from cython.parallel import prange 8 | from libc.math cimport exp, log, fmax, fmin, sqrt, fabs 9 | import multiprocessing 10 | import randomgen 11 | 12 | np.import_array() 13 | 14 | cdef double inv_link_f(double e, int inv_link) nogil: 15 | if inv_link==1: return 1.0 / (1.0 + exp(-fmax(fmin(e, 35.0), -35.0))) #Sigmoid + logloss 16 | return e 17 | 18 | cdef double predict_single_finalized(int* inds, double* vals, int lenn, double* w, bint bias_term, int threads) nogil: 19 | cdef int ii 20 | cdef double e= 0.0 21 | if bias_term: 22 | e += w[0] 23 | for ii in prange(lenn, nogil=True, num_threads= threads): 24 | e+= w[inds[ii]+1] * vals[ii] 25 | return e 26 | 27 | cdef double predict_single(int* inds, double* vals, int lenn, double L1, double baL2, double ialpha, double beta, 28 | double* w, double* z, double* n, bint bias_term, int threads) nogil: 29 | cdef int i, ii 30 | cdef double sign, zi, wi 31 | cdef double e= 0.0 32 | if bias_term: 33 | if z[0] != 0: 34 | wi = w[0] = -z[0] / ((beta + sqrt(n[0])) * ialpha) 35 | e += wi 36 | else: w[0] = 0.0 37 | 38 | for ii in prange(lenn, nogil=True, num_threads= threads): 39 | i= inds[ii]+1 40 | zi= z[i] 41 | sign = -1.0 if zi < 0 else 1.0 42 | if sign * zi > L1: 43 | w[i]= wi= (sign * L1 - zi) / (sqrt(n[i]) * ialpha + baL2) 44 | e+= wi * vals[ii] 45 | else: w[i]= 0.0 46 | return e 47 | 48 | cdef void update_single(int* inds, double* vals, int lenn, double e, double ialpha, double* w, double* z, 49 | double* n, bint bias_term, int threads) nogil: 50 | cdef int i, ii 51 | cdef double g, g2, ni 52 | if bias_term: 53 | g2= e ** 2 54 | ni= n[0] 55 | z[0]+= e - ((sqrt(ni + g2) - sqrt(ni)) * ialpha) * w[0] 56 | n[0]+= g2 57 | 58 | for ii in prange(lenn, nogil=True, num_threads= threads): 59 | i= inds[ii]+1 60 | g= e * vals[ii] 61 | g2= g ** 2 62 | ni= n[i] 63 | #z[i]+= g - ((sqrt(ni + g2) - sqrt(ni)) * ialpha) * w[ii+1] 64 | z[i]+= g - ((sqrt(ni + g2) - sqrt(ni)) * ialpha) * w[i] 65 | n[i]+= g2 66 | 67 | cdef class FTRL: 68 | cdef const double[:] w 69 | cdef const double[:] z 70 | cdef const double[:] n 71 | 72 | cdef unsigned int threads 73 | cdef unsigned int iters 74 | cdef unsigned int D 75 | cdef double L1 76 | cdef double L2 77 | cdef double alpha 78 | cdef double beta 79 | cdef double init 80 | cdef double e_clip 81 | cdef int inv_link 82 | cdef bint bias_term 83 | cdef int seed 84 | cdef int verbose 85 | cdef bint model_finalized 86 | 87 | def __init__(self, 88 | double alpha=0.1, 89 | double beta=1.0, 90 | double L1=1.0, 91 | double L2=1.0, 92 | unsigned int D=0, 93 | double init= 0.0, 94 | unsigned int iters=10, 95 | double e_clip= 1.0, 96 | int threads= 0, 97 | inv_link= "sigmoid", 98 | bint bias_term=1, 99 | int seed= 0, 100 | int verbose=1): 101 | 102 | self.alpha= alpha 103 | self.beta= beta 104 | self.L1= L1 105 | self.L2= L2 106 | self.init= init 107 | self.e_clip= e_clip 108 | self.D= D 109 | self.iters= iters 110 | if threads==0: threads= multiprocessing.cpu_count()-1 111 | self.threads= threads 112 | if inv_link=="sigmoid": self.inv_link= 1 113 | if 
inv_link=="identity": self.inv_link= 0 114 | self.bias_term= bias_term 115 | self.seed = seed 116 | self.verbose= verbose 117 | self.model_finalized= False 118 | self.reset() 119 | 120 | def reset(self): 121 | D= self.D 122 | self.w = np.zeros((D+1,), dtype=np.float64) 123 | if self.init==0: 124 | self.z = np.zeros((D+1,), dtype=np.float64) 125 | else: 126 | rand= randomgen.xoroshiro128.Xoroshiro128(seed= self.seed).generator 127 | self.z = (rand.random_sample(D+1) - 0.5) * self.init 128 | self.n = np.zeros((D+1,), dtype=np.float64) 129 | self.model_finalized= False 130 | 131 | def predict(self, X, int threads= 0): 132 | if threads==0: threads= self.threads 133 | if type(X) != ssp.csr.csr_matrix: X= ssp.csr_matrix(X, dtype=np.float64) 134 | if X.shape[1] != self.D: 135 | print("Dimension mismatch! self.D=", self.D, "X.shape[1]=", X.shape[1]) 136 | # return self.predict_f(X, np.ascontiguousarray(X.data), np.ascontiguousarray(X.indices), 137 | # np.ascontiguousarray(X.indptr), threads) 138 | return self.predict_f(X.data, X.indices, X.indptr, threads) 139 | 140 | def predict_f(self, np.ndarray[double, ndim=1, mode='c'] X_data, 141 | np.ndarray[int, ndim=1, mode='c'] X_indices, 142 | np.ndarray[int, ndim=1, mode='c'] X_indptr, int threads): 143 | cdef double ialpha= 1.0/self.alpha, L1= self.L1, beta= self.beta, baL2= beta * ialpha + self.L2 144 | p= np.zeros(X_indptr.shape[0]-1, dtype= np.float64) 145 | cdef double *w= &self.w[0], *z= &self.z[0], *n= &self.n[0] 146 | cdef double[:] pp= p 147 | cdef unsigned lenn, row_count= X_indptr.shape[0]-1, row, ptr 148 | cdef bint bias_term= self.bias_term 149 | for row in range(row_count): 150 | ptr= X_indptr[row] 151 | lenn= X_indptr[row + 1] - ptr 152 | inds= X_indices.data + ptr 153 | vals= X_data.data + ptr 154 | if self.model_finalized: 155 | pp[row]= inv_link_f(predict_single_finalized(inds, vals, lenn, w, bias_term, threads), self.inv_link) 156 | else: 157 | pp[row]= inv_link_f(predict_single(inds, vals, lenn, L1, baL2, ialpha, beta, w, z, n, 158 | bias_term, threads), self.inv_link) 159 | return p 160 | 161 | def partial_fit(self, X, y, sample_weight= None, int threads = 0): 162 | return self.fit(X, y, sample_weight= sample_weight, threads = threads, reset= False) 163 | 164 | def fit(self, X, y, sample_weight= None, int threads= 0, reset= True): 165 | if threads == 0: threads= self.threads 166 | if type(X) != ssp.csr.csr_matrix: X = ssp.csr_matrix(X, dtype=np.float64) 167 | if reset or self.D==0: 168 | self.D= X.shape[1] 169 | self.reset() 170 | elif X.shape[1] != self.D: 171 | print("Dimension mismatch! 
self.D=", self.D, "X.shape[1]=", X.shape[1]) 172 | #if type(y) != np.array: y = np.array(y, dtype=np.float64) 173 | y= np.ascontiguousarray(y, dtype=np.float64) 174 | if sample_weight is not None and type(sample_weight) != np.array: 175 | sample_weight= np.array(sample_weight, dtype=np.float64) 176 | # self.fit_f(X, np.ascontiguousarray(X.data), np.ascontiguousarray(X.indices), 177 | # np.ascontiguousarray(X.indptr), y, threads) 178 | return self.fit_f(X.data, X.indices, X.indptr, y, sample_weight, threads) 179 | 180 | def fit_f(self, np.ndarray[double, ndim=1, mode='c'] X_data, 181 | np.ndarray[int, ndim=1, mode='c'] X_indices, 182 | np.ndarray[int, ndim=1, mode='c'] X_indptr, 183 | np.ndarray[double, ndim=1, mode='c'] y, 184 | sample_weight, int threads): 185 | cdef double ialpha= 1.0/self.alpha, L1= self.L1, beta= self.beta, baL2= beta * ialpha + self.L2, e, e_total= 0,\ 186 | e_clip= self.e_clip, abs_e 187 | cdef double *w= &self.w[0], *z= &self.z[0], *n= &self.n[0], *ys= y.data 188 | cdef int lenn, ptr, row_count= X_indptr.shape[0]-1, row, inv_link= self.inv_link, j=0, jj 189 | cdef bint bias_term= self.bias_term 190 | cdef int* inds, indptr 191 | cdef double* vals 192 | for iter in range(self.iters): 193 | e_total= 0.0 194 | for row in range(row_count): 195 | ptr= X_indptr[row] 196 | lenn= X_indptr[row+1]-ptr 197 | inds= X_indices.data+ptr 198 | vals= X_data.data+ptr 199 | e= inv_link_f(predict_single(inds, vals, lenn, L1, baL2, ialpha, beta, w, z, n, bias_term, threads), 200 | inv_link)-ys[row] 201 | abs_e= fabs(e) 202 | e_total+= abs_e 203 | if abs_e > e_clip: 204 | if e > 0: e= e_clip 205 | else: e= -e_clip 206 | if sample_weight is not None: 207 | e*= sample_weight[row] 208 | update_single(inds, vals, lenn, e, ialpha, w, z, n, bias_term, threads) 209 | if self.verbose > 0: print "Total e:", e_total 210 | return self 211 | 212 | def finalize_model(self): 213 | D= self.D 214 | indices = np.arange(start=0, stop=D, step=1, dtype=np.int32) 215 | indptr= np.array([0, D], dtype=np.int32) 216 | data = np.zeros(D, dtype=np.float64) 217 | self.predict_f(data, indices, indptr, threads= self.threads) 218 | del(indices, indptr, data) 219 | self.z = np.zeros(0, dtype=np.float64) 220 | self.n = np.zeros(0, dtype=np.float64) 221 | self.model_finalized= True 222 | 223 | def __getstate__(self): 224 | return (self.alpha, self.beta, self.L1, self.L2, self.e_clip, self.D, self.init, self.seed, self.iters, 225 | np.asarray(self.w), np.asarray(self.z), np.asarray(self.n), self.inv_link, self.threads, self.bias_term, 226 | self.model_finalized, self.verbose) 227 | 228 | def __setstate__(self, params): 229 | (self.alpha, self.beta, self.L1, self.L2, self.e_clip, self.D, self.init, self.seed, self.iters, self.w, 230 | self.z, self.n, self.inv_link, self.threads, self.bias_term, self.model_finalized, self.verbose)= params 231 | -------------------------------------------------------------------------------- /wordbatch/models/ftrl32.pyx: -------------------------------------------------------------------------------- 1 | # cython: boundscheck=False, wraparound=False, cdivision=True 2 | import numpy as np 3 | cimport cython 4 | from cpython cimport array 5 | import scipy.sparse as ssp 6 | cimport numpy as np 7 | from cython.parallel import prange 8 | from libc.math cimport exp, log, fmax, fmin, sqrt, fabs 9 | import multiprocessing 10 | import randomgen 11 | 12 | np.import_array() 13 | 14 | cdef double inv_link_f(double e, int inv_link) nogil: 15 | if inv_link==1: return 1.0 / (1.0 + exp(-fmax(fmin(e, 35.0), 
-35.0))) #Sigmoid + logloss 16 | return e 17 | 18 | cdef double predict_single_finalized(int* inds, double* vals, int lenn, float* w, bint bias_term, int threads) nogil: 19 | cdef int ii 20 | cdef double e= 0.0 21 | if bias_term: 22 | e += w[0] 23 | for ii in prange(lenn, nogil=True, num_threads= threads): 24 | e+= w[inds[ii]+1] * vals[ii] 25 | return e 26 | 27 | cdef double predict_single(int* inds, double* vals, int lenn, double L1, double baL2, double ialpha, double beta, 28 | float* w, float* z, float* n, bint bias_term, int threads) nogil: 29 | cdef int i, ii 30 | cdef double sign, zi, wi 31 | cdef double e= 0.0 32 | if bias_term: 33 | if z[0] != 0: 34 | wi = w[0] = -z[0] / ((beta + sqrt(n[0])) * ialpha) 35 | e += wi 36 | else: w[0] = 0.0 37 | 38 | for ii in prange(lenn, nogil=True, num_threads= threads): 39 | i= inds[ii]+1 40 | zi= z[i] 41 | sign = -1.0 if zi < 0 else 1.0 42 | if sign * zi > L1: 43 | w[i]= wi= (sign * L1 - zi) / (sqrt(n[i]) * ialpha + baL2) 44 | e+= wi * vals[ii] 45 | else: w[i]= 0.0 46 | return e 47 | 48 | cdef void update_single(int* inds, double* vals, int lenn, double e, double ialpha, float* w, float* z, 49 | float* n, bint bias_term, int threads) nogil: 50 | cdef int i, ii 51 | cdef double g, g2, ni 52 | if bias_term: 53 | g2= e ** 2 54 | ni= n[0] 55 | z[0]+= e - ((sqrt(ni + g2) - sqrt(ni)) * ialpha) * w[0] 56 | n[0]+= g2 57 | 58 | for ii in prange(lenn, nogil=True, num_threads= threads): 59 | i= inds[ii]+1 60 | g= e * vals[ii] 61 | g2= g ** 2 62 | ni= n[i] 63 | #z[i]+= g - ((sqrt(ni + g2) - sqrt(ni)) * ialpha) * w[ii+1] 64 | z[i]+= g - ((sqrt(ni + g2) - sqrt(ni)) * ialpha) * w[i] 65 | n[i]+= g2 66 | 67 | cdef class FTRL32: 68 | cdef const float[:] w 69 | cdef const float[:] z 70 | cdef const float[:] n 71 | 72 | cdef unsigned int threads 73 | cdef unsigned int iters 74 | cdef unsigned int D 75 | cdef double L1 76 | cdef double L2 77 | cdef double alpha 78 | cdef double beta 79 | cdef double init 80 | cdef double e_clip 81 | cdef int inv_link 82 | cdef bint bias_term 83 | cdef int seed 84 | cdef int verbose 85 | cdef bint model_finalized 86 | 87 | def __init__(self, 88 | double alpha=0.1, 89 | double beta=1.0, 90 | double L1=1.0, 91 | double L2=1.0, 92 | unsigned int D=0, 93 | double init= 0.0, 94 | unsigned int iters=10, 95 | double e_clip= 1.0, 96 | int threads= 0, 97 | inv_link= "sigmoid", 98 | bint bias_term=1, 99 | int seed= 0, 100 | int verbose=1): 101 | 102 | self.alpha= alpha 103 | self.beta= beta 104 | self.L1= L1 105 | self.L2= L2 106 | self.init= init 107 | self.e_clip= e_clip 108 | self.D= D 109 | self.iters= iters 110 | if threads==0: threads= multiprocessing.cpu_count()-1 111 | self.threads= threads 112 | if inv_link=="sigmoid": self.inv_link= 1 113 | if inv_link=="identity": self.inv_link= 0 114 | self.bias_term= bias_term 115 | self.seed = seed 116 | self.verbose= verbose 117 | self.model_finalized= False 118 | self.reset() 119 | 120 | def reset(self): 121 | D= self.D 122 | self.w = np.zeros((D+1,), dtype=np.float32) 123 | if self.init==0: 124 | self.z = np.zeros((D+1,), dtype=np.float32) 125 | else: 126 | rand= randomgen.xoroshiro128.Xoroshiro128(seed= self.seed).generator 127 | self.z = np.float32((rand.random_sample(D+1) - 0.5) * self.init) 128 | self.n = np.zeros((D+1,), dtype=np.float32) 129 | self.model_finalized= False 130 | 131 | def predict(self, X, int threads= 0): 132 | if threads==0: threads= self.threads 133 | if type(X) != ssp.csr.csr_matrix: X= ssp.csr_matrix(X, dtype=np.float64) 134 | if X.shape[1] != self.D: 135 | 
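# Note: a feature-dimension mismatch only prints a warning here; prediction still proceeds
# using the model's stored self.D.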
print("Dimension mismatch! self.D=", self.D, "X.shape[1]=", X.shape[1]) 136 | # return self.predict_f(X, np.ascontiguousarray(X.data), np.ascontiguousarray(X.indices), 137 | # np.ascontiguousarray(X.indptr), threads) 138 | return self.predict_f(X.data, X.indices, X.indptr, threads) 139 | 140 | def predict_f(self, np.ndarray[double, ndim=1, mode='c'] X_data, 141 | np.ndarray[int, ndim=1, mode='c'] X_indices, 142 | np.ndarray[int, ndim=1, mode='c'] X_indptr, int threads): 143 | cdef double ialpha= 1.0/self.alpha, L1= self.L1, beta= self.beta, baL2= beta * ialpha + self.L2 144 | p= np.zeros(X_indptr.shape[0]-1, dtype= np.float64) 145 | cdef float *w= &self.w[0], *z= &self.z[0], *n= &self.n[0] 146 | cdef double[:] pp= p 147 | cdef unsigned lenn, row_count= X_indptr.shape[0]-1, row, ptr 148 | cdef bint bias_term= self.bias_term 149 | for row in range(row_count): 150 | ptr= X_indptr[row] 151 | lenn= X_indptr[row + 1] - ptr 152 | inds= X_indices.data + ptr 153 | vals= X_data.data + ptr 154 | if self.model_finalized: 155 | pp[row]= inv_link_f(predict_single_finalized(inds, vals, lenn, w, bias_term, threads), self.inv_link) 156 | else: 157 | pp[row]= inv_link_f(predict_single(inds, vals, lenn, L1, baL2, ialpha, beta, w, z, n, 158 | bias_term, threads), self.inv_link) 159 | return p 160 | 161 | def partial_fit(self, X, y, sample_weight= None, int threads = 0): 162 | return self.fit(X, y, sample_weight= sample_weight, threads = threads, reset= False) 163 | 164 | def fit(self, X, y, sample_weight= None, int threads= 0, reset= True): 165 | if threads == 0: threads= self.threads 166 | if type(X) != ssp.csr.csr_matrix: X = ssp.csr_matrix(X, dtype=np.float64) 167 | if reset or self.D==0: 168 | self.D= X.shape[1] 169 | self.reset() 170 | elif X.shape[1] != self.D: 171 | print("Dimension mismatch! 
self.D=", self.D, "X.shape[1]=", X.shape[1]) 172 | #if type(y) != np.array: y = np.array(y, dtype=np.float64) 173 | y= np.ascontiguousarray(y, dtype=np.float64) 174 | if sample_weight is not None and type(sample_weight) != np.array: 175 | sample_weight= np.array(sample_weight, dtype=np.float64) 176 | # self.fit_f(X, np.ascontiguousarray(X.data), np.ascontiguousarray(X.indices), 177 | # np.ascontiguousarray(X.indptr), y, threads) 178 | return self.fit_f(X.data, X.indices, X.indptr, y, sample_weight, threads) 179 | 180 | def fit_f(self, np.ndarray[double, ndim=1, mode='c'] X_data, 181 | np.ndarray[int, ndim=1, mode='c'] X_indices, 182 | np.ndarray[int, ndim=1, mode='c'] X_indptr, 183 | np.ndarray[double, ndim=1, mode='c'] y, 184 | sample_weight, int threads): 185 | cdef double ialpha= 1.0/self.alpha, L1= self.L1, beta= self.beta, baL2= beta * ialpha + self.L2, e, e_total= 0,\ 186 | e_clip= self.e_clip, abs_e 187 | cdef float *w= &self.w[0], *z= &self.z[0], *n= &self.n[0] 188 | cdef double *ys= y.data 189 | cdef int lenn, ptr, row_count= X_indptr.shape[0]-1, row, inv_link= self.inv_link, j=0, jj 190 | cdef bint bias_term= self.bias_term 191 | cdef int* inds, indptr 192 | cdef double* vals 193 | for iter in range(self.iters): 194 | e_total= 0.0 195 | for row in range(row_count): 196 | ptr= X_indptr[row] 197 | lenn= X_indptr[row+1]-ptr 198 | inds= X_indices.data+ptr 199 | vals= X_data.data+ptr 200 | e= inv_link_f(predict_single(inds, vals, lenn, L1, baL2, ialpha, beta, w, z, n, bias_term, threads), 201 | inv_link)-ys[row] 202 | abs_e= fabs(e) 203 | e_total+= abs_e 204 | if abs_e > e_clip: 205 | if e > 0: e= e_clip 206 | else: e= -e_clip 207 | if sample_weight is not None: 208 | e*= sample_weight[row] 209 | update_single(inds, vals, lenn, e, ialpha, w, z, n, bias_term, threads) 210 | if self.verbose > 0: print "Total e:", e_total 211 | return self 212 | 213 | def finalize_model(self): 214 | D= self.D 215 | indices = np.arange(start=0, stop=D, step=1, dtype=np.int32) 216 | indptr= np.array([0, D], dtype=np.int32) 217 | data = np.zeros(D, dtype=np.float64) 218 | self.predict_f(data, indices, indptr, threads= self.threads) 219 | del(indices, indptr, data) 220 | self.z = np.zeros(0, dtype=np.float32) 221 | self.n = np.zeros(0, dtype=np.float32) 222 | self.model_finalized= True 223 | 224 | def __getstate__(self): 225 | return (self.alpha, self.beta, self.L1, self.L2, self.e_clip, self.D, self.init, self.seed, self.iters, 226 | np.asarray(self.w), np.asarray(self.z), np.asarray(self.n), self.inv_link, self.threads, self.bias_term, 227 | self.model_finalized, self.verbose) 228 | 229 | def __setstate__(self, params): 230 | (self.alpha, self.beta, self.L1, self.L2, self.e_clip, self.D, self.init, self.seed, self.iters, self.w, 231 | self.z, self.n, self.inv_link, self.threads, self.bias_term, self.model_finalized, self.verbose)= params 232 | -------------------------------------------------------------------------------- /wordbatch/models/nn_relu_h1.pyx: -------------------------------------------------------------------------------- 1 | # cython: boundscheck=False, wraparound=False, cdivision=True 2 | import numpy as np 3 | import gzip 4 | cimport cython 5 | from cpython cimport array 6 | import scipy.sparse as ssp 7 | cimport numpy as np 8 | from cython.parallel import prange 9 | from libc.math cimport exp, log, fmax, fmin, sqrt, fabs 10 | import multiprocessing 11 | import sys 12 | import randomgen 13 | 14 | np.import_array() 15 | 16 | cdef double inv_link_f(double e, int inv_link) nogil: 17 | if 
inv_link==1: return 1.0 / (1.0 + exp(-fmax(fmin(e, 35.0), -35.0))) #Sigmoid + logloss 18 | return e 19 | 20 | cdef double predict_single(int* inds, double* vals, int lenn, int D, int D_nn, 21 | double* w0, double* w1, double* z, int threads) nogil: 22 | cdef int j, i, ii, DD_nn= D*D_nn 23 | cdef double p, v, zj 24 | p= w1[D_nn] 25 | for j in prange(D_nn, nogil=True, num_threads= threads): 26 | zj= w0[DD_nn+j] 27 | for ii in range(lenn): 28 | zj= zj+ w0[inds[ii]*D_nn+j] * vals[ii] 29 | if zj<=0: z[j]= 0 30 | else: 31 | z[j]= zj 32 | p+= w1[j] * zj 33 | return p 34 | 35 | cdef void update_single(int* inds, double* vals, int lenn, int D, int D_nn, double e, double alpha, 36 | double L2, double* w0, double* w1, double* z, double* c0, double* c1, int threads) nogil: 37 | cdef int i, ii, j, DD_nn= D*D_nn, iDnnj 38 | cdef double dldy= e, dldz, dldw1, dldw0 39 | w1[D_nn]-= (dldy+ L2 * w1[D_nn]) * alpha 40 | #for j in prange(D_nn, nogil=True, num_threads=threads): 41 | for j in range(D_nn): 42 | if z[j]==0: continue 43 | dldw1= dldy * z[j] 44 | w1[j]-= (dldw1 + L2 * w1[j]) * alpha/(sqrt(c1[j])+1) 45 | dldz= dldy * w1[j] 46 | w0[DD_nn+j]-= (dldz+ L2 *w0[DD_nn+j]) * alpha/(sqrt(c1[j])+1) 47 | 48 | #for ii in range(lenn): 49 | for ii in prange(lenn, nogil=True, num_threads=threads): 50 | i= inds[ii] 51 | dldw0= dldz * vals[ii] 52 | w0[i * D_nn + j]-= (dldw0 + L2 *w0[i * D_nn + j]) * alpha/(sqrt(c0[i])+1) 53 | c0[i]+= fabs(dldw0) 54 | c1[j]+= fabs(dldw1) 55 | 56 | cdef class NN_ReLU_H1: 57 | cdef const double[:] w0 58 | cdef const double[:] w1 59 | cdef const double[:] z 60 | cdef const double[:] c0 61 | cdef const double[:] c1 62 | 63 | cdef unsigned int threads 64 | cdef unsigned int iters 65 | cdef int D 66 | cdef int D_nn 67 | cdef double init_nn 68 | 69 | cdef double L2 70 | cdef double alpha 71 | cdef double e_noise 72 | cdef double e_clip 73 | cdef int inv_link 74 | cdef int seed 75 | cdef int verbose 76 | 77 | def __init__(self, 78 | double alpha=0.1, 79 | double L2=0.001, 80 | int D=0, 81 | int D_nn=30, 82 | double init_nn=0.01, 83 | double e_noise=0.0001, 84 | double e_clip=1.0, 85 | unsigned int iters=4, 86 | inv_link= "identity", 87 | int threads= 0, 88 | int seed= 0, 89 | int verbose=1): 90 | 91 | self.alpha= alpha 92 | self.L2= L2 93 | self.D= D 94 | self.D_nn= D_nn 95 | self.init_nn= init_nn 96 | self.e_noise= e_noise 97 | self.e_clip= e_clip 98 | self.iters= iters 99 | if threads==0: threads= multiprocessing.cpu_count()-1 100 | self.threads= threads 101 | if inv_link=="sigmoid": self.inv_link= 1 102 | if inv_link=="identity": self.inv_link= 0 103 | self.seed = seed 104 | self.verbose= verbose 105 | self.reset() 106 | 107 | def reset(self): 108 | init_nn= self.init_nn 109 | D= self.D 110 | D_nn= self.D_nn 111 | rand= np.random.Generator(randomgen.xoroshiro128.Xoroshiro128(seed= self.seed)) 112 | self.w0 = (rand.random((D + 1) * D_nn) - 0.5) * init_nn 113 | self.w1 = (rand.random(D_nn + 1) - 0.5) * init_nn 114 | self.z = np.zeros((D_nn,), dtype=np.float64) 115 | self.c0 = np.zeros((D,), dtype=np.float64) 116 | self.c1 = np.zeros((D_nn,), dtype=np.float64) 117 | 118 | def predict(self, X, int threads= 0): 119 | if threads==0: threads= self.threads 120 | if type(X) != ssp.csr.csr_matrix: X= ssp.csr_matrix(X, dtype=np.float64) 121 | if X.shape[1] != self.D: 122 | print("Dimension mismatch! 
self.D=", self.D, "X.shape[1]=", X.shape[1]) 123 | return self.predict_f(X.data, X.indices, X.indptr, threads) 124 | 125 | def predict_f(self, np.ndarray[double, ndim=1, mode='c'] X_data, 126 | np.ndarray[int, ndim=1, mode='c'] X_indices, 127 | np.ndarray[int, ndim=1, mode='c'] X_indptr, int threads): 128 | cdef double alpha= self.alpha, L2= self.L2 129 | p= np.zeros(X_indptr.shape[0]-1, dtype= np.float64) 130 | cdef double *w0= &self.w0[0], *w1= &self.w1[0], *z= &self.z[0] 131 | cdef double[:] pp= p 132 | cdef int lenn, D= self.D, D_nn= self.D_nn, row_count= X_indptr.shape[0]-1, row, ptr 133 | for row in range(row_count): 134 | ptr= X_indptr[row] 135 | lenn= X_indptr[row + 1] - ptr 136 | inds= X_indices.data + ptr 137 | vals= X_data.data + ptr 138 | pp[row]= inv_link_f(predict_single(inds, vals, lenn, D, D_nn, w0, w1, z, threads), self.inv_link) 139 | return p 140 | 141 | def partial_fit(self, X, y, sample_weight= None, int threads = 0, int seed = 0): 142 | return self.fit(X, y, sample_weight= sample_weight, threads = threads, seed = seed, reset= False) 143 | 144 | def fit(self, X, y, sample_weight= None, int threads= 0, int seed= 0, reset= True): 145 | if threads == 0: threads= self.threads 146 | if type(X) != ssp.csr.csr_matrix: X = ssp.csr_matrix(X, dtype=np.float64) 147 | if reset or self.D==0: 148 | self.D= X.shape[1] 149 | self.reset() 150 | elif X.shape[1] != self.D: 151 | print("Dimension mismatch! self.D=", self.D, "X.shape[1]=", X.shape[1]) 152 | if type(y) != np.array: y = np.array(y, dtype=np.float64) 153 | # self.fit_f(X, np.ascontiguousarray(X.data), np.ascontiguousarray(X.indices), 154 | # np.ascontiguousarray(X.indptr), y, threads) 155 | if sample_weight is not None and type(sample_weight) != np.array: 156 | sample_weight= np.array(sample_weight, dtype=np.float64) 157 | return self.fit_f(X.data, X.indices, X.indptr, y, sample_weight, threads, seed) 158 | 159 | def fit_f(self, np.ndarray[double, ndim=1, mode='c'] X_data, 160 | np.ndarray[int, ndim=1, mode='c'] X_indices, 161 | np.ndarray[int, ndim=1, mode='c'] X_indptr, 162 | np.ndarray[double, ndim=1, mode='c'] y, 163 | sample_weight, 164 | int threads, int seed): 165 | cdef double alpha= self.alpha, L2= self.L2, e_noise= self.e_noise, e, e_total= 0, e_clip= self.e_clip, abs_e 166 | cdef double *w0= &self.w0[0], *w1= &self.w1[0], *z= &self.z[0], *c0= &self.c0[0], *c1= &self.c1[0] 167 | cdef double *ys= y.data 168 | cdef unsigned int lenn, D= self.D, D_nn= self.D_nn, ptr, row_count= X_indptr.shape[0]-1, row, \ 169 | inv_link= self.inv_link, j=0, jj 170 | cdef int* inds, indptr 171 | cdef double* vals 172 | rand= np.random.Generator(randomgen.xoroshiro128.Xoroshiro128(seed= self.seed)) 173 | for iter in range(self.iters): 174 | e_total= 0.0 175 | for row in range(row_count): 176 | ptr= X_indptr[row] 177 | lenn= X_indptr[row+1]-ptr 178 | inds= X_indices.data+ptr 179 | vals= X_data.data+ptr 180 | e= inv_link_f(predict_single(inds, vals, lenn, D, D_nn, w0, w1, z, threads), self.inv_link) -ys[row] 181 | abs_e= fabs(e) 182 | e_total+= abs_e 183 | e += (rand.random() - 0.5) * e_noise 184 | if abs_e> e_clip: 185 | if e>0: e= e_clip 186 | else: e= -e_clip 187 | if sample_weight is not None: 188 | e*= sample_weight[row] 189 | update_single(inds, vals, lenn, D, D_nn, e, alpha, L2, w0, w1, z, c0, c1, threads) 190 | if self.verbose > 0: print "Total e:", e_total 191 | return self 192 | 193 | def predict_layer(self, X, int layer, int threads= 0): 194 | if threads==0: threads= self.threads 195 | if type(X) != ssp.csr.csr_matrix: X= 
ssp.csr_matrix(X, dtype=np.float64) 196 | return self.predict_layer_f(X.data, X.indices, X.indptr, layer, threads) 197 | 198 | def predict_layer_f(self, np.ndarray[double, ndim=1, mode='c'] X_data, 199 | np.ndarray[int, ndim=1, mode='c'] X_indices, 200 | np.ndarray[int, ndim=1, mode='c'] X_indptr, int layer, int threads): 201 | cdef double alpha= self.alpha, L2= self.L2 202 | p = np.zeros(((X_indptr.shape[0] - 1), self.D_nn), dtype=np.float64) 203 | cdef double *w0= &self.w0[0], *w1= &self.w1[0], *z= &self.z[0] 204 | cdef double[:,:] pp= p 205 | cdef unsigned int lenn, D= self.D, D_nn= self.D_nn, row_count= X_indptr.shape[0]-1, row, ptr 206 | for row in range(row_count): 207 | ptr= X_indptr[row] 208 | lenn= X_indptr[row + 1] - ptr 209 | inds= X_indices.data + ptr 210 | vals= X_data.data + ptr 211 | predict_single(inds, vals, lenn, D, D_nn, w0, w1, &pp[row][0], threads) 212 | return p 213 | 214 | def __getstate__(self): 215 | return (self.alpha, 216 | self.L2, 217 | self.e_noise, 218 | self.e_clip, 219 | self.init_nn, 220 | self.D, 221 | self.D_nn, 222 | self.iters, 223 | self.threads, 224 | np.asarray(self.w0), 225 | np.asarray(self.w1), 226 | np.asarray(self.z), 227 | np.asarray(self.c0), 228 | np.asarray(self.c1), 229 | self.inv_link, 230 | self.seed, 231 | self.verbose) 232 | 233 | def __setstate__(self, params): 234 | (self.alpha, 235 | self.L2, 236 | self.e_noise, 237 | self.e_clip, 238 | self.init_nn, 239 | self.D, 240 | self.D_nn, 241 | self.iters, 242 | self.threads, 243 | self.w0, 244 | self.w1, 245 | self.z, 246 | self.c0, 247 | self.c1, 248 | self.inv_link, 249 | self.seed, 250 | self.verbose)= params 251 | -------------------------------------------------------------------------------- /wordbatch/models/nn_relu_h2.pyx: -------------------------------------------------------------------------------- 1 | # cython: boundscheck=False, wraparound=False, cdivision=True 2 | import numpy as np 3 | import gzip 4 | cimport cython 5 | from cpython cimport array 6 | import scipy.sparse as ssp 7 | cimport numpy as np 8 | from cython.parallel import prange 9 | from libc.math cimport exp, log, fmax, fmin, sqrt, fabs 10 | import multiprocessing 11 | import sys 12 | import randomgen 13 | 14 | np.import_array() 15 | 16 | cdef double inv_link_f(double e, int inv_link) nogil: 17 | if inv_link==1: return 1.0 / (1.0 + exp(-fmax(fmin(e, 35.0), -35.0))) #Sigmoid + logloss 18 | return e 19 | 20 | cdef double predict_single(int* inds, double* vals, int lenn, int D, int D_nn, int D_nn2, 21 | double* w0, double* w1, double* w2, double* z1, double* z2, int threads) nogil: 22 | cdef int i, ii, j, k, DD_nn= D*D_nn, DD_nn2= D_nn*D_nn2 23 | cdef double p, v, z1j, z2k 24 | p= w2[D_nn2] 25 | for k in prange(D_nn2, nogil=True, num_threads= threads): 26 | #for k in range(D_nn2): 27 | z2k= w1[DD_nn2+k] 28 | for j in range(D_nn): 29 | z1j= w0[DD_nn+j] 30 | for ii in range(lenn): z1j= z1j+ w0[inds[ii]*D_nn+j] * vals[ii] 31 | if z1j<0: z1[j]= 0 32 | else: 33 | z1[j] = z1j 34 | z2k= z2k+ w1[j * D_nn2+k] * z1j 35 | if z2k<0: z2[k]= 0 36 | else: 37 | z2[k]= z2k 38 | p+= w2[k] * z2k 39 | return p 40 | 41 | cdef void update_single(int* inds, double* vals, int lenn, int D, int D_nn, int D_nn2, 42 | double e, double alpha, double L2, double* w0, double* w1, double* w2, double* z1, 43 | double* z2, double* c0, double* c1, double* c2, int threads) nogil: 44 | cdef int i, ii, j, k, DD_nn= D*D_nn, DD_nn2= D_nn*D_nn2 45 | cdef double dldy= e, dldz1, dldz2, dldw0, dldw1, dldw2 46 | w2[D_nn2]-= (dldy + L2 *w2[D_nn2]) * alpha 47 | 
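# Backpropagate through both hidden layers: each weight takes an L2-regularized gradient step
# scaled by an adaptive learning rate alpha/(sqrt(c)+1), where c0/c1/c2 accumulate absolute
# gradients per input feature and per hidden unit; units with zero (ReLU-clipped) activations are skipped.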
for k in range(D_nn2): 48 | if z2[k]==0: continue 49 | dldw2= dldy * z2[k] 50 | w2[k]-= (dldw2 + L2 * w2[k]) * alpha / (sqrt(c2[k])+1) 51 | dldz2= dldy * w2[k] 52 | w1[DD_nn2+k]-= (dldz2 + L2 * w1[DD_nn2 + k]) * alpha / (sqrt(c2[k])+1) 53 | for j in range(D_nn): 54 | if z1[j]==0: continue 55 | dldw1= dldz2 * z1[j] 56 | w1[j*D_nn2+k]-= (dldw1 + L2 * w1[j]) * alpha / (sqrt(c1[j])+1) 57 | dldz1= dldz2 * w1[j*D_nn2+k] 58 | w0[DD_nn+j]-= (dldz1 + L2 * w0[DD_nn+j]) * alpha / (sqrt(c1[j])+1) 59 | for ii in prange(lenn, nogil=True, num_threads= threads): 60 | i= inds[ii] 61 | dldw0= dldz1 * vals[ii] 62 | w0[i*D_nn+j]-= (dldw0 + L2 * w0[i * D_nn + j]) * alpha/(sqrt(c0[i])+1) 63 | c0[i] += fabs(dldw0) 64 | c1[j] += fabs(dldw1) 65 | c2[k] += fabs(dldw2) 66 | 67 | cdef class NN_ReLU_H2: 68 | cdef const double[:] w0 69 | cdef const double[:] w1 70 | cdef const double[:] w2 71 | cdef const double[:] z1 72 | cdef const double[:] z2 73 | cdef const double[:] c0 74 | cdef const double[:] c1 75 | cdef const double[:] c2 76 | 77 | cdef unsigned int threads 78 | cdef unsigned int iters 79 | cdef int D 80 | cdef int D_nn 81 | cdef int D_nn2 82 | cdef double init_nn 83 | 84 | cdef double L2 85 | cdef double alpha 86 | cdef double e_noise 87 | cdef double e_clip 88 | cdef int inv_link 89 | cdef int seed 90 | cdef int verbose 91 | 92 | def __init__(self, 93 | double alpha=0.1, 94 | double L2=0.00001, 95 | int D=0, 96 | int D_nn=12, 97 | int D_nn2=4, 98 | double init_nn=0.01, 99 | double e_noise=0.001, 100 | double e_clip=1.0, 101 | unsigned int iters=3, 102 | inv_link= "identity", 103 | int threads= 0, 104 | int seed= 0, 105 | int verbose=1): 106 | 107 | self.alpha= alpha 108 | self.L2= L2 109 | self.e_noise= e_noise 110 | self.D= D 111 | self.D_nn= D_nn 112 | self.D_nn2= D_nn2 113 | self.init_nn= init_nn 114 | self.e_noise = e_noise 115 | self.e_clip = e_clip 116 | self.iters= iters 117 | if threads==0: threads= multiprocessing.cpu_count()-1 118 | self.threads= threads 119 | if inv_link=="sigmoid": self.inv_link= 1 120 | if inv_link=="identity": self.inv_link= 0 121 | self.seed = seed 122 | self.verbose = verbose 123 | self.reset() 124 | 125 | def reset(self): 126 | init_nn= self.init_nn 127 | D= self.D 128 | D_nn = self.D_nn 129 | D_nn2 = self.D_nn2 130 | rand= np.random.Generator(randomgen.xoroshiro128.Xoroshiro128(seed= self.seed)) 131 | self.w0 = (rand.random((D + 1) * D_nn) - 0.5) * init_nn 132 | self.w1 = (rand.random((D_nn + 1) * D_nn2) - 0.3) * init_nn 133 | self.w2 = (rand.rand(D_nn2 + 1) - 0.5) * init_nn 134 | self.z1= np.zeros((D_nn,), dtype=np.float64) 135 | self.z2= np.zeros((D_nn2,), dtype=np.float64) 136 | self.c0= np.zeros((D,), dtype=np.float64) 137 | self.c1= np.zeros((D_nn,), dtype=np.float64) 138 | self.c2= np.zeros((D_nn2,), dtype=np.float64) 139 | 140 | def predict(self, X, int threads= 0): 141 | if threads==0: threads= self.threads 142 | if type(X) != ssp.csr.csr_matrix: X= ssp.csr_matrix(X, dtype=np.float64) 143 | if X.shape[1] != self.D: 144 | print("Dimension mismatch! 
self.D=", self.D, "X.shape[1]=", X.shape[1]) 145 | return self.predict_f(X.data, X.indices, X.indptr, threads) 146 | 147 | def predict_f(self, np.ndarray[double, ndim=1, mode='c'] X_data, 148 | np.ndarray[int, ndim=1, mode='c'] X_indices, 149 | np.ndarray[int, ndim=1, mode='c'] X_indptr, int threads): 150 | cdef double alpha= self.alpha, L2= self.L2 151 | p= np.zeros(X_indptr.shape[0]-1, dtype= np.float64) 152 | cdef double *w0= &self.w0[0], *w1= &self.w1[0], *w2= &self.w2[0], *z1= &self.z1[0], *z2= &self.z2[0] 153 | cdef double[:] pp= p 154 | cdef int lenn, D= self.D, D_nn= self.D_nn, D_nn2= self.D_nn2, row_count= X_indptr.shape[0]-1, row, ptr 155 | for row in range(row_count): 156 | ptr= X_indptr[row] 157 | lenn= X_indptr[row + 1] - ptr 158 | inds= X_indices.data + ptr 159 | vals= X_data.data + ptr 160 | pp[row]= inv_link_f(predict_single(inds, vals, lenn, D, D_nn, D_nn2, w0, w1, w2, z1, z2, threads), \ 161 | self.inv_link) 162 | return p 163 | 164 | def partial_fit(self, X, y, int threads = 0, int seed = 0): 165 | return self.fit(X, y, threads=threads, seed=seed, reset=False) 166 | 167 | def fit(self, X, y, int threads= 0, int seed= 0, reset=True): 168 | if threads == 0: threads= self.threads 169 | if type(X) != ssp.csr.csr_matrix: X = ssp.csr_matrix(X, dtype=np.float64) 170 | if reset or self.D==0: 171 | self.D= X.shape[1] 172 | self.reset() 173 | elif X.shape[1] != self.D: 174 | print("Dimension mismatch! self.D=", self.D, "X.shape[1]=", X.shape[1]) 175 | if type(y) != np.array: y = np.array(y, dtype=np.float64) 176 | return self.fit_f(X.data, X.indices, X.indptr, y, threads, seed) 177 | 178 | def fit_f(self, np.ndarray[double, ndim=1, mode='c'] X_data, 179 | np.ndarray[int, ndim=1, mode='c'] X_indices, 180 | np.ndarray[int, ndim=1, mode='c'] X_indptr, 181 | np.ndarray[double, ndim=1, mode='c'] y, int threads, int seed): 182 | cdef double alpha= self.alpha, L2= self.L2, e_noise= self.e_noise, e, e_total= 0, e_clip= self.e_clip, abs_e 183 | cdef double *w0= &self.w0[0], *w1= &self.w1[0], *w2= &self.w2[0], *z1= &self.z1[0], *z2= &self.z2[0], \ 184 | *c0= &self.c0[0], *c1= &self.c1[0], *c2= &self.c2[0] 185 | cdef double *ys= y.data 186 | cdef unsigned int lenn, D= self.D, D_nn= self.D_nn, D_nn2= self.D_nn2, ptr, row_count= X_indptr.shape[0]-1, \ 187 | row, inv_link= self.inv_link, j=0, jj 188 | cdef int* inds, indptr 189 | cdef double* vals 190 | rand= np.random.Generator(randomgen.xoroshiro128.Xoroshiro128(seed= self.seed)) 191 | for iter in range(self.iters): 192 | e_total= 0.0 193 | for row in range(row_count): 194 | ptr= X_indptr[row] 195 | lenn= X_indptr[row+1]-ptr 196 | inds= X_indices.data+ptr 197 | vals= X_data.data+ptr 198 | e= inv_link_f(predict_single(inds, vals, lenn, D, D_nn, D_nn2, w0, w1, w2, z1, z2, threads), \ 199 | self.inv_link) -ys[row] 200 | abs_e= fabs(e) 201 | e_total+= abs_e 202 | e += (rand.rand() - 0.5) * e_noise 203 | if abs_e> e_clip: 204 | if e>0: e= e_clip 205 | else: e= -e_clip 206 | update_single(inds, vals, lenn, D, D_nn, D_nn2, e, alpha, L2, w0, w1, w2, z1, z2, c0, c1, c2, threads) 207 | if self.verbose > 0: print "Total e:", e_total 208 | return self 209 | 210 | def predict_layer(self, X, int layer, int threads= 0): 211 | if threads==0: threads= self.threads 212 | if type(X) != ssp.csr.csr_matrix: X= ssp.csr_matrix(X, dtype=np.float64) 213 | return self.predict_layer_f(X.data, X.indices, X.indptr, layer, threads) 214 | 215 | def predict_layer_f(self, np.ndarray[double, ndim=1, mode='c'] X_data, 216 | np.ndarray[int, ndim=1, mode='c'] X_indices, 217 | 
np.ndarray[int, ndim=1, mode='c'] X_indptr, int layer, int threads): 218 | cdef double alpha= self.alpha, L2= self.L2 219 | cdef double *w0= &self.w0[0], *w1= &self.w1[0], *w2= &self.w2[0], *z1= &self.z1[0], *z2= &self.z2[0] 220 | cdef unsigned int lenn, D= self.D, D_nn= self.D_nn, D_nn2= self.D_nn2, row_count= X_indptr.shape[0]-1, row, ptr 221 | if layer==1: p= np.zeros(((X_indptr.shape[0]-1),self.D_nn), dtype= np.float64) 222 | else: p= np.zeros(((X_indptr.shape[0]-1),self.D_nn2), dtype= np.float64) 223 | cdef double[:,:] pp= p 224 | for row in range(row_count): 225 | ptr= X_indptr[row] 226 | lenn= X_indptr[row + 1] - ptr 227 | inds= X_indices.data + ptr 228 | vals= X_data.data + ptr 229 | if layer==1: predict_single(inds, vals, lenn, D, D_nn, D_nn2, w0, w1, w2, &pp[row][0], z2, threads) 230 | else: predict_single(inds, vals, lenn, D, D_nn, D_nn2, w0, w1, w2, z1, &pp[row][0], threads) 231 | return p 232 | 233 | def __getstate__(self): 234 | return (self.alpha, 235 | self.L2, 236 | self.e_noise, 237 | self.e_clip, 238 | self.init_nn, 239 | self.D, 240 | self.D_nn, 241 | self.D_nn2, 242 | self.iters, 243 | self.threads, 244 | np.asarray(self.w1), 245 | np.asarray(self.w2), 246 | np.asarray(self.z1), 247 | np.asarray(self.z2), 248 | np.asarray(self.c0), 249 | np.asarray(self.c1), 250 | np.asarray(self.c2), 251 | self.inv_link, 252 | self.seed, 253 | self.verbose) 254 | 255 | def __setstate__(self, params): 256 | (self.alpha, 257 | self.L2, 258 | self.e_noise, 259 | self.e_clip, 260 | self.init_nn, 261 | self.D, 262 | self.D_nn, 263 | self.D_nn2, 264 | self.iters, 265 | self.threads, 266 | self.w1, 267 | self.w2, 268 | self.z1, 269 | self.z2, 270 | self.c0, 271 | self.c1, 272 | self.c2, 273 | self.inv_link, 274 | self.seed, 275 | self.verbose) = params 276 | -------------------------------------------------------------------------------- /wordbatch/pipelines/__init__.py: -------------------------------------------------------------------------------- 1 | from .apply import Apply, decorator_apply 2 | from .apply_batch import ApplyBatch, decorator_apply_batch 3 | from .apply_groupby import ApplyGroupBy, decorator_apply_groupby 4 | from .batch_transformer import BatchTransformer 5 | from .feature_union import FeatureUnion 6 | from .wordbatch import WordBatch -------------------------------------------------------------------------------- /wordbatch/pipelines/apply.py: -------------------------------------------------------------------------------- 1 | #!python 2 | from __future__ import with_statement 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from __future__ import print_function 6 | import pandas as pd 7 | import wordbatch.batcher 8 | 9 | def decorator_apply(func, batcher=None, cache=None, vectorize=None): 10 | def wrapper_func(*args, **kwargs): 11 | return Apply(func, args=args[1:], kwargs=kwargs, batcher=batcher, cache=cache, vectorize=vectorize)\ 12 | .transform(args[0]) 13 | return wrapper_func 14 | 15 | def batch_transform(args): 16 | f= args[1] 17 | f_args= args[2] 18 | f_kwargs= args[3] 19 | if args[5] is not None: 20 | from numba import vectorize 21 | return vectorize(args[5], fastmath=True)(f)(*zip(*args[0])) 22 | if args[4] is not None: 23 | from functools import lru_cache 24 | f= lru_cache(maxsize=args[4])(f) 25 | #Applying per DataFrame row is very slow, use ApplyBatch instead 26 | if isinstance(args[0], pd.DataFrame): return args[0].apply(lambda x: f(x, *f_args, **f_kwargs), axis=1) 27 | return [f(row, *f_args, **f_kwargs) for row in 
args[0]] 28 | 29 | class Apply(object): 30 | #Applies a function to each row of a minibatch 31 | def __init__(self, function, batcher=None, args=[], kwargs={}, cache=None, vectorize=None): 32 | if batcher is None: self.batcher= wordbatch.batcher.Batcher() 33 | else: self.batcher= batcher 34 | self.function= function 35 | self.args= [args] 36 | self.kwargs= [kwargs] 37 | self.cache = [cache] 38 | self.vectorize = [vectorize] 39 | 40 | def fit(self, data, input_split= False, batcher= None): 41 | return self 42 | 43 | def fit_transform(self, data, input_split=False, merge_output=True, minibatch_size=None, batcher=None): 44 | return self.transform(data, input_split, merge_output, minibatch_size, batcher) 45 | 46 | def transform(self, data, input_split=False, merge_output=True, minibatch_size=None, batcher=None): 47 | if batcher is None: batcher = self.batcher 48 | return batcher.process_batches(batch_transform, data, 49 | [self.function] + self.args + self.kwargs + self.cache + self.vectorize, 50 | input_split=input_split, merge_output=merge_output, 51 | minibatch_size= minibatch_size) 52 | 53 | # import wordbatch.batcher as batcher 54 | # b= batcher.Batcher(minibatch_size=2)#, method="serial") 55 | # import numpy as np 56 | # a= Apply(np.power, b, [2],{}) 57 | # print(a.transform([1, 2, 3, 4])) -------------------------------------------------------------------------------- /wordbatch/pipelines/apply_batch.py: -------------------------------------------------------------------------------- 1 | #!python 2 | from __future__ import with_statement 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from __future__ import print_function 6 | import wordbatch.batcher 7 | 8 | def decorator_apply_batch(func, batcher=None): 9 | def wrapper_func(*args, **kwargs): 10 | return ApplyBatch(func, args=args[1:], kwargs= kwargs, batcher= batcher).transform(args[0]) 11 | return wrapper_func 12 | 13 | def batch_transform(args): 14 | f= args[1] 15 | f_args= args[2] 16 | f_kwargs= args[3] 17 | return f(args[0], *f_args, **f_kwargs) 18 | 19 | class ApplyBatch(object): 20 | #Applies a function to the entire minibatch. Use this for example on Pandas dataframes, to avoid per-row overhead. 21 | #Function needs to be applicable to the array/list of values! 
22 | #If not, modify/wrap the function to process a list, or use Apply 23 | def __init__(self, function, batcher=None, args=[], kwargs={}): 24 | if batcher is None: self.batcher = wordbatch.batcher.Batcher() 25 | else: self.batcher = batcher 26 | self.function= function 27 | self.args= [args] 28 | self.kwargs= [kwargs] 29 | 30 | def fit(self, data, input_split= False): 31 | return self 32 | 33 | def fit_transform(self, data, input_split=False, merge_output=True, minibatch_size=None, batcher=None): 34 | return self.transform(data, input_split, merge_output, minibatch_size, batcher) 35 | 36 | def transform(self, data, input_split=False, merge_output=True, minibatch_size=None, batcher=None): 37 | if batcher is None: batcher = self.batcher 38 | return batcher.process_batches(batch_transform, data, [self.function] + self.args + self.kwargs, 39 | input_split=input_split, merge_output=merge_output, 40 | minibatch_size= minibatch_size) 41 | 42 | # import wordbatch.batcher as batcher 43 | # b= batcher.Batcher(minibatch_size=2)#, method="serial") 44 | # import numpy as np 45 | # a= ApplyBatch(np.power, b, [2],{}) 46 | # print(a.transform([1, 2, 3, 4])) -------------------------------------------------------------------------------- /wordbatch/pipelines/apply_groupby.py: -------------------------------------------------------------------------------- 1 | #!python 2 | import pandas as pd 3 | from wordbatch.pipelines import Apply 4 | import wordbatch.batcher 5 | 6 | def decorator_apply_groupby(func, group, batcher=None, rows_per_bin=200, cache=None, vectorize=None): 7 | def wrapper_func(*args, **kwargs): 8 | return ApplyGroupBy(func, args=args[1:], kwargs=kwargs, group=group, rows_per_bin=rows_per_bin, 9 | batcher=batcher, cache=cache, vectorize=vectorize).transform(args[0]) 10 | return wrapper_func 11 | 12 | class ApplyGroupBy(object): 13 | def __init__(self, function, group, batcher=None, rows_per_bin= 200, cache=None, vectorize=None, args=[], 14 | kwargs={}): 15 | if batcher is None: self.batcher= wordbatch.batcher.Batcher() 16 | else: self.batcher= batcher 17 | self.function= function 18 | self.group= group 19 | self.rows_per_bin = rows_per_bin 20 | self.cache= cache 21 | self.vectorize= vectorize 22 | self.args= [args] 23 | self.kwargs= [kwargs] 24 | 25 | def fit(self, data, input_split= False): 26 | return self 27 | 28 | def fit_transform(self, data, input_split= False, merge_output= True): 29 | return self.transform(data, input_split, merge_output) 30 | 31 | def transform(self, data, input_split= False, merge_output= True): 32 | bin_ids = data[self.group].unique() 33 | group_bins= {x:1 for x in bin_ids} if len(bin_ids) <= self.rows_per_bin else \ 34 | {x[0]: x[1] for x in zip(bin_ids, pd.qcut(bin_ids, len(bin_ids) // self.rows_per_bin))} 35 | group_bin_col = data[self.group].map(group_bins) 36 | bin_ids, groups = zip(*data.groupby(group_bin_col, as_index=False)) 37 | t= [x for x in Apply(self.function, self.batcher, *self.args, *self.kwargs, self.cache, 38 | self.vectorize).transform(groups, input_split, merge_output) 39 | if len(x) > 0] 40 | try: 41 | t= pd.concat(t, sort=False) # t is Series or DataFrame 42 | except: 43 | t= [item for sublist in t for item in sublist] # t is some iterable 44 | return t -------------------------------------------------------------------------------- /wordbatch/pipelines/batch_transformer.py: -------------------------------------------------------------------------------- 1 | #!python 2 | from __future__ import with_statement 3 | from __future__ import division 
4 | from __future__ import absolute_import 5 | from __future__ import print_function 6 | import wordbatch.batcher 7 | 8 | def batch_transform(args): 9 | return args[1].transform(args[0]) 10 | 11 | class BatchTransformer(object): 12 | def __init__(self, transformer, call_fit=True, batcher=None): 13 | if batcher is None: self.batcher = wordbatch.batcher.Batcher() 14 | else: self.batcher = batcher 15 | self.transformer= transformer 16 | self.call_fit= call_fit 17 | 18 | def fit(self, data, input_split=False, batcher=None): 19 | if batcher is None: batcher = self.batcher 20 | if self.call_fit: 21 | if input_split: self.transformer.fit(batcher.merge_batches(batcher.collect_batches(data))) 22 | else: self.transformer.fit(data) 23 | return self 24 | 25 | def fit_transform(self, data, input_split=False, merge_output=True, minibatch_size=None, batcher=None): 26 | if self.call_fit: self.fit(data, input_split= input_split) 27 | return self.transform(data, input_split, merge_output, minibatch_size= minibatch_size, batcher=batcher) 28 | 29 | def transform(self, data, input_split=False, merge_output=True, minibatch_size=None, batcher=None): 30 | if batcher is None: batcher = self.batcher 31 | return batcher.process_batches(batch_transform, data, [self.transformer], 32 | input_split=input_split, merge_output=merge_output, 33 | minibatch_size= minibatch_size) 34 | -------------------------------------------------------------------------------- /wordbatch/pipelines/feature_union.py: -------------------------------------------------------------------------------- 1 | from multiprocessing.pool import Pool 2 | 3 | import numpy as np 4 | from scipy import sparse 5 | from sklearn.base import TransformerMixin 6 | from sklearn.pipeline import FeatureUnion, _fit_one, _fit_transform_one, _transform_one, _name_estimators 7 | from sklearn.utils.metaestimators import _BaseComposition 8 | from wordbatch.pipelines import Apply 9 | 10 | def fit_one(args): 11 | X, y, transformer, fit_params = args 12 | return transformer.fit(X, y, **fit_params) 13 | 14 | def transform_one(args): 15 | X, transformer= args 16 | return transformer.transform(X) 17 | 18 | def fit_transform_one(args): 19 | X, y, transformer, fit_params = args 20 | return transformer.fit_transform(X, y, **fit_params), transformer 21 | 22 | class FeatureUnion(_BaseComposition, TransformerMixin): 23 | """Concatenates results of multiple transformer objects. 24 | 25 | This estimator applies a list of transformer objects in parallel to the 26 | input data, then concatenates the results. This is useful to combine 27 | several feature extraction mechanisms into a single transformer. 28 | 29 | Parameters of the transformers may be set using its name and the parameter 30 | name separated by a '__'. A transformer may be replaced entirely by 31 | setting the parameter with its name to another transformer, 32 | or removed by setting to 'drop' or ``None``. 33 | 34 | Read more in the :ref:`User Guide `. 35 | 36 | Parameters 37 | ---------- 38 | transformer_list : list of (string, transformer) tuples 39 | List of transformer objects to be applied to the data. The first 40 | half of each tuple is the name of the transformer. 41 | 42 | n_jobs : int or None, optional (default=None) 43 | Number of jobs to run in parallel. 44 | ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. 45 | ``-1`` means using all processors. See :term:`Glossary ` 46 | for more details. 
47 | 48 | transformer_weights : dict, optional 49 | Multiplicative weights for features per transformer. 50 | Keys are transformer names, values the weights. 51 | 52 | verbose : boolean, optional(default=False) 53 | If True, the time elapsed while fitting each transformer will be 54 | printed as it is completed. 55 | 56 | See also 57 | -------- 58 | sklearn.pipeline.make_union : convenience function for simplified 59 | feature union construction. 60 | 61 | Examples 62 | -------- 63 | >>> from sklearn.pipeline import FeatureUnion 64 | >>> from sklearn.decomposition import PCA, TruncatedSVD 65 | >>> union = FeatureUnion([("pca", PCA(n_components=1)), 66 | ... ("svd", TruncatedSVD(n_components=2))]) 67 | >>> X = [[0., 1., 3], [2., 2., 5]] 68 | >>> union.fit_transform(X) 69 | array([[ 1.5 , 3.0..., 0.8...], 70 | [-1.5 , 5.7..., -0.4...]]) 71 | """ 72 | _required_parameters = ["transformer_list"] 73 | 74 | def __init__(self, transformer_list, transformer_weights=None, batcher=None, concatenate=True): 75 | self.transformer_list = transformer_list 76 | self.transformer_weights = transformer_weights 77 | self._validate_transformers() 78 | self.batcher = batcher 79 | self.concatenate= concatenate 80 | 81 | def get_params(self, deep=True): 82 | """Get parameters for this estimator. 83 | 84 | Parameters 85 | ---------- 86 | deep : boolean, optional 87 | If True, will return the parameters for this estimator and 88 | contained subobjects that are estimators. 89 | 90 | Returns 91 | ------- 92 | params : mapping of string to any 93 | Parameter names mapped to their values. 94 | """ 95 | return self._get_params('transformer_list', deep=deep) 96 | 97 | def set_params(self, **kwargs): 98 | """Set the parameters of this estimator. 99 | 100 | Valid parameter keys can be listed with ``get_params()``. 101 | 102 | Returns 103 | ------- 104 | self 105 | """ 106 | self._set_params('transformer_list', **kwargs) 107 | return self 108 | 109 | def _validate_transformers(self): 110 | names, transformers = zip(*self.transformer_list) 111 | 112 | # validate names 113 | self._validate_names(names) 114 | 115 | # validate estimators 116 | for t in transformers: 117 | if t is None or t == 'drop': 118 | continue 119 | if (not (hasattr(t, "fit") or hasattr(t, "fit_transform")) or not 120 | hasattr(t, "transform")): 121 | raise TypeError("All estimators should implement fit and " 122 | "transform. '%s' (type %s) doesn't" % 123 | (t, type(t))) 124 | 125 | def _iter(self): 126 | """ 127 | Generate (name, trans, weight) tuples excluding None and 128 | 'drop' transformers. 129 | """ 130 | get_weight = (self.transformer_weights or {}).get 131 | return ((name, trans, get_weight(name)) 132 | for name, trans in self.transformer_list 133 | if trans is not None and trans != 'drop') 134 | 135 | def get_feature_names(self): 136 | """Get feature names from all transformers. 137 | 138 | Returns 139 | ------- 140 | feature_names : list of strings 141 | Names of the features produced by transform. 142 | """ 143 | feature_names = [] 144 | for name, trans, weight in self._iter(): 145 | if not hasattr(trans, 'get_feature_names'): 146 | raise AttributeError("Transformer %s (type %s) does not " 147 | "provide get_feature_names." 148 | % (str(name), type(trans).__name__)) 149 | feature_names.extend([name + "__" + f for f in 150 | trans.get_feature_names()]) 151 | return feature_names 152 | 153 | def fit(self, X, y=None, **fit_params): 154 | """Fit all transformers using X. 
155 | 156 | Parameters 157 | ---------- 158 | X : iterable or array-like, depending on transformers 159 | Input data, used to fit transformers. 160 | 161 | y : array-like, shape (n_samples, ...), optional 162 | Targets for supervised learning. 163 | 164 | Returns 165 | ------- 166 | self : FeatureUnion 167 | This estimator 168 | """ 169 | self.transformer_list = list(self.transformer_list) 170 | self._validate_transformers() 171 | paral_params = [[X[t['col_pick']] if hasattr(t, 'col_pick') else X, y, t, fit_params] 172 | for _, t, _ in self._iter()] 173 | transformers= Apply(fit_one, self.batcher).transform(paral_params) 174 | # with Pool(self.n_jobs) as pool: 175 | # transformers = pool.starmap(_fit_one, 176 | # ((trans, X[trans['col_pick']] if hasattr(trans, 'col_pick') else X, y) for _, trans, _ in self._iter())) 177 | self._update_transformer_list(transformers) 178 | return self 179 | 180 | def fit_transform(self, X, y=None, **fit_params): 181 | """Fit all transformers, transform the data and concatenate results. 182 | 183 | Parameters 184 | ---------- 185 | X : iterable or array-like, depending on transformers 186 | Input data to be transformed. 187 | 188 | y : array-like, shape (n_samples, ...), optional 189 | Targets for supervised learning. 190 | 191 | Returns 192 | ------- 193 | X_t : array-like or sparse matrix, shape (n_samples, sum_n_components) 194 | hstack of results of transformers. sum_n_components is the 195 | sum of n_components (output dimension) over transformers. 196 | """ 197 | self._validate_transformers() 198 | paral_params = [[X[t['col_pick']] if hasattr(t, 'col_pick') else X, y, t, fit_params] 199 | for _, t, _ in self._iter()] 200 | result = Apply(fit_transform_one, self.batcher).transform(paral_params) 201 | if not result: 202 | # All transformers are None 203 | return np.zeros((X.shape[0], 0)) 204 | Xs, transformers = zip(*result) 205 | self._update_transformer_list(transformers) 206 | if self.concatenate: 207 | if any(sparse.issparse(f) for f in Xs): 208 | Xs = sparse.hstack(Xs).tocsr() 209 | else: 210 | Xs = np.hstack(Xs) 211 | return Xs 212 | 213 | def transform(self, X): 214 | """Transform X separately by each transformer, concatenate results. 215 | 216 | Parameters 217 | ---------- 218 | X : iterable or array-like, depending on transformers 219 | Input data to be transformed. 220 | 221 | Returns 222 | ------- 223 | X_t : array-like or sparse matrix, shape (n_samples, sum_n_components) 224 | hstack of results of transformers. sum_n_components is the 225 | sum of n_components (output dimension) over transformers. 226 | """ 227 | paral_params = [[X[t['col_pick']] if hasattr(t, 'col_pick') else X, t] for _, t, _ in self._iter()] 228 | Xs = Apply(transform_one, self.batcher).transform(paral_params) 229 | if not Xs: 230 | # All transformers are None 231 | return np.zeros((X.shape[0], 0)) 232 | if self.concatenate: 233 | if any(sparse.issparse(f) for f in Xs): 234 | Xs = sparse.hstack(Xs).tocsr() 235 | else: 236 | Xs = np.hstack(Xs) 237 | return Xs 238 | 239 | def _update_transformer_list(self, transformers): 240 | transformers = iter(transformers) 241 | self.transformer_list[:] = [(name, None if old is None or old == 'drop' else next(transformers)) 242 | for name, old in self.transformer_list 243 | ] 244 | 245 | 246 | def make_union(*transformers, **kwargs): 247 | """Construct a FeatureUnion from the given transformers. 248 | 249 | This is a shorthand for the FeatureUnion constructor; it does not require, 250 | and does not permit, naming the transformers. 
Instead, they will be given 251 | names automatically based on their types. It also does not allow weighting. 252 | 253 | Parameters 254 | ---------- 255 | *transformers : list of estimators 256 | 257 | n_jobs : int, optional 258 | Number of jobs to run in parallel (default 1). 259 | 260 | Returns 261 | ------- 262 | f : FeatureUnion 263 | 264 | Examples 265 | -------- 266 | >>> from sklearn.decomposition import PCA, TruncatedSVD 267 | >>> from sklearn.pipeline import make_union 268 | >>> make_union(PCA(), TruncatedSVD()) # doctest: +NORMALIZE_WHITESPACE 269 | FeatureUnion(n_jobs=1, 270 | transformer_list=[('pca', 271 | PCA(copy=True, iterated_power='auto', 272 | n_components=None, random_state=None, 273 | svd_solver='auto', tol=0.0, whiten=False)), 274 | ('truncatedsvd', 275 | TruncatedSVD(algorithm='randomized', 276 | n_components=2, n_iter=5, 277 | random_state=None, tol=0.0))], 278 | transformer_weights=None) 279 | """ 280 | n_jobs = kwargs.pop('n_jobs', 1) 281 | concatenate = kwargs.pop('concatenate', True) 282 | if kwargs: 283 | # We do not currently support `transformer_weights` as we may want to 284 | # change its type spec in make_union 285 | raise TypeError('Unknown keyword arguments: "{}"' 286 | .format(list(kwargs.keys())[0])) 287 | return FeatureUnion(_name_estimators(transformers), n_jobs= n_jobs, concatenate= concatenate) 288 | -------------------------------------------------------------------------------- /wordbatch/pipelines/wordbatch.py: -------------------------------------------------------------------------------- 1 | #!python 2 | from __future__ import with_statement 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from __future__ import print_function 6 | import os 7 | import wordbatch.batcher 8 | import wordbatch.pipelines 9 | import wordbatch.transformers 10 | 11 | class WordBatch(object): 12 | def __init__(self, normalize_text="new", dictionary="new", tokenizer=None, extractor=None, 13 | freeze=False, fit_extractor= False, batcher=None, verbose=0): 14 | self.verbose= verbose 15 | if batcher is None: self.batcher= wordbatch.batcher.Batcher(verbose=verbose) 16 | else: self.batcher= batcher 17 | 18 | if normalize_text is None: self.normalize_text= None 19 | elif normalize_text== "new": self.normalize_text= wordbatch.transformers.text_normalizer.TextNormalizer() 20 | elif callable(normalize_text): 21 | self.normalize_text= wordbatch.pipelines.apply.Apply(normalize_text, batcher=batcher) 22 | else: self.normalize_text= normalize_text 23 | 24 | if tokenizer is None: self.tokenizer= None 25 | else: self.tokenizer= tokenizer 26 | 27 | if dictionary is None: self.dictionary= None 28 | elif dictionary== "new": self.dictionary= wordbatch.transformers.dictionary.Dictionary() 29 | else: self.dictionary= dictionary 30 | 31 | if extractor is None: self.extractor= None 32 | else: self.extractor= wordbatch.pipelines.batch_transformer.BatchTransformer(extractor, batcher=batcher, 33 | call_fit=fit_extractor) 34 | if self.extractor is not None: 35 | if hasattr(self.extractor.transformer, "dictionary"): 36 | self.extractor.transformer.dictionary = self.dictionary 37 | self.freeze= freeze 38 | 39 | def reset(self): 40 | if self.dictionary is not None: self.dictionary.reset() 41 | return self 42 | 43 | def process(self, data, y=None, input_split=False, reset=True, update=True, minibatch_size=None, batcher= None): 44 | if batcher is None: batcher = self.batcher 45 | if reset: self.reset() 46 | if self.freeze: update= False 47 | 48 | if 
self.normalize_text is not None: 49 | if self.verbose > 0: print("Normalize text") 50 | data= self.normalize_text.transform(data, input_split=input_split, merge_output=False, 51 | minibatch_size= minibatch_size, batcher= self.batcher) 52 | input_split= True 53 | if self.tokenizer is not None: 54 | if self.verbose > 0: print("Tokenize text") 55 | if update: data= self.tokenizer.fit_transform(data, y=y, input_split=input_split, merge_output=False, 56 | reset=reset, minibatch_size= minibatch_size, 57 | batcher=self.batcher) 58 | else: data= self.tokenizer.transform(data, y=y, input_split=input_split, merge_output=False, 59 | minibatch_size= minibatch_size, batcher=self.batcher) 60 | input_split= True 61 | if self.dictionary is not None: 62 | if self.verbose > 0: print("Use dictionary") 63 | if update: data= self.dictionary.fit_transform(data, y=y, input_split=input_split, merge_output=False, 64 | reset=reset, minibatch_size= minibatch_size, batcher=self.batcher) 65 | else: data= self.dictionary.transform(data, y=y, input_split=input_split, merge_output=False, 66 | minibatch_size= minibatch_size, batcher=self.batcher) 67 | if self.verbose> 2: print("len(self.dictionary.dft):", len(self.dictionary.dft)) 68 | return data 69 | 70 | def fit(self, data, y=None, input_split=False, reset=True, minibatch_size=None, batcher= None): 71 | if batcher is None: batcher = self.batcher 72 | self.process(data, y, input_split, reset=reset, update= True, minibatch_size= minibatch_size, batcher= batcher) 73 | if self.extractor is not None: 74 | self.extractor.fit(data, input_split=input_split) 75 | return self 76 | 77 | def transform(self, data, y=None, cache_features=None, input_split=False, reset=False, update=False, 78 | minibatch_size=None, batcher= None): 79 | if batcher is None: batcher = self.batcher 80 | if cache_features is not None: 81 | if self.extractor is not None and os.path.exists(cache_features) and \ 82 | hasattr(self.extractor.transformer, "load_features"): 83 | return self.extractor.transformer.load_features(cache_features) 84 | if not(input_split): data= batcher.split_batches(data, minibatch_size= minibatch_size) 85 | data= self.process(data, y=y, input_split=True, reset=reset, update=update) 86 | if self.extractor is not None: 87 | if self.verbose > 0: print("Extract features") 88 | if update: data= self.extractor.fit_transform(data, input_split=True, merge_output=True) 89 | else: data= self.extractor.transform(data, input_split=True, merge_output=True) 90 | if cache_features is not None and hasattr(self.extractor.transformer, "load_features"): 91 | self.extractor.transformer.save_features(cache_features, data) 92 | return data 93 | else: 94 | return batcher.merge_batches(data, batcher.backend) 95 | 96 | def partial_fit(self, data, y=None, input_split=False, minibatch_size=None, batcher= None): 97 | if batcher is None: batcher = self.batcher 98 | return self.fit(data, y, input_split, reset=False, minibatch_size= minibatch_size, batcher=batcher) 99 | 100 | def fit_transform(self, data, y=None, cache_features=None, input_split=False, reset=True, minibatch_size=None, 101 | batcher=None): 102 | if batcher is None: batcher = self.batcher 103 | return self.transform(data, y, cache_features, input_split, reset, update=True, minibatch_size= minibatch_size, 104 | batcher=batcher) 105 | 106 | def partial_fit_transform(self, data, y=None, cache_features=None, input_split=False, minibatch_size=None, 107 | batcher= None): 108 | if batcher is None: batcher = self.batcher 109 | return self.transform(data, 
y, cache_features, input_split, reset=False, update=True, 110 | minibatch_size= minibatch_size, batcher=batcher) 111 | 112 | def __getstate__(self): 113 | return dict((k, v) for (k, v) in self.__dict__.items()) 114 | 115 | def __setstate__(self, params): 116 | for key in params: setattr(self, key, params[key]) 117 | -------------------------------------------------------------------------------- /wordbatch/transformers/__init__.py: -------------------------------------------------------------------------------- 1 | from .dictionary import Dictionary 2 | from .tokenizer import Tokenizer 3 | from .text_normalizer import TextNormalizer 4 | -------------------------------------------------------------------------------- /wordbatch/transformers/dictionary.py: -------------------------------------------------------------------------------- 1 | #!python 2 | from __future__ import with_statement 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from __future__ import print_function 6 | from collections import defaultdict 7 | import operator 8 | 9 | def batch_get_dfs(args): 10 | dft= defaultdict(int) 11 | for text in args[0]: 12 | if type(text)==str: 13 | for word in set(text.split(" ")): dft[word]+= 1 14 | else: 15 | dft[str(text)]+= 1 16 | return [dict(dft), len(args[0])] 17 | 18 | class Dictionary(object): 19 | def __init__(self, min_df=0, max_df=1.0, max_words= 10000000000000, freeze= False, encode=True, verbose=0): 20 | self.verbose = verbose 21 | self.freeze = freeze 22 | self.max_words = max_words 23 | self.min_df = min_df 24 | self.max_df = max_df 25 | self.encode= encode 26 | self.word2id= None 27 | self.reset() 28 | 29 | def reset(self): 30 | if self.encode: self.word2id = {} 31 | self.dft = {} 32 | self.doc_count = 0 33 | return self 34 | 35 | def get_pruning_dft(self, dft): 36 | sorted_dft = sorted(list(dft.items()), key=operator.itemgetter(1,0), reverse=True) 37 | if type(self.min_df) == type(1): min_df2 = self.min_df 38 | else: min_df2 = self.doc_count * self.min_df 39 | if type(self.max_df) == type(1): max_df2 = self.max_df 40 | else: max_df2 = self.doc_count * self.max_df 41 | return sorted_dft, min_df2, max_df2 42 | 43 | def prune_dictionary(self, max_words=None, min_df=None, max_df=None, re_encode= False, prune_dfs= True, 44 | set_max_words= True): 45 | #Prune dictionary. 
Optionally prune document frequency table as well 46 | if max_words is not None: self.max_words= max_words 47 | if min_df is not None: self.min_df= min_df 48 | if max_df is not None: self.max_df= max_df 49 | max_words= self.max_words 50 | word2id = self.word2id 51 | dft = self.dft 52 | sorted_dft, min_df2, max_df2 = self.get_pruning_dft(dft) 53 | c= 0 54 | #print(len(sorted_dft), len(self.word2id), len(self.raw_dft)) 55 | for word, df in sorted_dft: 56 | if word2id is not None: 57 | if word not in word2id: 58 | if re_encode: word2id[word]= -1 59 | else: continue 60 | c+= 1 61 | if c > max_words or df < min_df2 or df > max_df2: 62 | if prune_dfs: dft.pop(word) 63 | if word2id is not None: word2id.pop(word) 64 | elif re_encode: 65 | if word2id is not None: word2id[word]= c 66 | if set_max_words and word2id is not None: self.max_words= len(word2id) 67 | 68 | def fit(self, data, y=None, input_split= False, reset= False, minibatch_size=None, batcher= None): 69 | if reset: self.reset() 70 | if self.word2id is None: 71 | self.word2id = {} 72 | word2id= self.word2id 73 | if batcher is None: dfts, doc_counts= zip(*batch_get_dfs(data)) 74 | else: 75 | # import wordbatch.pipelines 76 | # dfts, doc_counts = zip(*batcher.collect_batches( 77 | # wordbatch.pipelines.apply_batch.ApplyBatch(get_dfs, batcher=batcher).transform( 78 | # data, input_split=input_split, merge_output=False) 79 | # )) 80 | dfts, doc_counts= zip(*batcher.collect_batches( 81 | batcher.process_batches(batch_get_dfs, data, [], input_split= input_split, merge_output=False, 82 | minibatch_size=minibatch_size))) 83 | self.doc_count += sum(doc_counts) 84 | dft = defaultdict(int, self.dft) 85 | for dft2 in dfts: 86 | for k, v in dft2.items(): dft[k] += v 87 | if word2id is not None: 88 | #Add entries. 
Online pruning only used to prevent inclusion into dictionary 89 | sorted_dft, min_df2, max_df2 = self.get_pruning_dft(dft) 90 | for word, df in sorted_dft: 91 | if len(word2id)>= self.max_words: break 92 | if dfmax_df2: continue 93 | if word in word2id: continue 94 | word2id[word] = len(word2id)+1 95 | if self.verbose>2: print("Add word to dictionary:", word, dft[word], word2id[word]) 96 | self.dft= dict(dft) 97 | return self 98 | 99 | def partial_fit(self, data, y=None, input_split=False, minibatch_size=None, batcher=None): 100 | return self.fit(data, y, input_split, reset=False, minibatch_size=minibatch_size, batcher=batcher) 101 | 102 | def fit_transform(self, data, y=None, input_split= False, merge_output= True, reset= True, minibatch_size=None, 103 | batcher= None): 104 | self.fit(data, y=y, input_split= input_split, reset=reset, minibatch_size= minibatch_size, batcher=batcher) 105 | return self.transform(data, y=y, input_split= input_split, merge_output= merge_output, batcher= None) 106 | 107 | def partial_fit_transform(self, data, y=None, input_split=False, minibatch_size=None, 108 | batcher=None): 109 | return self.transform(data, y, input_split, reset=False, update=True, batcher=batcher) 110 | 111 | def transform(self, data, y=None, input_split= False, merge_output= True, minibatch_size=None, 112 | batcher= None): 113 | if input_split and merge_output and batcher is not None: data= batcher.merge_batches(data) 114 | return data -------------------------------------------------------------------------------- /wordbatch/transformers/text_normalizer.py: -------------------------------------------------------------------------------- 1 | #!python 2 | from __future__ import with_statement 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from __future__ import print_function 6 | import re 7 | import wordbatch.batcher 8 | import wordbatch.pipelines 9 | import wordbatch.transformers 10 | 11 | def batch_transform(args): 12 | return args[1].batch_transform(args[0]) 13 | 14 | non_alphanums= re.compile(u'[^A-Za-z0-9]+') 15 | def default_normalize_text(text): 16 | return u" ".join([x for x in [y for y in non_alphanums.sub(' ', text).lower().strip().split(" ")] if len(x)>1]) 17 | 18 | class TextNormalizer(object): 19 | def __init__(self, normalize_text= default_normalize_text, freeze= False, verbose= 1): 20 | self.verbose= verbose 21 | self.freeze= freeze 22 | self.non_alphanums = re.compile(u'[^A-Za-z0-9]+') 23 | self.normalize_text= normalize_text 24 | self.reset() 25 | 26 | def reset(self): 27 | return self 28 | 29 | def batch_transform(self, data): return [self.normalize_text(text) for text in data] 30 | 31 | def transform(self, data, input_split=False, merge_output=True, minibatch_size= None, batcher=None): 32 | if batcher is None: batcher = wordbatch.batcher.Batcher() 33 | return batcher.process_batches(batch_transform, data, [self], input_split=input_split, 34 | merge_output=merge_output, minibatch_size= minibatch_size) 35 | 36 | def fit(self, data, y=None, input_split=False, merge_output=True, minibatch_size= None, batcher=None): 37 | return self 38 | 39 | def fit_transform(self, data, y=None, input_split=False, merge_output=True, 40 | minibatch_size= None, batcher=None): 41 | return self.transform(data, input_split, merge_output, minibatch_size, batcher) -------------------------------------------------------------------------------- /wordbatch/transformers/tokenizer.py: -------------------------------------------------------------------------------- 1 
| #!python 2 | from __future__ import with_statement 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from __future__ import print_function 6 | #from nltk.metrics import edit_distance 7 | import Levenshtein #python-Levenshtein 8 | from collections import defaultdict 9 | 10 | def batch_get_dfs(args): 11 | dft= defaultdict(int) 12 | for text in args[0]: 13 | for word in set(text.split(" ")): dft[word]+= 1 14 | return [dict(dft), len(args[0])] 15 | 16 | def correct_spelling(word, dft, corrections_index, spellcor_count, spellcor_dist): 17 | #T. Bocek, E. Hunt, B. Stiller: Fast Similarity Search in Large Dictionaries, 2007 18 | if dft.get(word, 0)>spellcor_count or len(word)<3: return word 19 | max_df= -100000000000000 20 | max_word= word 21 | spell_suggestions= get_deletions(word, spellcor_dist) 22 | candidates= {} 23 | for x in spell_suggestions: 24 | if x in corrections_index: 25 | for y in corrections_index[x]: candidates[y]= 1 26 | #for word2 in list(candidates.keys()): 27 | for word2 in candidates: 28 | #score= edit_distance(word, word2, True) 29 | score= Levenshtein.distance(word, word2) 30 | if score>spellcor_dist: continue 31 | #score = float(dft[word2]) / score 32 | score= dft[word2] 33 | #score = Levenshtein.jaro_winkler(word, word2) 34 | #score= dft[word2]*Levenshtein.jaro_winkler(word, word2) 35 | if score > max_df: 36 | max_df= score 37 | max_word= word2 38 | return max_word 39 | 40 | def batch_correct_spellings(args): 41 | corrs= args[1] 42 | return [u" ".join([corrs.get(word, word) for word in text.split(" ")]) for text in args[0]] 43 | 44 | def get_deletions(word, order): 45 | stack = {word: order} 46 | results = {} 47 | while len(stack) > 0: 48 | stack2 = {} 49 | for word2 in stack: 50 | order2 = stack[word2] - 1 51 | for x in range(len(word2)): 52 | if order2 != 0: stack2[word2[:x] + word2[x + 1:]] = order2 53 | results[word2[:x] + word2[x + 1:]] = 1 54 | stack = stack2 55 | return list(results.keys()) 56 | 57 | def make_corrections_index(dft, spellcor_count, spellcor_dist): 58 | dft2 = {w[0]: w[1] for w in dft.items() if w[1] > spellcor_count} 59 | corrections_index = defaultdict(list) 60 | for word in dft2: 61 | if len(word) > 15: continue 62 | for word2 in get_deletions(word, spellcor_dist): 63 | corrections_index[word2].append(word) 64 | return corrections_index 65 | 66 | class Tokenizer(object): 67 | def __init__(self, spellcor_count=0, spellcor_dist=2, stemmer= None, freeze= False, verbose= 0): 68 | self.verbose= verbose 69 | self.freeze= freeze 70 | if spellcor_count == 0: 71 | spellcor_dist = 0 72 | elif spellcor_dist == 0: 73 | spellcor_count = 0 74 | self.spellcor_count = spellcor_count 75 | self.spellcor_dist = spellcor_dist 76 | self.stemmer = stemmer 77 | self.reset() 78 | 79 | def reset(self): 80 | self.dft = {} 81 | self.doc_count = 0 82 | return self 83 | 84 | def fit(self, data, y= None, input_split= False, reset= True, minibatch_size=None, batcher= None): 85 | if reset: self.reset() 86 | if self.freeze: return self 87 | if batcher is None: dfts, doc_counts= zip(*batch_get_dfs(data)) 88 | else: 89 | dfts, doc_counts= zip(*batcher.collect_batches( 90 | batcher.process_batches(batch_get_dfs, data, [], input_split= input_split, merge_output=False, 91 | minibatch_size=minibatch_size))) 92 | self.doc_count += sum(doc_counts) 93 | dft = defaultdict(int, self.dft) 94 | for dft2 in dfts: 95 | for k, v in dft2.items(): dft[k] += v 96 | self.dft= dict(dft) 97 | return self 98 | 99 | def partial_fit(self, data, y=None, 
input_split=False, minibatch_size=None, batcher=None): 100 | return self.fit(data, y, input_split, reset=False, minibatch_size= minibatch_size, batcher=batcher) 101 | 102 | def fit_transform(self, data, y=None, input_split= False, merge_output= True, reset= True, 103 | minibatch_size=None, batcher= None): 104 | self.fit(data, y=y, input_split= input_split, reset=reset, minibatch_size= minibatch_size, batcher=batcher) 105 | return self.transform(data, y=y, input_split=input_split, merge_output=merge_output, 106 | minibatch_size=minibatch_size, batcher=batcher) 107 | 108 | def partial_fit_transform(self, data, y=None, input_split=False, minibatch_size=None, batcher=None): 109 | return self.transform(data, y, input_split, reset=False, update=True, minibatch_size=minibatch_size, 110 | batcher=batcher) 111 | 112 | def transform(self, X, y=None, input_split= False, merge_output= True, minibatch_size=None, batcher= None): 113 | if self.verbose > 0: print("Make word normalization dictionary") 114 | do_corrections= 1 if (self.spellcor_count > 0) and (self.spellcor_dist>0) else 0 115 | if not(do_corrections) and self.stemmer is None: return X 116 | if do_corrections: 117 | corrections_index= make_corrections_index(self.dft, self.spellcor_count, self.spellcor_dist) 118 | if self.stemmer is not None: 119 | if do_corrections: 120 | corrs = {word: self.stemmer.stem(correct_spelling( 121 | word, self.dft, corrections_index, self.spellcor_count, self.spellcor_dist)) for word in self.dft} 122 | else: corrs = {word: self.stemmer.stem(word) for word in self.dft} 123 | elif do_corrections: 124 | corrs = {word: correct_spelling( 125 | word, self.dft, corrections_index, self.spellcor_count, self.spellcor_dist) for word in self.dft} 126 | corrs = {key: value for key, value in corrs.items() if key != value} 127 | if self.verbose > 0: print("Make word normalizations") 128 | if batcher is None: return batch_correct_spellings(X) 129 | return batcher.process_batches(batch_correct_spellings, X, [corrs], input_split=input_split, 130 | merge_output=merge_output, minibatch_size= minibatch_size) 131 | --------------------------------------------------------------------------------
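The commented-out snippets at the bottom of apply.py and apply_batch.py sketch the intended entry points for the pipeline classes. The following is a minimal usage sketch along those lines, illustrative only and not a file in this repository; it assumes the package installs as wordbatch, so that wordbatch.batcher.Batcher and the Apply/ApplyBatch re-exports from wordbatch/pipelines/__init__.py import exactly as laid out above.

# Minimal usage sketch (illustrative only; mirrors the commented examples in apply.py / apply_batch.py).
import numpy as np
import wordbatch.batcher as batcher
from wordbatch.pipelines import Apply, ApplyBatch

b = batcher.Batcher(minibatch_size=2)   # split the input into minibatches of two rows
a = Apply(np.power, b, [2], {})         # calls np.power(row, 2) on each row of a minibatch
print(a.transform([1, 2, 3, 4]))        # element-wise squares; the exact container depends on the Batcher merge step

ab = ApplyBatch(np.power, b, [2], {})   # calls np.power(batch, 2) once per minibatch
print(ab.transform([1, 2, 3, 4]))       # same squares, with far less per-row overhead

As the comment in apply.py notes, applying a function per DataFrame row is very slow, so ApplyBatch (or the numba vectorize option of Apply) is the better fit whenever the function already accepts a whole array or frame.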