├── .dockerignore ├── .gitignore ├── Dockerfile ├── LICENSE.txt ├── MANIFEST.in ├── Makefile ├── README.rst ├── conda └── environments │ └── wordbatch_dev.yml ├── data └── Tweets.csv ├── scripts ├── backends_benchmark.py ├── classify_airline_sentiment.py ├── decorator_test.py ├── wordbag_regressor.py ├── wordbag_regressor_spark.py ├── wordhash_regressor.py ├── wordseq_regressor.py └── wordvec_regressor.py ├── setup.cfg ├── setup.py └── wordbatch ├── __init__.py ├── batcher.py ├── data_utils.py ├── extractors ├── MurmurHash3.cpp ├── MurmurHash3.h ├── __init__.py └── extractors.pyx ├── models ├── __init__.py ├── avx_ext.c ├── avx_ext.h ├── fm_ftrl.pyx ├── ftrl.pyx ├── ftrl32.pyx ├── nn_relu_h1.pyx └── nn_relu_h2.pyx ├── pipelines ├── __init__.py ├── apply.py ├── apply_batch.py ├── apply_groupby.py ├── batch_transformer.py ├── feature_union.py └── wordbatch.py └── transformers ├── __init__.py ├── dictionary.py ├── text_normalizer.py └── tokenizer.py /.dockerignore: -------------------------------------------------------------------------------- 1 | data/ 2 | scripts/ 3 | .git 4 | .gitignore 5 | Dockerfile* 6 | Makefile 7 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | db.sqlite3-journal 62 | 63 | # Flask stuff: 64 | instance/ 65 | .webassets-cache 66 | 67 | # Scrapy stuff: 68 | .scrapy 69 | 70 | # Sphinx documentation 71 | docs/_build/ 72 | 73 | # PyBuilder 74 | target/ 75 | 76 | # Jupyter Notebook 77 | .ipynb_checkpoints 78 | 79 | # IPython 80 | profile_default/ 81 | ipython_config.py 82 | 83 | # pyenv 84 | .python-version 85 | 86 | # pipenv 87 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 88 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 89 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 90 | # install all needed dependencies. 
91 | #Pipfile.lock 92 | 93 | # celery beat schedule file 94 | celerybeat-schedule 95 | 96 | # SageMath parsed files 97 | *.sage.py 98 | 99 | # Environments 100 | .env 101 | .venv 102 | env/ 103 | venv/ 104 | ENV/ 105 | env.bak/ 106 | venv.bak/ 107 | 108 | # Spyder project settings 109 | .spyderproject 110 | .spyproject 111 | 112 | # Rope project settings 113 | .ropeproject 114 | 115 | # mkdocs documentation 116 | /site 117 | 118 | # mypy 119 | .mypy_cache/ 120 | .dmypy.json 121 | dmypy.json 122 | 123 | # Pyre type checker 124 | .pyre/ 125 | 126 | # Other 127 | data/tripadvisor/json/* 128 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM continuumio/miniconda3:4.7.12 as builder 2 | 3 | WORKDIR /app 4 | 5 | ENV PATH /opt/conda/bin:$PATH 6 | ENV CONDA_PREFIX /opt/conda 7 | 8 | COPY conda/environments/wordbatch_dev.yml /app 9 | 10 | RUN apt-get --allow-releaseinfo-change update && \ 11 | apt-get install -y --no-install-recommends build-essential gcc && \ 12 | conda update -n base conda && \ 13 | conda env update -f /app/wordbatch_dev.yml 14 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. (Some other Free Software Foundation software is covered by 18 | the GNU Lesser General Public License instead.) You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 
42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 
102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 113 | 114 | These requirements apply to the modified work as a whole. If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. (This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. 
However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. These actions are 183 | prohibited by law if you do not accept this License. Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 
214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 279 | 280 | END OF TERMS AND CONDITIONS 281 | 282 | How to Apply These Terms to Your New Programs 283 | 284 | If you develop a new program, and you want it to be of the greatest 285 | possible use to the public, the best way to achieve this is to make it 286 | free software which everyone can redistribute and change under these terms. 287 | 288 | To do so, attach the following notices to the program. It is safest 289 | to attach them to the start of each source file to most effectively 290 | convey the exclusion of warranty; and each file should have at least 291 | the "copyright" line and a pointer to where the full notice is found. 292 | 293 | 294 | Copyright (C) 295 | 296 | This program is free software; you can redistribute it and/or modify 297 | it under the terms of the GNU General Public License as published by 298 | the Free Software Foundation; either version 2 of the License, or 299 | (at your option) any later version. 300 | 301 | This program is distributed in the hope that it will be useful, 302 | but WITHOUT ANY WARRANTY; without even the implied warranty of 303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 304 | GNU General Public License for more details. 305 | 306 | You should have received a copy of the GNU General Public License along 307 | with this program; if not, write to the Free Software Foundation, Inc., 308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 309 | 310 | Also add information on how to contact you by electronic and paper mail. 311 | 312 | If the program is interactive, make it output a short notice like this 313 | when it starts in an interactive mode: 314 | 315 | Gnomovision version 69, Copyright (C) year name of author 316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 317 | This is free software, and you are welcome to redistribute it 318 | under certain conditions; type `show c' for details. 319 | 320 | The hypothetical commands `show w' and `show c' should show the appropriate 321 | parts of the General Public License. Of course, the commands you use may 322 | be called something other than `show w' and `show c'; they could even be 323 | mouse-clicks or menu items--whatever suits your program. 324 | 325 | You should also get your employer (if you work as a programmer) or your 326 | school, if any, to sign a "copyright disclaimer" for the program, if 327 | necessary. Here is a sample; alter the names: 328 | 329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 330 | `Gnomovision' (which makes passes at compilers) written by James Hacker. 331 | 332 | , 1 April 1989 333 | Ty Coon, President of Vice 334 | 335 | This General Public License does not permit incorporating your program into 336 | proprietary programs. 
If your program is a subroutine library, you may 337 | consider it more useful to permit linking proprietary applications with the 338 | library. If this is what you want to do, use the GNU Lesser General 339 | Public License instead of this License. 340 | 341 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE.txt 2 | 3 | include wordbatch/__init__.py 4 | include wordbatch/wordbatch.py 5 | 6 | include wordbatch/extractors/__init__.py 7 | include wordbatch/extractors/extractors.pyx 8 | include wordbatch/extractors/MurmurHash3.cpp 9 | include wordbatch/extractors/MurmurHash3.h 10 | 11 | include wordbatch/models/__init__.py 12 | include wordbatch/models/ftrl.pyx 13 | include wordbatch/models/fm_ftrl.pyx 14 | include wordbatch/models/nn_relu_h1.pyx 15 | include wordbatch/models/nn_relu_h2.pyx 16 | include wordbatch/models/avx_ext.c 17 | include wordbatch/models/avx_ext.h 18 | 19 | include scripts/wordbag_regressor.py 20 | include scripts/wordhash_regressor.py 21 | include scripts/wordseq_regressor.py 22 | include scripts/wordvec_regressor.py 23 | include scripts/classify_airline_sentiment.py 24 | include scripts/backends_benchmark.py 25 | 26 | include data/Tweets.csv -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | IMAGE_NAME=wordbatch 2 | CONTAINER_NAME=wordbatch_dev 3 | 4 | build: ## Build the image 5 | docker build -t $(IMAGE_NAME) . 6 | 7 | run-dev: ## Run container for development 8 | docker run \ 9 | -it \ 10 | --name=$(CONTAINER_NAME) \ 11 | -v $(shell pwd):/wordbatch $(IMAGE_NAME) bash 12 | 13 | attach: ## Run a bash in a running container 14 | docker start $(CONTAINER_NAME) && docker attach $(CONTAINER_NAME) 15 | 16 | stop: ## Stop and remove a running container 17 | docker stop $(CONTAINER_NAME); docker rm $(CONTAINER_NAME) 18 | 19 | test: 20 | docker start $(CONTAINER_NAME) 21 | docker exec -it $(CONTAINER_NAME) env | grep PATH 22 | docker exec -it $(CONTAINER_NAME) which python 23 | docker exec -it $(CONTAINER_NAME) python -c '\ 24 | import wordbatch;\ 25 | from wordbatch import models;\ 26 | print(wordbatch.__version__)' 27 | #docker exec -it $(CONTAINER_NAME) python -c 'print("hello")' 28 | #docker exec -it $(CONTAINER_NAME) echo "Hello from container!" -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | =============== 2 | Wordbatch 1.4.9 3 | =============== 4 | 5 | Overview 6 | ======== 7 | 8 | Python library for distributed AI processing pipelines, using swappable scheduler backends. 9 | 10 | Wordbatch parallelizes task pipelines as minibatches processed by a chosen scheduler backend. This allows 11 | the user to develop AI programs on a local workstation or laptop, and scale the same 12 | solution on a cluster or the cloud, simply by changing the pipeline backend to a distributed scheduler such as Spark, 13 | Dask or Ray. A backend can be chosen based on performance characteristics on a particular task, and swapped for 14 | different situations. For example, an AI model can be trained using a distributed backend, and then debugged or 15 | deployed using a single serial process. 
16 | 
17 | The library is organized around the orchestrator class Batcher, and Scikit-learn compatible components, 
18 | split into Pipelines, Transformers, Extractors and Models. These extend the Scikit-learn API with a 
19 | fit_partial() method that enables transformers and models to be used in a streaming fashion. 
20 | The current set of components has been developed mostly for text processing tasks, but components for other domains 
21 | can be developed based on the available classes. 
22 | 
23 | Requirements 
24 | ============ 
25 | Linux / Windows / macOS. Python 3.6 / 3.7 
26 | 
27 | Installation 
28 | ============ 
29 | pip install wordbatch 
30 | 
31 | macOS: compile using GCC-7 (https://github.com/anttttti/Wordbatch/issues/1) 
32 | 
33 | Linux: make sure GCC and its required libraries are installed before installing Wordbatch 
34 | | sudo apt install gcc 
35 | | sudo apt-get update 
36 | | sudo apt-get install --reinstall build-essential 
37 | 
38 | Getting started 
39 | =============== 
40 | 
41 | | from wordbatch.models import FTRL 
42 | | from wordbatch.extractors import WordBag 
43 | | from wordbatch.pipelines import WordBatch 
44 | | from wordbatch.batcher import Batcher 
45 | | 
46 | | wb= WordBatch(extractor=WordBag(hash_ngrams=0, norm= 'l2', tf= 'binary', idf= 50.0), 
47 | |               batcher=Batcher(backend="multiprocessing")) 
48 | | 
49 | | clf= FTRL(alpha=1.0, beta=1.0, L1=0.00001, L2=1.0, D=2 ** 25, iters=1) 
50 | | 
51 | | train_texts= ["Cut down a tree with a herring? It can't be done.", 
52 | |               "Don't say that word.", 
53 | |               "How can we not say the word if you don't tell us what it is?"] 
54 | | train_labels= [1, 0, 1] 
55 | | test_texts= ["Wait! I said it! I said it! Ooh! I said it again!"] 
56 | | 
57 | | clf.fit(wb.fit_transform(train_texts), train_labels) 
58 | | print(clf.predict(wb.transform(test_texts))) 
59 | | 
60 | | import ray 
61 | | ray.init() 
62 | | wb.batcher.backend= "ray" 
63 | | wb.batcher.backend_handle= ray 
64 | | 
65 | | clf.fit(wb.fit_transform(train_texts), train_labels) 
66 | | print(clf.predict(wb.transform(test_texts))) 
67 | 
68 | 
69 | Components 
70 | ========== 
71 | 
72 | Batcher 
73 | ------- 
74 | Batcher orchestrates MapReduce processing of tasks using a backend, by splitting input data into separately processed 
75 | minibatches. Currently three local (serial, multiprocessing, Loky) and three distributed backends (Spark, Dask, 
76 | Ray) are supported. Some distributed backends will process the tasks concurrently as a graph of lazily evaluated 
77 | futures, with Batcher dynamically sending the graph for the backend to process. All three supported distributed 
78 | backends allow real-time monitoring of the processing pipeline using the backend's own GUI. 
79 | 
80 | 
81 | Pipelines 
82 | --------- 
83 | Pipelines are classes that send functions, methods and classes to Batcher for processing. Unlike other components in 
84 | Wordbatch, pipelines contain a reference to Batcher, and are never referenced themselves in the calls sent to Batcher. 
85 | This prevents trying to serialize and send the backend handle itself. The simplest pipeline is Apply, 
86 | which processes a function or method over the input data row-by-row. WordBatch is a full, complex pipeline for text 
87 | processing, with optional steps such as text normalization, spelling correction, stemming, feature extraction, and 
88 | LZ4-caching of results. 
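
A minimal sketch of Apply and its minibatch-level counterpart ApplyBatch, assuming both follow the calling
pattern used in scripts/backends_benchmark.py (function first, Batcher passed as the batcher argument); the
backend and minibatch_size below are placeholder choices:

| from sklearn.feature_extraction.text import HashingVectorizer
| from wordbatch.pipelines import Apply, ApplyBatch
| from wordbatch.batcher import Batcher
|
| batcher= Batcher(backend="multiprocessing", minibatch_size=10000)
| texts= ["Cut down a tree with a herring? It can't be done.", "Don't say that word."]
|
| # Apply distributes a plain function over the input rows
| lengths= Apply(len, batcher=batcher).transform(texts)
|
| # ApplyBatch distributes a function that consumes a whole minibatch at once
| hv= HashingVectorizer(n_features=2 ** 25, ngram_range=(1, 2), norm='l2')
| features= ApplyBatch(hv.transform, batcher=batcher).transform(texts)

ApplyBatch is the better fit for vectorized functions such as the HashingVectorizer call above, since the
per-call overhead is paid once per minibatch rather than once per row.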
89 | 
90 | 
91 | Transformers 
92 | ------------ 
93 | Transformers are transformer classes extending the Scikit-learn API, by accepting a Batcher instance as argument 
94 | of fit and transform methods. Transformers won't store Batcher references, allowing the transformer objects to be sent 
95 | to distributed workers. This allows transformers to do MapReduce operations as part of their methods, for example 
96 | gathering a dictionary of words from data when fitting a Dictionary. The current set of transformers consists of 
97 | text-specific classes, such as Dictionary, Tokenizer and TextNormalizer. 
98 | 
99 | 
100 | Extractors 
101 | ---------- 
102 | Extractors are transformer classes which don't directly call Batcher. Since extractors can't call Batcher directly, 
103 | they are mostly immutable and used through their transform() method calls, which are distributed using a pipeline. 
104 | The current set of extractors is Cython-optimized and, aside from PandasHash, intended for text feature extraction. These are: 
105 | 
106 | - WordHash is a wrapper for the Scikit-learn HashingVectorizer, extended with an option for LZ4-caching 
107 | - WordBag is a flexible alternative to WordHash, with options such as IDF and per n-gram order weighting 
108 | - WordSeq provides sequences of word integers, as used by deep learning language models 
109 | - WordVec embeds words into word vector representations 
110 | - PandasHash extracts hashed features from a Pandas DataFrame, similar to VowpalWabbit's feature extraction 
111 | 
112 | 
113 | Models 
114 | ------ 
115 | Models are predictive models such as classifiers. Similar to extractors, they don't directly call Batcher, but are 
116 | Scikit-learn compatible and distributed using a pipeline if needed. Currently four 
117 | OpenMP-multithreaded L1&L2-regularized online learning models are provided, for single-label regression and 
118 | classification: 
119 | 
120 | - FTRL : Linear Proximal-FTRL model that has become the most popular algorithm for online learning of linear models in Kaggle competitions. The Cython-optimized implementation should be the fastest available version of FTRL. 
121 | - FM_FTRL : Factorization Machines. Linear effects estimated with FTRL and factor effects estimated with adaptive SGD. Prediction and estimation multithreaded across factors. 
122 | - NN_Relu_H1 : Neural Network with 1 hidden layer and Rectified Linear Unit activations, estimated with adaptive SGD. Prediction and estimation multithreaded across hidden layer. 
123 | - NN_Relu_H2 : Neural Network with 2 hidden layers and Rectified Linear Unit activations, estimated with adaptive SGD. Prediction multithreaded across 2nd hidden layer, estimation across 1st hidden layer outputs. 
124 | 
125 | The adaptive SGD optimizer works like Adagrad, but pools the adaptive learning rates across hidden nodes using the same 
126 | feature. This makes learning more robust and requires less memory. FM_FTRL uses AVX2-optimization, so that processors 
127 | supporting AVX2 will run the factorization model up to four times faster. 
128 | 
129 | Example scripts 
130 | =============== 
131 | 
132 | The directory /scripts/ contains scripts for demonstrating and testing basic uses of the toolkit. To run the scripts 
133 | one should first install the dependencies: Keras, NLTK, TextBlob, Pandas, Ray, Dask Distributed and PySpark. 
134 | The scripts also use the TripAdvisor dataset (http://times.cs.uiuc.edu/~wang296/Data/), and the 
135 | precomputed word embeddings glove.twitter.27B.100d and glove.6B.50d (http://nlp.stanford.edu/projects/glove/). 
Test 136 | data from Crowdflower Open data & Kaggle is provided in the /data directory. 137 | 138 | Airline Classification Example 139 | ------------------------------ 140 | classify_airline_sentiment.py shows training and combining predictions with four classifier scripts that use the 141 | Wordbatch extractors and models: wordhash_regressor.py, wordbag_regressor.py, wordseq_regressor.py and 142 | wordvec_regressor.py. The header part of the script can be modified to choose the backend. By default Ray is used and 143 | passed to the other scripts. 144 | 145 | Backends Benchmark Example 146 | -------------------------- 147 | backends_benchmark.py shows how to benchmark different backends on two simple pipeline tasks: 148 | using ApplyBatch with Scikit-learn HashingVectorizer, and running WordBatch Pipeline with most of its possible 149 | processing steps. Dask and Spark are commented out by default, as these need command-line configuration. 150 | All three distributed backends can be configured to run across a distributed cluster, as done in the 151 | commented-out code. 152 | 153 | 154 | Contributors 155 | ============ 156 | Antti Puurula 157 | 158 | Anders Topper 159 | 160 | Cheng-Tsung Liu 161 | -------------------------------------------------------------------------------- /conda/environments/wordbatch_dev.yml: -------------------------------------------------------------------------------- 1 | name: base 2 | channels: 3 | - conda-forge 4 | - defaults 5 | dependencies: 6 | - python>=3.7.3 7 | - numpy==1.20.3 8 | - cython==0.29.10 9 | - pip 10 | # - nltk>=3.4.3 11 | # - textblob 12 | - pip: 13 | - wordbatch==1.4.8 14 | # - keras==2.3.1 15 | # - pyspark==2.4.0 16 | # - dask>=2.1.0 17 | # - distributed>=2.1.0 18 | prefix: /opt/conda -------------------------------------------------------------------------------- /scripts/backends_benchmark.py: -------------------------------------------------------------------------------- 1 | import re 2 | from contextlib import closing, contextmanager 3 | import time 4 | from wordbatch.pipelines import WordBatch, Apply, ApplyBatch 5 | from wordbatch.extractors import WordHash, WordBag 6 | from wordbatch.transformers import Tokenizer, Dictionary 7 | from wordbatch.batcher import Batcher 8 | import os 9 | import json 10 | from sklearn.feature_extraction.text import HashingVectorizer 11 | import warnings 12 | import pandas as pd 13 | import multiprocessing 14 | 15 | # http://sifaka.cs.uiuc.edu/~wang296/Data/LARA/TripAdvisor/ 16 | tripadvisor_dir= "../data/tripadvisor/json" 17 | 18 | # Configure below to allow Dask / Spark 19 | # scheduler_ip= "169.254.93.14" 20 | # from dask.distributed import Client 21 | # #dask-scheduler --host 169.254.93.14 22 | # #dask-worker 169.254.93.14:8786 --nprocs 16 23 | # dask_client = Client(scheduler_ip+":8786") 24 | # 25 | # from pyspark import SparkContext, SparkConf 26 | # # conf= SparkConf().setAll([('spark.executor.memory', '4g'), ('spark.driver.memory', '30g'), 27 | # # ('spark.driver.maxResultSize', '10g')]) 28 | # import os 29 | # os.environ['PYSPARK_PYTHON'] = '/home/USERNAME/anaconda3/envs/ENV_NAME/bin/python' 30 | # conf= SparkConf().setAll([('spark.executor.memory', '4g'), ('spark.driver.memory', '30g'), 31 | # ('spark.driver.maxResultSize', '10g')]).setMaster("spark://169.254.93.14:7077") 32 | # spark_context = SparkContext(conf=conf) 33 | 34 | import ray 35 | #ray start --head --node-ip-address 169.254.93.14 36 | #ray.init(redis_address=scheduler_ip+":57113") #Change port accordingly 37 | ray.init() 38 | 39 | 
@contextmanager 40 | def timer(name): 41 | t0 = time.time() 42 | yield 43 | print(name + " done in " + str(time.time() - t0) + "s") 44 | 45 | if 1==1: 46 | texts= [] 47 | for jsonfile in os.listdir(tripadvisor_dir): 48 | with open(tripadvisor_dir + "/" + jsonfile, 'r') as inputfile: 49 | for line in inputfile: 50 | try: 51 | line = json.loads(line.strip()) 52 | except: 53 | continue 54 | for review in line["Reviews"]: 55 | texts.append(review["Content"]) 56 | # pd.to_pickle(texts, "tripadvisor_data.pkl") 57 | # else: 58 | # texts= pd.read_pickle("tripadvisor_data.pkl") 59 | 60 | non_alphanums = re.compile('[\W+]') 61 | nums_re= re.compile("\W*[0-9]+\W*") 62 | triples_re= re.compile(r"(\w)\1{2,}") 63 | trash_re= [re.compile("<[^>]*>"), re.compile("[^a-z0-9' -]+"), re.compile(" [.0-9'-]+ "), re.compile("[-']{2,}"), 64 | re.compile(" '"),re.compile(" +")] 65 | from nltk.stem.porter import PorterStemmer 66 | stemmer= PorterStemmer() 67 | 68 | def normalize_text(text): 69 | text= text.lower() 70 | text= nums_re.sub(" NUM ", text) 71 | text= " ".join([word for word in non_alphanums.sub(" ",text).strip().split() if len(word)>1]) 72 | return text 73 | 74 | print(len(texts)) 75 | backends= [ 76 | ['serial', ""], 77 | ['multiprocessing', ""], 78 | ['loky', ""], 79 | # ['dask', dask_client], # Uncomment once configured 80 | # ['spark', spark_context], # Uncomment once configured 81 | ['ray', ray] 82 | ] 83 | 84 | tasks= [ 85 | "ApplyBatch", 86 | "WordBag", 87 | ] 88 | 89 | data_sizes= [40000, 80000, 160000, 320000, 640000, 1280000] 90 | 91 | for task in tasks: 92 | for data_size in data_sizes: 93 | texts_chunk = texts[:data_size] 94 | print("Task:", task, "Data size:", data_size) 95 | for backend in backends: 96 | batcher = Batcher(procs=multiprocessing.cpu_count(), minibatch_size=5000, backend=backend[0], backend_handle=backend[1]) 97 | try: 98 | with timer("Completed: ["+task+","+str(len(texts_chunk))+","+backend[0]+"]"), warnings.catch_warnings(): 99 | warnings.simplefilter("ignore") 100 | if task=="ApplyBatch": 101 | hv = HashingVectorizer(decode_error='ignore', n_features=2 ** 25, preprocessor=normalize_text, 102 | ngram_range=(1, 2), norm='l2') 103 | t= ApplyBatch(hv.transform, batcher=batcher).transform(texts_chunk) 104 | print(t.shape, t.data[:5]) 105 | 106 | if task=="WordBag": 107 | wb = WordBatch(normalize_text=normalize_text, 108 | dictionary=Dictionary(min_df=10, max_words=1000000, verbose=0), 109 | tokenizer= Tokenizer(spellcor_count=2, spellcor_dist=2, stemmer= stemmer), 110 | extractor=WordBag(hash_ngrams=0, norm= 'l2', tf= 'binary', idf= 50.0), 111 | batcher= batcher, 112 | verbose= 0) 113 | t = wb.fit_transform(texts_chunk) 114 | print(t.shape, t.data[:5]) 115 | except: 116 | print("Failed: ["+task+","+str(len(texts_chunk))+","+backend[0]+"]") 117 | print("") -------------------------------------------------------------------------------- /scripts/classify_airline_sentiment.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import scipy as sp 4 | import re 5 | import sklearn 6 | from sklearn.model_selection import * 7 | from sklearn.ensemble import RandomForestRegressor 8 | import textblob 9 | from math import * 10 | import time, datetime 11 | import multiprocessing 12 | from wordbatch.batcher import Batcher 13 | import wordbatch 14 | 15 | print("Wordbatch version:", wordbatch.__version__) 16 | pd.set_option('display.max_rows', 500) 17 | pd.set_option('display.max_columns', 500) 18 | 
pd.set_option('display.max_colwidth', 1000) 19 | 20 | backend= "ray" 21 | #backend= "multiprocessing" 22 | minibatch_size= 10000 23 | if backend == "ray": 24 | import ray 25 | ray.init() 26 | b = Batcher(backend="ray", backend_handle=ray, minibatch_size=minibatch_size) 27 | if backend == "multiprocessing": 28 | b = Batcher(backend="multiprocessing", minibatch_size=minibatch_size) 29 | 30 | # http://sifaka.cs.uiuc.edu/~wang296/Data/LARA/TripAdvisor/ 31 | tripadvisor_dir= "../data/tripadvisor/json" 32 | if __name__ == "__main__": 33 | start_time= time.time() 34 | print(datetime.datetime.now()) 35 | 36 | #df= pd.DataFrame.from_csv("../data/Tweets.csv", encoding="utf8") 37 | df = pd.read_csv("../data/Tweets.csv", encoding="utf8") 38 | def sentiment_to_label(sentiment): 39 | if sentiment=="neutral": return 0 40 | if sentiment=="negative": return -1 41 | return 1 42 | d_sentiment_to_label= {"neutral":0, "negative":-1, "positive":1} 43 | df['airline_sentiment_confidence']= df['airline_sentiment_confidence'].astype('str') 44 | df['sentiment'] = (df['airline_sentiment']).map(d_sentiment_to_label) 45 | df= df[['text','sentiment']] 46 | 47 | re_attags= re.compile(" @[^ ]* ") 48 | re_spaces= re.compile("\w+]") 49 | df['text']= df['text'].apply(lambda x: re_spaces.sub(" ",re_attags.sub(" ", " "+x+" "))[1:-1]) 50 | df= df.drop_duplicates(subset=['text']) 51 | df.index= df['id']= range(df.shape[0]) 52 | 53 | non_alphanums=re.compile('[^A-Za-z]+') 54 | def normalize_text(text): return non_alphanums.sub(' ', text).lower().strip() 55 | df['text_normalized']= df['text'].map(lambda x: normalize_text(x)) 56 | df['textblob_score']= df['text_normalized'].map(lambda x: textblob.TextBlob(x).polarity) 57 | 58 | import wordbag_regressor 59 | print("Train wordbag regressor") 60 | wb_regressor= wordbag_regressor.WordbagRegressor("../models/wordbag_model.pkl.gz", tripadvisor_dir, b) 61 | #wb_regressor= wordbag_regressor.WordbagRegressor("../models/wordbag_model.pkl.gz") 62 | df['wordbag_score']= wb_regressor.predict(df['text'].values) 63 | print(("%s minutes ---" % round(((time.time() - start_time) / 60), 2))) 64 | 65 | import wordhash_regressor 66 | print("Train wordhash regressor") 67 | wh_regressor= wordhash_regressor.WordhashRegressor("../models/wordhash_model.pkl.gz", tripadvisor_dir, b) 68 | #wh_regressor= wordhash_regressor.WordhashRegressor("../models/wordhash_model.pkl.gz") 69 | df['wordhash_score']= wh_regressor.predict(df['text'].values) 70 | print(("%s minutes ---" % round(((time.time() - start_time) / 60), 2))) 71 | 72 | import wordseq_regressor 73 | print("Train wordseq regressor") 74 | ws_regressor = wordseq_regressor.WordseqRegressor("../models/wordseq_model.pkl.gz", tripadvisor_dir, b) 75 | #ws_regressor = wordseq_regressor.WordseqRegressor("../models/wordseq_model.pkl.gz") 76 | df['wordseq_score']= ws_regressor.predict_batch(df['text'].values) 77 | print(("%s minutes ---" % round(((time.time() - start_time) / 60), 2))) 78 | 79 | import wordvec_regressor 80 | print("Train wordvec regressor") 81 | wv_regressor= wordvec_regressor.WordvecRegressor("../models/wordvec_model.pkl.gz", tripadvisor_dir, b) 82 | #wv_regressor= wordvec_regressor.WordvecRegressor("../models/wordvec_model.pkl.gz") 83 | df['wordvec_score'] = wv_regressor.predict(df['text'].values) 84 | print(df['wordvec_score']) 85 | print(("%s minutes ---" % round(((time.time() - start_time) / 60), 2))) 86 | 87 | df['tweet_len']= df['text'].map(lambda x: log(1+len(x))) 88 | df['tweet_wordcount']= df['text'].map(lambda x: log(1+len(x.split()))) 89 | 
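	# The per-model scores (wordbag, wordhash, wordseq, wordvec, textblob) and the simple length
	# features computed above are combined below by a RandomForest meta-regressor. Predictions are
	# made out-of-fold with a 10-fold split, so each tweet is scored by a model that did not see it
	# in training, then clipped to [-1, 1] and evaluated against the sentiment labels with MSE.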
90 | print(df) 91 | full_preds= np.zeros(df.shape[0]) 92 | columns_pick= ['tweet_len', 'tweet_wordcount', 'wordbag_score', 'wordhash_score', 'wordseq_score', 'wordvec_score', 'textblob_score'] #Mean Squared Error: 0.28730889581 93 | #columns_pick= ['tweet_len', 'tweet_wordcount', 'wordhash_score', 'wordseq_score', 'wordvec_score', 'textblob_score'] # 94 | #columns_pick= ['tweet_len', 'tweet_wordcount', 'wordbag_score', 'wordseq_score', 'wordvec_score', 'textblob_score'] # 95 | #columns_pick= ['tweet_len', 'tweet_wordcount', 'wordbag_score', 'wordhash_score', 'wordvec_score', 'textblob_score'] # 96 | #columns_pick= ['tweet_len', 'tweet_wordcount', 'wordbag_score', 'wordhash_score', 'wordseq_score', 'textblob_score'] # 97 | 98 | kf= KFold(n_splits=10, shuffle=True, random_state=0) 99 | for train_index, dev_index in kf.split(range(df.shape[0])): 100 | df_train= df.iloc[train_index] 101 | df_dev= df.iloc[dev_index] 102 | clf= RandomForestRegressor(n_estimators=200, criterion='mse', max_depth=None, min_samples_split=5, 103 | min_samples_leaf=2, min_weight_fraction_leaf=0.0, max_features='auto', 104 | max_leaf_nodes=None, bootstrap=True, oob_score=False, 105 | n_jobs=multiprocessing.cpu_count(), random_state=0, 106 | verbose=0, warm_start=False) 107 | 108 | clf.fit(df_train[columns_pick], df_train['sentiment']) 109 | preds= clf.predict(df_dev[columns_pick]) 110 | for x in range(len(preds)): full_preds[df_dev['id'].iloc[x]]= preds[x] 111 | 112 | df['preds']= sp.clip(full_preds, -1.0, 1.0) 113 | 114 | print(datetime.datetime.now()) 115 | print(("%s minutes ---" % round(((time.time() - start_time)/60),2))) 116 | 117 | c_mse= sklearn.metrics.mean_squared_error(df['sentiment'], df['preds'], sample_weight=None, 118 | multioutput='uniform_average') 119 | print("Mean Squared Error:", c_mse) 120 | -------------------------------------------------------------------------------- /scripts/decorator_test.py: -------------------------------------------------------------------------------- 1 | import re 2 | from contextlib import contextmanager 3 | import time 4 | from wordbatch.pipelines import decorator_apply as apply 5 | from wordbatch.batcher import Batcher 6 | import warnings 7 | import pandas as pd 8 | from nltk.stem.porter import PorterStemmer 9 | from numba import int64, float64 10 | import os 11 | import json 12 | 13 | tripadvisor_dir= "../data/tripadvisor/json" 14 | 15 | import ray 16 | #ray start --head --node-ip-address 169.254.93.14 17 | #ray.init(redis_address=scheduler_ip+":57113") #Change port accordingly 18 | ray.init() 19 | 20 | @contextmanager 21 | def timer(name): 22 | t0 = time.time() 23 | yield 24 | print(name + " done in " + str(time.time() - t0) + "s") 25 | 26 | if 1==1: 27 | texts= [] 28 | for jsonfile in os.listdir(tripadvisor_dir): 29 | with open(tripadvisor_dir + "/" + jsonfile, 'r') as inputfile: 30 | for line in inputfile: 31 | try: 32 | line = json.loads(line.strip()) 33 | except: 34 | continue 35 | for review in line["Reviews"]: 36 | texts.append(review["Content"]) 37 | # pd.to_pickle(texts, "tripadvisor_data.pkl") 38 | # else: 39 | # texts= pd.read_pickle("tripadvisor_data.pkl") 40 | 41 | non_alphanums = re.compile('[\W+]') 42 | stemmer= PorterStemmer() 43 | 44 | def normalize_text(text): 45 | text= " ".join([word for word in non_alphanums.sub(" ",text.lower()).strip().split() if len(word)>1]) 46 | return text 47 | 48 | print(len(texts)) 49 | backends= [ 50 | ###['multiprocessing', ""], #doesn't serialize lambda functions 51 | ['ray', ray], 52 | ['loky', ""], 53 | ['serial', 
""], 54 | ] 55 | 56 | #data_size= 200000 57 | #data_size= 500000 58 | data_size= 1280000 59 | 60 | def test_backend(texts, backend): 61 | df = pd.DataFrame(texts, columns=['text']) 62 | df['text']= df['text'].fillna("") 63 | batcher = Batcher(minibatch_size=5000, backend=backend[0], backend_handle=backend[1]) 64 | #batcher = Batcher(minibatch_size=data_size//8, backend=backend[0], backend_handle=backend[1]) 65 | if backend[0]=="ray": 66 | backend[1].shutdown() 67 | backend[1].init() 68 | 69 | try: 70 | with timer("Text normalization: " + str(len(df)) + "," + backend[0]), warnings.catch_warnings(): 71 | warnings.simplefilter("ignore") 72 | df['text_normalized'] = apply(normalize_text, batcher)(df['text']) 73 | with timer("Text normalization without Wordbatch: " + str(len(df)) + "," + backend[0]) \ 74 | , warnings.catch_warnings(): 75 | warnings.simplefilter("ignore") 76 | df['text_normalized'] = [normalize_text(x) for x in df['text']] 77 | except Exception as e: 78 | print("Failed text normalization: " +"," + str(len(df)) + "," + backend[0]) 79 | # #"Exception:", e.split("\n")[0]) 80 | 81 | try: 82 | def div(x, y): 83 | return 0 if y==0 else x / y 84 | df['len_text'] = df['text'].str.len().astype(int) 85 | df['len_text_normalized'] = df['text_normalized'].str.len().astype(int) 86 | # list(zip(df['text_normalized'], df['text_normalized'])) 87 | # np.vstack([df['text_normalized'], df['text_normalized']]).T 88 | with timer("Text length ratio vectorized: " + str(len(df)) + "," + backend[0]), warnings.catch_warnings(): 89 | df['len_ratio'] = apply(div, batcher, vectorize=[float64(int64, int64)])( 90 | df[['len_text', 'len_text_normalized']].values) 91 | with timer("Text length ratio without vectorization: " + str(len(df)) + "," + backend[0]), \ 92 | warnings.catch_warnings(): 93 | df['len_ratio'] = apply(lambda x:div(*x), batcher)(df[['len_text', 'len_text_normalized']].values) 94 | with timer("Text length ratio without Wordbatch: " + str(len(df)) + "," + backend[0]), warnings.catch_warnings(): 95 | df['len_ratio'] = [div(x, y) for x, y in zip(df['len_text'], df['len_text_normalized'])] 96 | except Exception as e: 97 | print("Failed text length ratios: " +"," + str(len(df)) + "," + backend[0]) 98 | # #return 99 | 100 | try: 101 | with timer("Splitting first word: " + str(len(df)) + "," + backend[0]) \ 102 | , warnings.catch_warnings(): 103 | warnings.simplefilter("ignore") 104 | df['first_word'] = apply(lambda x: x.split(" ")[0], batcher)(df['text_normalized']) 105 | with timer("Splitting first word without Wordbatch: " + str(len(df)) + "," + backend[0]), \ 106 | warnings.catch_warnings(): 107 | warnings.simplefilter("ignore") 108 | df['first_word'] = [x.split(" ")[0] for x in df['text_normalized']] 109 | except Exception as e: 110 | print("Failed splitting first word: " + str(len(df)) + "," + backend[0]) 111 | # "Exception:", e.split("\n")[0]) 112 | 113 | try: 114 | with timer("Stemming first word: " + str(len(df)) + "," + backend[0]), \ 115 | warnings.catch_warnings(): 116 | warnings.simplefilter("ignore") 117 | df['first_word_stemmed'] = apply(stemmer.stem, batcher)(df['first_word']) 118 | with timer("Stemming first word without Wordbatch: " + str(len(df)) + "," + backend[0]), \ 119 | warnings.catch_warnings(): 120 | warnings.simplefilter("ignore") 121 | df['first_word_stemmed'] = [stemmer.stem(x) for x in df['first_word']] 122 | except Exception as e: 123 | print("Failed stemming first word: " + str(len(df)) + "," + backend[0]) 124 | # # "Exception:", e.split("\n")[0]) 125 | # 126 | try: 127 
| with timer("Stemming first word (cache=1000): " + str(len(df)) + "," + backend[0]), \ 128 | warnings.catch_warnings(): 129 | warnings.simplefilter("ignore") 130 | df['first_word_stemmed'] = apply(stemmer.stem, batcher, cache=1000)(df['first_word']) 131 | 132 | with timer("Stemming first word (cache=1000) without Wordbatch: " + str(len(df)) + "," + backend[0]), \ 133 | warnings.catch_warnings(): 134 | warnings.simplefilter("ignore") 135 | from functools import lru_cache 136 | cache_stem= lru_cache(maxsize=1000)(stemmer.stem) 137 | df['first_word_stemmed'] = [cache_stem(x) for x in df['first_word']] 138 | except Exception as e: 139 | print("Failed stemming first word: " + str(len(df)) + "," + backend[0]) 140 | # "Exception:", e.split("\n")[0]) 141 | 142 | try: 143 | batcher.minibatch_size = 200 144 | with timer("Groupby aggregation: " + str(len(df)) + "," + backend[0]), \ 145 | warnings.catch_warnings(): 146 | warnings.simplefilter("ignore") 147 | group_ids, groups = zip(*df[['first_word_stemmed', 'text']].groupby('first_word_stemmed')) 148 | res = apply(lambda x: x['text'].str.len().agg('mean'), batcher)(groups) 149 | df['first_word_stemmed_mean_text_len'] = df['first_word_stemmed'].map( 150 | {x: y for x, y in zip(group_ids, res)}) 151 | 152 | batcher.minibatch_size = 10 153 | df['first_word_stemmed_hashbin'] = [hash(x) % 500 for x in df['first_word_stemmed']] 154 | with timer("Groupby aggregation hashbin: " + str(len(df)) + "," + backend[0]), \ 155 | warnings.catch_warnings(): 156 | warnings.simplefilter("ignore") 157 | group_ids, groups = zip(*df[['first_word_stemmed', 'text', 'first_word_stemmed_hashbin']] 158 | .groupby('first_word_stemmed_hashbin')) 159 | res = pd.concat(apply(lambda x: x.groupby('first_word_stemmed').apply( 160 | lambda z: z['text'].str.len().agg('mean')), batcher)(groups)) 161 | df['first_word_stemmed_mean_text_len'] = df['first_word_stemmed'].map(res) 162 | 163 | with timer("Groupby aggregation without Wordbatch: " + str(len(df)) + "," + backend[0]), \ 164 | warnings.catch_warnings(): 165 | warnings.simplefilter("ignore") 166 | group_ids, groups = zip(*df[['first_word_stemmed', 'text']].groupby('first_word_stemmed')) 167 | res = [x['text'].str.len().agg('mean') for x in groups] 168 | df['first_word_stemmed_mean_text_len'] = df['first_word_stemmed'].map( 169 | {x: y for x, y in zip(group_ids, res)}) 170 | del (res, group_ids, groups, df, batcher) 171 | except Exception as e: 172 | print("Failed groupby aggregation: " + str(len(df)) + "," + backend[0]) 173 | # "Exception:", e.split("\n")[0]) 174 | 175 | texts= texts[:data_size] 176 | if __name__ == '__main__': 177 | for backend in backends: 178 | test_backend(texts, backend) 179 | -------------------------------------------------------------------------------- /scripts/wordbag_regressor.py: -------------------------------------------------------------------------------- 1 | from __future__ import with_statement 2 | from __future__ import division 3 | from __future__ import absolute_import 4 | from __future__ import print_function 5 | import os 6 | import re 7 | import json 8 | import gzip 9 | from wordbatch.pipelines import WordBatch 10 | from wordbatch.models import FTRL 11 | from wordbatch.extractors import WordBag 12 | import threading 13 | import sys 14 | if sys.version_info.major == 3: 15 | import pickle as pkl 16 | else: 17 | import cPickle as pkl 18 | 19 | 20 | non_alphanums = re.compile('[\W+]') 21 | nums_re= re.compile("\W*[0-9]+\W*") 22 | triples_re= re.compile(r"(\w)\1{2,}") 23 | trash_re= 
[re.compile("<[^>]*>"), re.compile("[^a-z0-9' -]+"), re.compile(" [.0-9'-]+ "), re.compile("[-']{2,}"), 24 | re.compile(" '"),re.compile(" +")] 25 | from nltk.stem.porter import PorterStemmer 26 | stemmer= PorterStemmer() 27 | def normalize_text(text): 28 | text= text.lower() 29 | text= nums_re.sub(" NUM ", text) 30 | text= " ".join([word for word in non_alphanums.sub(" ",text).strip().split() if len(word)>1]) 31 | return text 32 | 33 | class WordbagRegressor(object): 34 | def __init__(self, pickle_model="", datadir=None, batcher= None): 35 | self.wb = WordBatch(normalize_text=normalize_text, 36 | extractor=WordBag(hash_ngrams=3, hash_ngrams_weights=[-1.0, -1.0, 1.0], 37 | hash_size=2**23, norm='l2', tf='binary', idf=50.0), 38 | batcher= batcher) 39 | 40 | self.clf= FTRL(alpha=1.0, beta=1.0, L1=0.00001, L2=1.0, D=2 ** 23, iters=1, inv_link="identity") 41 | if datadir==None: (self.wb, self.clf)= pkl.load(gzip.open(pickle_model, 'rb')) 42 | else: self.train(datadir, pickle_model) 43 | 44 | def fit_batch(self, texts, labels, rcount): 45 | texts, labels= self.wb.batcher.shuffle_batch(texts, labels, rcount) 46 | print("Transforming", rcount) 47 | texts= self.wb.fit_transform(texts, reset= False) 48 | print("Training", rcount) 49 | self.clf.fit(texts, labels, reset= False) 50 | 51 | def train(self, datadir, pickle_model=""): 52 | texts= [] 53 | labels= [] 54 | training_data= os.listdir(datadir) 55 | rcount= 0 56 | batchsize= 100000 57 | 58 | p = None 59 | for jsonfile in training_data: 60 | with open(datadir + "/" + jsonfile, 'r') as inputfile: 61 | for line in inputfile: 62 | #if rcount > 1000000: break 63 | try: line = json.loads(line.strip()) 64 | except: continue 65 | for review in line["Reviews"]: 66 | rcount+= 1 67 | if rcount % 100000 == 0: print(rcount) 68 | if rcount % 7 != 0: continue 69 | if "Overall" not in review["Ratings"]: continue 70 | texts.append(review["Content"]) 71 | labels.append((float(review["Ratings"]["Overall"]) - 3) * 0.5) 72 | if len(texts) % batchsize == 0: 73 | if p != None: p.join() 74 | p= threading.Thread(target=self.fit_batch, args=(texts, labels, rcount)) 75 | p.start() 76 | texts= [] 77 | labels= [] 78 | if p != None: p.join() 79 | self.fit_batch(texts, labels, rcount) 80 | 81 | self.wb.dictionary_freeze= True 82 | 83 | if pickle_model!="": 84 | with gzip.open(pickle_model, 'wb') as model_file: 85 | backend= self.wb.batcher.backend 86 | backend_handle= self.wb.batcher.backend_handle 87 | self.wb.batcher.backend= "serial" 88 | self.wb.batcher.backend_handle = None 89 | pkl.dump((self.wb, self.clf), model_file, protocol=2) 90 | self.wb.batcher.backend = backend 91 | self.wb.batcher.backend_handle = backend_handle 92 | 93 | def predict(self, texts): 94 | counts= self.wb.transform(texts) 95 | return self.clf.predict(counts) -------------------------------------------------------------------------------- /scripts/wordbag_regressor_spark.py: -------------------------------------------------------------------------------- 1 | from __future__ import with_statement 2 | from __future__ import division 3 | from __future__ import absolute_import 4 | from __future__ import print_function 5 | import os 6 | import re 7 | import json 8 | import gzip 9 | import wordbatch 10 | from wordbatch.models import FTRL 11 | from wordbatch.extractors import WordBag 12 | import threading 13 | import pandas as pd 14 | import sys 15 | if sys.version_info.major == 3: 16 | import pickle as pkl 17 | else: 18 | import cPickle as pkl 19 | 20 | non_alphanums = re.compile('[\W]') 21 | nums_re= 
re.compile("\W*[0-9]+\W*") 22 | triples_re= re.compile(r"(\w)\1{2,}") 23 | trash_re= [re.compile("<[^>]*>"), re.compile("[^a-z0-9' -]+"), re.compile(" [.0-9'-]+ "), re.compile("[-']{2,}"), 24 | re.compile(" '"),re.compile(" +")] 25 | from nltk.stem.porter import PorterStemmer 26 | stemmer= PorterStemmer() 27 | def normalize_text(text): 28 | text= text.lower() 29 | text= nums_re.sub(" NUM ", text) 30 | text= " ".join([word for word in non_alphanums.sub(" ",text).split() if len(word)>1]) 31 | return text 32 | 33 | class WordbagRegressor(object): 34 | def __init__(self, pickle_model="", datadir=None): 35 | from pyspark import SparkContext 36 | self.sc= SparkContext() 37 | self.wordbatch = wordbatch.WordBatch(normalize_text, backend="spark", backend_handle=self.sc, 38 | extractor=(WordBag, {"hash_ngrams":3, 39 | "hash_ngrams_weights":[-1.0, -1.0, 1.0], 40 | "hash_size":2**23, "norm":'l2', 41 | "tf":'binary', "idf":50.0})) 42 | self.clf= FTRL(alpha=1.0, beta=1.0, L1=0.00001, L2=1.0, D=2 ** 23, iters=1, inv_link="identity") 43 | if datadir==None: (self.wordbatch, self.clf)= pkl.load(gzip.open(pickle_model, 'rb')) 44 | else: self.train(datadir, pickle_model) 45 | 46 | def fit_batch(self, texts, labels, rcount): 47 | print("Transforming", rcount) 48 | # if self.sc != None: 49 | # data_rdd= self.wordbatch.lists2rddbatches([texts, labels], self.sc) 50 | # data_rdd= self.wordbatch.transform(data_rdd) 51 | # [texts, labels]= self.wordbatch.rddbatches2lists(data_rdd) 52 | # else: 53 | # print(texts[:2]) 54 | # print(pd.Series(labels).value_counts()) 55 | texts= self.wordbatch.partial_fit_transform(texts) 56 | print("Training", rcount) 57 | self.clf.partial_fit(texts, labels) 58 | 59 | def train(self, datadir, pickle_model=""): 60 | texts= [] 61 | labels= [] 62 | training_data= os.listdir(datadir) 63 | rcount= 0 64 | batchsize= 20000 65 | 66 | p = None 67 | for jsonfile in training_data: 68 | with open(datadir + "/" + jsonfile, 'r') as inputfile: 69 | for line in inputfile: 70 | #if rcount > 1000000: break 71 | try: line = json.loads(line.strip()) 72 | except: continue 73 | for review in line["Reviews"]: 74 | rcount+= 1 75 | if rcount % 100000 == 0: print(rcount) 76 | if rcount % 7 != 0: continue 77 | if "Overall" not in review["Ratings"]: continue 78 | texts.append(review["Content"]) 79 | labels.append((float(review["Ratings"]["Overall"]) - 3) *0.5) 80 | if len(texts) % batchsize == 0: 81 | if p != None: p.join() 82 | p= threading.Thread(target=self.fit_batch, args=(texts, labels, rcount)) 83 | p.start() 84 | texts= [] 85 | labels= [] 86 | if p != None: p.join() 87 | self.fit_batch(texts, labels, rcount) 88 | 89 | self.wordbatch.dictionary_freeze= True 90 | 91 | if pickle_model!="": 92 | with gzip.open(pickle_model, 'wb') as model_file: 93 | pkl.dump((self.wordbatch, self.clf), model_file, protocol=2) 94 | 95 | def predict(self, texts): 96 | # if self.sc != None: 97 | # data_rdd= self.wordbatch.lists2rddbatches([texts, []], self.sc) 98 | # data_rdd= self.wordbatch.transform(data_rdd) 99 | # [counts, labels]= self.wordbatch.rddbatches2lists(data_rdd) 100 | # else: 101 | counts= self.wordbatch.transform(texts) 102 | return self.clf.predict(counts) 103 | 104 | def predict_parallel(self, texts): 105 | # if self.sc != None: 106 | # data_rdd= self.wordbatch.lists2rddbatches([texts, []] , self.sc) 107 | # counts_rdd= self.wordbatch.transform(data_rdd) 108 | # return self.wordbatch.rddbatches2lists(self.wordbatch.predict_parallel(counts_rdd, self.clf))[0] 109 | counts= self.wordbatch.transform(texts) 110 | 
return self.wordbatch.predict_parallel(counts, self.clf) 111 | 112 | if __name__ == "__main__": 113 | df= pd.DataFrame.from_csv("../data/Tweets.csv", encoding="utf8") 114 | def sentiment_to_label(sentiment): 115 | if sentiment=="neutral": return 0 116 | if sentiment=="negative": return -1 117 | return 1 118 | 119 | df['airline_sentiment_confidence']= df['airline_sentiment_confidence'].astype('str') 120 | df['sentiment']= (df['airline_sentiment']).apply(lambda x: sentiment_to_label(x)) 121 | df= df[['text','sentiment']] 122 | 123 | re_attags= re.compile(" @[^ ]* ") 124 | re_spaces= re.compile("\w+]") 125 | df['text']= df['text'].apply(lambda x: re_spaces.sub(" ",re_attags.sub(" ", " "+x+" "))[1:-1]) 126 | df= df.drop_duplicates(subset=['text']) 127 | df.index= df['id']= range(df.shape[0]) 128 | 129 | non_alphanums=re.compile('[^A-Za-z]+') 130 | def normalize_text(text): return non_alphanums.sub(' ', text).lower().strip() 131 | df['text_normalized']= df['text'].map(lambda x: normalize_text(x)) 132 | 133 | import wordbag_regressor 134 | print("Train wordbag regressor") 135 | wb_regressor= WordbagRegressor("", "../../../data/tripadvisor/json") 136 | df['wordbag_score']= wb_regressor.predict(df['text'].values) 137 | print(df['wordbag_score'].value_counts()) -------------------------------------------------------------------------------- /scripts/wordhash_regressor.py: -------------------------------------------------------------------------------- 1 | from __future__ import with_statement 2 | from __future__ import division 3 | from __future__ import absolute_import 4 | from __future__ import print_function 5 | import os 6 | import re 7 | import json 8 | import gzip 9 | import scipy.sparse as ssp 10 | from wordbatch.pipelines import WordBatch 11 | from wordbatch.extractors import WordHash 12 | from wordbatch.models import FM_FTRL 13 | from wordbatch.transformers import Tokenizer 14 | import threading 15 | import multiprocessing 16 | import sys 17 | if sys.version_info.major == 3: 18 | import pickle as pkl 19 | else: 20 | import cPickle as pkl 21 | 22 | non_alphanums = re.compile('[\W+]') 23 | nums_re= re.compile("\W*[0-9]+\W*") 24 | triples_re= re.compile(r"(\w)\1{2,}") 25 | trash_re= [re.compile("<[^>]*>"), re.compile("[^a-z0-9' -]+"), re.compile(" [.0-9'-]+ "), re.compile("[-']{2,}"), 26 | re.compile(" '"),re.compile(" +")] 27 | from nltk.stem.porter import PorterStemmer 28 | stemmer= PorterStemmer() 29 | def normalize_text(text): 30 | text= text.lower() 31 | text= nums_re.sub(" NUM ", text) 32 | text= " ".join([word for word in non_alphanums.sub(" ",text).strip().split() if len(word)>1]) 33 | return text 34 | 35 | class BatchData(object): 36 | def __init__(self): 37 | self.texts= None 38 | 39 | class WordhashRegressor(object): 40 | def __init__(self, pickle_model="", datadir=None, batcher= None): 41 | self.wb= WordBatch(normalize_text, tokenizer= Tokenizer(stemmer=stemmer), extractor=WordHash( 42 | decode_error='ignore', n_features=2 ** 25, ngram_range=(1,2), norm='l2'), batcher= batcher) 43 | self.clf = FM_FTRL(D=2 ** 25, D_fm= 4, iters=1, inv_link="identity", threads= multiprocessing.cpu_count()//2) 44 | if datadir==None: (self.wb, self.clf)= pkl.load(gzip.open(pickle_model, 'rb')) 45 | else: self.train(datadir, pickle_model) 46 | 47 | def transform_batch(self, texts, batch_data): 48 | batch_data.texts= self.wb.fit_transform(texts, reset= False) 49 | 50 | def train(self, datadir, pickle_model=""): 51 | texts= [] 52 | labels= [] 53 | training_data= os.listdir(datadir) 54 | rcount= 0 55 | 
texts2= [] 56 | batchsize= 100000 57 | 58 | batch_data = BatchData() 59 | p_input= None 60 | for jsonfile in training_data: 61 | with open(datadir + "/" + jsonfile, 'r') as inputfile: 62 | for line in inputfile: 63 | # if rcount > 1000000: break 64 | try: line = json.loads(line.strip()) 65 | except: continue 66 | for review in line["Reviews"]: 67 | rcount+= 1 68 | if rcount % 100000 == 0: print(rcount) 69 | if rcount % 9 != 0: continue 70 | if "Overall" not in review["Ratings"]: continue 71 | texts.append(review["Content"]) 72 | labels.append((float(review["Ratings"]["Overall"]) - 3) * 0.5) 73 | if len(texts) % batchsize == 0: 74 | if p_input != None: 75 | p_input.join() 76 | texts2.append(batch_data.texts) 77 | p_input = threading.Thread(target=self.transform_batch, args=(texts, batch_data)) 78 | p_input.start() 79 | texts= [] 80 | if p_input != None: 81 | p_input.join() 82 | texts2.append(batch_data.texts) 83 | texts2.append(self.wb.fit_transform(texts, reset= False)) 84 | del (texts) 85 | if len(texts2) == 1: texts= texts2[0] 86 | else: texts= ssp.vstack(texts2) 87 | 88 | self.wb.dictionary_freeze = True 89 | 90 | self.clf.fit(texts, labels) 91 | if pickle_model != "": 92 | with gzip.open(pickle_model, 'wb') as model_file: 93 | backend= self.wb.batcher.backend 94 | backend_handle= self.wb.batcher.backend_handle 95 | self.wb.batcher.backend= "serial" 96 | self.wb.batcher.backend_handle = None 97 | pkl.dump((self.wb, self.clf), model_file, protocol=2) 98 | self.wb.batcher.backend = backend 99 | self.wb.batcher.backend_handle = backend_handle 100 | 101 | def predict(self, texts): 102 | counts= self.wb.transform(texts) 103 | return self.clf.predict(counts) -------------------------------------------------------------------------------- /scripts/wordseq_regressor.py: -------------------------------------------------------------------------------- 1 | from __future__ import with_statement 2 | from __future__ import division 3 | from __future__ import absolute_import 4 | from __future__ import print_function 5 | import pickle as pkl 6 | import gzip 7 | import re 8 | import os 9 | import json 10 | import scipy as sp 11 | import numpy as np 12 | from tensorflow.keras.layers import * 13 | from tensorflow.keras.models import Sequential 14 | from wordbatch.pipelines import WordBatch 15 | from wordbatch.extractors import WordSeq 16 | from wordbatch.transformers import Dictionary 17 | import random 18 | import threading 19 | from tensorflow.keras.models import load_model 20 | import tensorflow as tf 21 | import sys 22 | 23 | non_alphas = re.compile('[^A-Za-z\'-]+') 24 | trash_re= [re.compile("<[^>]*>"), re.compile("[^a-z0-9' -]+"), re.compile(" [.0-9'-]+ "), 25 | re.compile("[-']{2,}"),re.compile(" '"),re.compile(" +")] 26 | 27 | def normalize_text(text): 28 | text= text.lower() 29 | for x in trash_re: 30 | while x.search(text) != None: text = x.sub(" ", text) 31 | return non_alphas.sub(' ', text).strip() 32 | 33 | class BatchData(object): 34 | def __init__(self): 35 | self.texts= None 36 | self.labels= None 37 | 38 | class WordseqRegressor(): 39 | def __init__(self, pickle_model="", datadir=None, batcher= None): 40 | seed = 10002 41 | os.environ['PYTHONHASHSEED'] = str(seed) 42 | np.random.seed(seed + 1) 43 | random.seed(seed + 2) 44 | tf.random.set_seed(seed+3) 45 | 46 | self.maxlen = 200 47 | self.max_words = 20000 48 | self.wb= WordBatch(normalize_text, dictionary=Dictionary(max_words=self.max_words), 49 | extractor=WordSeq(seq_maxlen=self.maxlen), batcher=batcher) 50 | self.model = Sequential() 
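# Model architecture built below: token-id sequences of length maxlen are embedded
# into 20-dimensional vectors, passed through a ReLU 1D convolution, regularized with
# dropout and batch normalization, max-pooled over the sequence dimension, and
# projected to a single regression output trained with mean squared error.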
51 | self.model.add(Embedding(self.max_words+2, 20, input_length=self.maxlen)) 52 | self.model.add(Conv1D(activation="relu", padding="same", strides=1, filters=10, kernel_size=3)) 53 | self.model.add(Dropout(0.5)) 54 | self.model.add(BatchNormalization()) 55 | self.model.add(GlobalMaxPooling1D()) 56 | self.model.add(Dense(1)) 57 | self.model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mean_squared_error']) 58 | if datadir == None: 59 | self.model= load_model(pickle_model) 60 | self.wb= pkl.load(gzip.open(pickle_model + ".wb", 'rb')) 61 | else: self.train(datadir, pickle_model) 62 | 63 | def transform_batch(self, texts, batch_data): 64 | batch_data.texts= self.wb.fit_transform(texts, reset= False) 65 | 66 | def train(self, datadir, pickle_model=""): 67 | texts= [] 68 | labels= [] 69 | training_data = os.listdir(datadir) 70 | rcount= 0 71 | texts2= [] 72 | batchsize= 100000 73 | 74 | batch_data = BatchData() 75 | p_input= None 76 | for jsonfile in training_data: 77 | with open(datadir + "/" + jsonfile, 'r') as inputfile: 78 | for line in inputfile: 79 | #if rcount > 1000000: break 80 | try: line= json.loads(line.strip()) 81 | except: continue 82 | for review in line["Reviews"]: 83 | rcount+= 1 84 | if rcount % 100000 == 0: print(rcount) 85 | if rcount % 8 != 0: continue 86 | if "Overall" not in review["Ratings"]: continue 87 | texts.append(review["Content"]) 88 | labels.append((float(review["Ratings"]["Overall"]) - 3) *0.5) 89 | if len(texts) % batchsize == 0: 90 | if p_input != None: 91 | p_input.join() 92 | texts2.append(batch_data.texts) 93 | p_input = threading.Thread(target=self.transform_batch, args=(texts, batch_data)) 94 | p_input.start() 95 | texts= [] 96 | if p_input != None: 97 | p_input.join() 98 | texts2.append(batch_data.texts) 99 | texts2.append(self.wb.partial_fit_transform(texts)) 100 | del(texts) 101 | texts= sp.vstack(texts2) 102 | self.wb.dictionary_freeze = True 103 | test= (np.array(texts[-1000:]), np.array(labels[-1000:])) 104 | train = (np.array(texts[:-1000]), np.array(labels[:-1000])) 105 | 106 | self.model.fit(train[0], train[1], batch_size=2048, epochs=2, validation_data=(test[0], test[1])) 107 | if pickle_model != "": 108 | self.model.save_weights(pickle_model) 109 | backend = self.wb.batcher.backend 110 | backend_handle = self.wb.batcher.backend_handle 111 | self.wb.batcher.backend = "serial" 112 | self.wb.batcher.backend_handle = None 113 | with gzip.open(pickle_model + ".wb", 'wb') as model_file: pkl.dump(self.wb, model_file, protocol=2) 114 | self.wb.batcher.backend = backend 115 | self.wb.batcher.backend_handle = backend_handle 116 | 117 | def predict_batch(self, texts): 118 | results= [x[0] for x in self.model.predict(np.array(self.wb.transform(texts)))] 119 | return results 120 | -------------------------------------------------------------------------------- /scripts/wordvec_regressor.py: -------------------------------------------------------------------------------- 1 | from __future__ import with_statement 2 | from __future__ import division 3 | from __future__ import absolute_import 4 | from __future__ import print_function 5 | import os 6 | import re 7 | import json 8 | import gzip 9 | from wordbatch.pipelines import WordBatch 10 | from wordbatch.models import FTRL 11 | from wordbatch.extractors import WordVec, Hstack 12 | from sklearn.utils import shuffle 13 | import threading 14 | import sys 15 | if sys.version_info.major == 3: 16 | import pickle as pkl 17 | else: 18 | import cPickle as pkl 19 | 20 | non_alphanums = 
re.compile('[\W+]') 21 | nums_re= re.compile("\W*[0-9]+\W*") 22 | trash_re= [re.compile("<[^>]*>"), re.compile("[^a-z0-9' -]+"), re.compile(" [.0-9'-]+ "), re.compile("[-']{2,}"), 23 | re.compile(" '"),re.compile(" +")] 24 | 25 | def normalize_text(text): 26 | text= text.lower() 27 | text= nums_re.sub(" NUM ", text) 28 | text= " ".join([word for word in non_alphanums.sub(" ",text).strip().split() if len(word)>1]) 29 | return text 30 | 31 | class WordvecRegressor(object): 32 | def __init__(self, pickle_model="", datadir=None, batcher=None): 33 | self.wb= WordBatch(normalize_text, extractor=Hstack([ 34 | WordVec(wordvec_file="../../../data/word2vec/glove.twitter.27B.100d.txt.gz", normalize_text=normalize_text, 35 | encoding="utf8"), 36 | WordVec(wordvec_file="../../../data/word2vec/glove.6B.50d.txt.gz", normalize_text=normalize_text, 37 | encoding="utf8")])) 38 | # from wordbatch.pipelines import FeatureUnion 39 | # from wordbatch.transformers import Dictionary, TextNormalizer 40 | # from sklearn.pipeline import Pipeline 41 | # tn= TextNormalizer(normalize_text=normalize_text) 42 | # dct= Dictionary() 43 | # vec1= WordVec(wordvec_file="../../../data/word2vec/glove.twitter.27B.100d.txt.gz", 44 | # normalize_text=normalize_text, encoding="utf8", dictionary= dct) 45 | # vec2= WordVec(wordvec_file="../../../data/word2vec/glove.6B.50d.txt.gz", 46 | # normalize_text=normalize_text, encoding="utf8", dictionary= dct) 47 | # self.wb = Pipeline(steps= [("tn", tn), ("dct", dct), ("vecs", FeatureUnion([("vec1", vec1), ("vec2", vec2)]))]) 48 | self.batcher= batcher 49 | 50 | self.clf= FTRL(alpha=1.0, beta=1.0, L1=0.00001, L2=1.0, D=100+50, iters=1, inv_link= "identity") 51 | 52 | if datadir==None: (self.wb, self.clf)= pkl.load(gzip.open(pickle_model, 'rb')) 53 | else: self.train(datadir, pickle_model) 54 | 55 | def fit_batch(self, texts, labels, rcount): 56 | texts, labels = shuffle(texts, labels) 57 | print("Transforming", rcount) 58 | #texts= self.wb.fit_transform(texts, tn__batcher=self.batcher, dct__reset= False, dct__batcher= self.batcher) 59 | texts = self.wb.fit_transform(texts) 60 | print("Training", rcount) 61 | self.clf.fit(texts, labels, reset= False) 62 | 63 | def train(self, datadir, pickle_model=""): 64 | texts= [] 65 | labels= [] 66 | training_data= os.listdir(datadir) 67 | rcount= 0 68 | batchsize= 80000 69 | 70 | p= None 71 | for jsonfile in training_data: 72 | with open(datadir + "/" + jsonfile, 'r') as inputfile: 73 | for line in inputfile: 74 | #if rcount > 1000000: break 75 | try: line= json.loads(line.strip()) 76 | except: continue 77 | for review in line["Reviews"]: 78 | rcount+= 1 79 | if rcount % 100000 == 0: print(rcount) 80 | if rcount % 6 != 0: continue 81 | if "Overall" not in review["Ratings"]: continue 82 | texts.append(review["Content"]) 83 | labels.append((float(review["Ratings"]["Overall"]) - 3) *0.5) 84 | if len(texts) % batchsize == 0: 85 | if p != None: p.join() 86 | p= threading.Thread(target=self.fit_batch, args=(texts, labels, rcount)) 87 | p.start() 88 | texts= [] 89 | labels= [] 90 | if p != None: p.join() 91 | self.fit_batch(texts, labels, rcount) 92 | 93 | # if pickle_model!="": 94 | # with gzip.open(pickle_model, 'wb') as model_file: 95 | # backend = self.wb.batcher.backend 96 | # backend_handle = self.wb.batcher.backend_handle 97 | # self.wb.batcher.backend = "serial" 98 | # self.wb.batcher.backend_handle = None 99 | # pkl.dump((self.wb, self.clf), model_file, protocol=2) 100 | # self.wb.batcher.backend = backend 101 | # self.wb.batcher.backend_handle = 
backend_handle 102 | 103 | def predict(self, texts): 104 | vecs= self.wb.transform(texts) 105 | return self.clf.predict(vecs) -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [egg_info] 2 | tag_build = 3 | tag_date = 0 4 | tag_svn_revision = 0 5 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from setuptools import setup, Extension 3 | from Cython.Distutils import build_ext 4 | import numpy 5 | import os 6 | 7 | if os.name == 'nt': 8 | extra_compile_args = ["/openmp", "/Ox", "/arch:AVX2", "/fp:fast"] 9 | extra_link_args = [] 10 | else: 11 | extra_compile_args = ["-O3", "-fopenmp", "-ffast-math", "-mavx2", "-ftree-vectorize", "-std=gnu11"] 12 | extra_link_args = ["-fopenmp"] 13 | 14 | setup( 15 | name='Wordbatch', 16 | version='1.4.9', 17 | description='Python library for distributed AI processing pipelines, using swappable scheduler backends', 18 | url='https://github.com/anttttti/Wordbatch', 19 | author='Antti Puurula', 20 | author_email='antti.puurula@yahoo.com', 21 | packages=['wordbatch', 22 | 'wordbatch.pipelines', 23 | 'wordbatch.extractors', 24 | 'wordbatch.models', 25 | 'wordbatch.transformers' 26 | ], 27 | license='GNU GPL 2.0', 28 | classifiers=[ 29 | "Development Status :: 4 - Beta", 30 | "Intended Audience :: Developers", 31 | "License :: OSI Approved :: GNU General Public License v2 (GPLv2)", 32 | "Programming Language :: Python :: 3.6", 33 | "Programming Language :: Python :: 3.7", 34 | "Programming Language :: Python :: 3.8", 35 | "Programming Language :: Python :: 3.9", 36 | "Programming Language :: Python :: 3.10", 37 | "Programming Language :: Cython", 38 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 39 | "Topic :: Software Development :: Libraries :: Python Modules", 40 | ], 41 | install_requires=['Cython', 'scikit-learn', 'python-Levenshtein', 'py-lz4framed', 'randomgen==1.21.2', 'numpy', 42 | 'scipy', 'pandas', 'wheel>=0.33.4'], 43 | extras_require={'dev': ['nltk', 'textblob', 'keras', 'pyspark', 'dask', 'distributed', 'ray']}, 44 | 45 | cmdclass= {'build_ext': build_ext}, 46 | ext_modules= [ 47 | Extension("wordbatch.extractors.extractors", 48 | ["wordbatch/extractors/extractors.pyx", "wordbatch/extractors/MurmurHash3.cpp"], 49 | libraries= [], 50 | include_dirs=[numpy.get_include(), '.'], 51 | extra_compile_args = extra_compile_args, 52 | extra_link_args=extra_link_args), 53 | Extension("wordbatch.models.ftrl", 54 | ["wordbatch/models/ftrl.pyx"], 55 | libraries=[], 56 | include_dirs=[numpy.get_include(), '.'], 57 | extra_compile_args=extra_compile_args, 58 | extra_link_args=extra_link_args), 59 | Extension("wordbatch.models.ftrl32", 60 | ["wordbatch/models/ftrl32.pyx"], 61 | libraries=[], 62 | include_dirs=[numpy.get_include(), '.'], 63 | extra_compile_args=extra_compile_args, 64 | extra_link_args=extra_link_args), 65 | Extension("wordbatch.models.fm_ftrl", 66 | ["wordbatch/models/fm_ftrl.pyx", "wordbatch/models/avx_ext.c"], 67 | libraries= [], 68 | include_dirs=[numpy.get_include(), '.'], 69 | extra_compile_args = extra_compile_args, 70 | extra_link_args=extra_link_args), 71 | Extension("wordbatch.models.nn_relu_h1", 72 | ["wordbatch/models/nn_relu_h1.pyx"], 73 | libraries= [], 74 | include_dirs=[numpy.get_include(), '.'], 75 | extra_compile_args = extra_compile_args, 
76 | extra_link_args=extra_link_args), 77 | Extension("wordbatch.models.nn_relu_h2", 78 | ["wordbatch/models/nn_relu_h2.pyx"], 79 | libraries= [], 80 | include_dirs=[numpy.get_include(), '.'], 81 | extra_compile_args = extra_compile_args, 82 | extra_link_args=extra_link_args), 83 | ] 84 | ) 85 | -------------------------------------------------------------------------------- /wordbatch/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | PACKAGE_DIR = os.path.dirname(os.path.abspath(__file__)) 3 | __version__ = '1.4.9' 4 | 5 | -------------------------------------------------------------------------------- /wordbatch/batcher.py: -------------------------------------------------------------------------------- 1 | #!python 2 | from __future__ import with_statement 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from __future__ import print_function 6 | import multiprocessing 7 | from contextlib import closing 8 | import scipy.sparse as ssp 9 | import random 10 | import pandas as pd 11 | from math import ceil 12 | 13 | 14 | class Batcher(object): 15 | """Scheduler to handle parallel jobs on minibatches 16 | 17 | Parameters 18 | ---------- 19 | procs: int 20 | Number of process(es)/thread(s) for executing task in parallel. Used for multiprocessing, threading and Loky 21 | 22 | minibatch_size: int 23 | Expected size of each minibatch 24 | 25 | backend: {'serial', 'multiprocessing', 'threading', 'loky', 'spark', 'dask', 'ray'} 26 | Backend for computing the tasks 27 | 28 | - 'serial' sequential execution without a backend scheduler 29 | 30 | - 'multiprocessing' Python standard multiprocessing library 31 | 32 | - 'threading' Python standard threading library 33 | 34 | - 'loky' Loky fork of multiprocessing library 35 | 36 | - 'spark' PySpark local or distributed execution 37 | 38 | - 'dask' Dask Distributed local or distributed execution 39 | 40 | - 'ray' Ray local or distributed execution 41 | 42 | task_num_cpus: int 43 | Number of CPUs to reserve per minibatch task for Ray 44 | 45 | task_num_gpus: int 46 | Number of GPUs to reserve per minibatch task for Ray 47 | 48 | backend_handle: object 49 | Backend handle for sending tasks 50 | 51 | verbose: int 52 | Verbosity level. Setting verbose > 0 will display additional information depending on the specific level set. 
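	Example
	-------
	A minimal usage sketch (parameter values here are illustrative, not defaults)::

		from wordbatch.batcher import Batcher

		batcher = Batcher(procs=4, minibatch_size=5000, backend="multiprocessing")
		batches = batcher.split_batches(list(range(100000)), minibatch_size=5000)
		merged = batcher.merge_batches(batches)  # flatten the minibatches back into one list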
53 | """ 54 | def __init__(self, procs= 0, minibatch_size= 20000, backend_handle= None, backend= "multiprocessing", 55 | task_num_cpus= 1, task_num_gpus= 0, verbose= 0): 56 | if procs == 0 or procs is None: procs= multiprocessing.cpu_count() 57 | self.procs= procs 58 | self.verbose= verbose 59 | self.minibatch_size= minibatch_size 60 | self.backend_handle= backend_handle 61 | self.backend= backend 62 | self.task_num_cpus = task_num_cpus 63 | self.task_num_gpus = task_num_gpus 64 | 65 | def list2indexedrdd(self, lst, minibatch_size=0): 66 | if minibatch_size==0: minibatch_size= self.minibatch_size 67 | start= 0; len_data= len(lst); batch_count= 0 68 | batches= [] 69 | while start < len_data: 70 | batches.append([batch_count]+[lst[start:start + minibatch_size]]) 71 | start+= minibatch_size 72 | batch_count+= 1 73 | return self.backend_handle.parallelize(batches) 74 | 75 | def indexedrdd2list(self, indexedrdd, sort= True): 76 | batches= indexedrdd.collect() 77 | if sort: batches= sorted(batches) 78 | return [batch[1] for batch in batches] 79 | 80 | def split_batches(self, data, minibatch_size= None, backend= None): 81 | """Split data into minibatches with a specified size 82 | 83 | Parameters 84 | ---------- 85 | data: iterable and indexable 86 | List-like data to be split into batches. Includes backend_handleipy matrices and Pandas DataFrames. 87 | 88 | minibatch_size: int 89 | Expected sizes of minibatches split from the data. 90 | 91 | backend: object 92 | Backend to use, instead of the Batcher backend attribute 93 | 94 | Returns 95 | ------- 96 | data_split: list 97 | List of minibatches, each entry is a list-like object representing the data subset in a batch. 98 | """ 99 | if minibatch_size is None: minibatch_size= self.minibatch_size 100 | if backend is None: backend= self.backend 101 | if isinstance(data, list) or isinstance(data, tuple) or isinstance(data, dict): len_data= len(data) 102 | else: len_data= data.shape[0] 103 | if backend=="spark": return self.list2indexedrdd(data, minibatch_size) 104 | if isinstance(data,pd.DataFrame): 105 | data= [data.iloc[x * minibatch_size:(x + 1) * minibatch_size] for x in 106 | range(int(ceil(len_data / minibatch_size)))] 107 | elif isinstance(data, dict): 108 | data = [dict(list(data.items())[x * minibatch_size:min(len_data, (x + 1) * minibatch_size)]) 109 | for x in range(int(ceil(len_data / minibatch_size)))] 110 | else: 111 | data= [data[x* minibatch_size:min(len_data, (x+1)*minibatch_size)] 112 | for x in range(int(ceil(len_data/minibatch_size)))] 113 | ###if backend=="dask": return self.backend_handle.scatter(data) 114 | return data 115 | 116 | def collect_batches(self, data, backend= None, sort= True): 117 | if backend is None: backend= self.backend 118 | if backend == "spark": data= self.indexedrdd2list(data, sort) 119 | if backend == "dask": data = self.backend_handle.gather(data) 120 | return data 121 | 122 | def merge_batches(self, data): 123 | """Merge a list of data minibatches into one single instance representing the data 124 | 125 | Parameters 126 | ---------- 127 | data: list 128 | List of minibatches to merge 129 | 130 | Returns 131 | ------- 132 | (anonymous): sparse matrix | pd.DataFrame | list 133 | Single complete list-like data merged from given batches 134 | """ 135 | if isinstance(data[0], ssp.csr_matrix): return ssp.vstack(data) 136 | if isinstance(data[0], pd.DataFrame) or isinstance(data[0], pd.Series): return pd.concat(data) 137 | return [item for sublist in data for item in sublist] 138 | 139 | def 
process_batches(self, task, data, args, backend=None, backend_handle=None, input_split=False, 140 | merge_output= True, minibatch_size= None, procs=None, task_num_cpus= None, 141 | task_num_gpus= None, verbose= None): 142 | """ 143 | 144 | Parameters 145 | ---------- 146 | task: function 147 | Function to apply on each minibatch with other specified arguments 148 | 149 | data: list-like 150 | Samples to split into minibatches and apply the specified function on 151 | 152 | args: list 153 | Arguments to pass to the specified function following the mini-batch 154 | 155 | input_split: boolean, default False 156 | If True, input data is already mapped into minibatches, otherwise data will be split on call. 157 | 158 | merge_output: boolean, default True 159 | If True, results from minibatches will be reduced into one single instance before return. 160 | 161 | procs: int 162 | Number of process(es)/thread(s) for executing task in parallel. Used for multiprocessing, threading, 163 | Loky and Ray 164 | 165 | minibatch_size: int 166 | Expected size of each minibatch 167 | 168 | backend: {'serial', 'multiprocessing', 'threading', 'loky', 'spark', 'dask', 'ray'} 169 | Backend for computing the tasks 170 | 171 | - 'serial' sequential execution without a backend scheduler 172 | 173 | - 'multiprocessing' Python standard multiprocessing library 174 | 175 | - 'threading' Python standard threading library 176 | 177 | - 'loky' Loky fork of multiprocessing library 178 | 179 | - 'spark' PySpark local or distributed execution 180 | 181 | - 'dask' Dask Distributed local or distributed execution 182 | 183 | - 'ray' Ray local or distributed execution 184 | 185 | backend_handle: object 186 | Backend handle for sending tasks 187 | 188 | task_num_cpus: int 189 | Number of CPUs to reserve per minibatch task for Ray 190 | 191 | task_num_gpus: int 192 | Number of GPUs to reserve per minibatch task for Ray 193 | 194 | verbose: int 195 | Verbosity level. Setting verbose > 0 will display additional information depending on the specific level set. 196 | 197 | Returns 198 | ------- 199 | results: list-like | list of list-like 200 | If merge_output is specified as True, this will be a list-like object representing 201 | the dataset, with each entry as a sample. Otherwise this will be a list of list-like 202 | objects, with each entry representing the results from a minibatch. 
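	Example
	-------
	A minimal sketch of the calling convention: the task receives a single list whose
	first element is the minibatch and whose remaining elements are the extra ``args``
	(``texts`` below is assumed to be a list of strings)::

		def count_long_words(params):
			texts_batch, min_len = params
			return [len([w for w in t.split() if len(w) >= min_len]) for t in texts_batch]

		batcher = Batcher(minibatch_size=1000, backend="serial")
		counts = batcher.process_batches(count_long_words, texts, [4])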
203 | """ 204 | if procs is None: procs= self.procs 205 | if backend is None: backend= self.backend 206 | if backend_handle is None: backend_handle = self.backend_handle 207 | if task_num_cpus is None: task_num_cpus = self.task_num_cpus 208 | if task_num_gpus is None: task_num_gpus = self.task_num_gpus 209 | if verbose is None: verbose= self.verbose 210 | if verbose > 1: 211 | print("Task:", task, " backend:", backend, " backend_handle:", backend_handle, " procs:", 212 | procs, " input_split:", input_split, " merge_output:", merge_output) 213 | 214 | if verbose> 10: 215 | print("len(data):", len(data), "len(args):", len(args), "[type(x) for x in data]:", 216 | [type(x) for x in data], "[type(x) for x in args]:", [type(x) for x in args]) 217 | 218 | if not(input_split): 219 | if backend=="spark": 220 | paral_params= self.split_batches(data, minibatch_size, backend="spark") 221 | else: 222 | paral_params= [[data_batch]+ args for data_batch in self.split_batches(data, minibatch_size)] 223 | else: 224 | if backend!="spark": paral_params= [[data_batch]+ args for data_batch in data] 225 | else: paral_params= data 226 | if verbose > 1: print("Start task, len(paral_params)", len(paral_params)) 227 | if backend == "serial": 228 | results = [task(minibatch) for minibatch in paral_params] 229 | else: 230 | if backend=="multiprocessing": 231 | with closing(multiprocessing.Pool(max(1, procs), maxtasksperchild=2)) as pool: 232 | results = pool.map_async(task, paral_params) 233 | pool.close() 234 | pool.join() 235 | results= results.get() 236 | elif backend=="threading": 237 | with closing(multiprocessing.dummy.Pool(max(1,procs))) as pool: 238 | results= pool.map(task, paral_params) 239 | pool.close() 240 | pool.join() 241 | if backend=="loky": 242 | from loky import get_reusable_executor 243 | pool= get_reusable_executor(max_workers=max(1, procs)) 244 | results= list(pool.map(task, paral_params)) 245 | elif backend == "dask": 246 | ###if not (input_split): data= self.scatter(data) 247 | results = [self.backend_handle.submit(task, params) for params in paral_params] 248 | elif backend == "spark": 249 | def apply_func_to_indexedrdd(batch): 250 | return [batch[0]] + [task([batch[1]] + args)] 251 | results = paral_params.map(apply_func_to_indexedrdd) 252 | elif backend == "ray": 253 | @self.backend_handle.remote(num_cpus=task_num_cpus, num_gpus=task_num_gpus) 254 | def f_ray(f, data): 255 | return f(data) 256 | results = [f_ray.remote(task, paral_params.pop(0)) for _ in range(min(len(paral_params), self.procs))] 257 | uncompleted = results 258 | while (len(paral_params) > 0): 259 | # More tasks than available processors. Queue the task calls 260 | done, remaining = self.backend_handle.wait(uncompleted, timeout=60, fetch_local=False) 261 | if len(done) == 0: continue 262 | done= done[0] 263 | uncompleted = [x for x in uncompleted if x != done] 264 | if len(remaining) > 0: 265 | new = f_ray.remote(task, paral_params.pop(0)) 266 | uncompleted.append(new) 267 | results.append(new) 268 | results = [self.backend_handle.get(x) for x in results] 269 | #ppft currently not supported. 
Supporting arbitrary tasks requires modifications to passed arguments 270 | #elif backend == "ppft": 271 | # jobs = [self.backend_handle.submit(task, (x,), (), ()) for x in paral_params] 272 | # results = [x() for x in jobs] 273 | 274 | if merge_output: return self.merge_batches(self.collect_batches(results, backend=backend)) 275 | if verbose > 2: 276 | print("Task:", task, " backend:", backend, " backend_handle:", backend_handle, " completed") 277 | return results 278 | 279 | def shuffle_batch(self, texts, labels= None, seed= None): 280 | """Shuffle a list of samples, as well as the labels if specified 281 | 282 | Parameters 283 | ---------- 284 | texts: list-like 285 | List of samples to shuffle 286 | 287 | labels: list-like (optional) 288 | List of labels to shuffle, should be correspondent to the samples given 289 | 290 | seed: int 291 | The seed of the pseudo random number generator to use for shuffling 292 | 293 | Returns 294 | ------- 295 | texts: list 296 | List of shuffled samples (texts parameters) 297 | 298 | labels: list (optional) 299 | List of shuffled labels. This will only be returned when non-None 300 | labels is passed 301 | """ 302 | if seed != None: random.seed(seed) 303 | index_shuf= list(range(len(texts))) 304 | random.shuffle(index_shuf) 305 | texts= [texts[x] for x in index_shuf] 306 | if labels == None: return texts 307 | labels= [labels[x] for x in index_shuf] 308 | return texts, labels 309 | 310 | def __getstate__(self): 311 | return dict((k, v) for (k, v) in self.__dict__.items()) 312 | 313 | def __setstate__(self, params): 314 | for key in params: setattr(self, key, params[key]) 315 | -------------------------------------------------------------------------------- /wordbatch/data_utils.py: -------------------------------------------------------------------------------- 1 | import randomgen 2 | import numpy as np 3 | import time 4 | import multiprocessing 5 | from contextlib import contextmanager 6 | from functools import partial 7 | from multiprocessing.pool import ThreadPool 8 | import itertools 9 | import scipy.sparse as ssp 10 | 11 | # @contextmanager 12 | # def timer(name): 13 | # t0 = time.time() 14 | # yield 15 | # print(name + " done in " + str(time.time() - t0) + "s") 16 | # 17 | # def shuffle(*objects, seed=0): 18 | # #Faster than inplace, but uses more memory 19 | # if isinstance(objects[0], ssp.base.spmatrix): lenn= objects[0].shape[0] 20 | # else: lenn= len(objects[0]) 21 | # shuffled= randomgen.xoroshiro128.Xoroshiro128(seed).generator.permutation(lenn) 22 | # return [[x[z] for z in shuffled] if type(x)==list else x[shuffled] for x in objects] 23 | # 24 | # def inplace_shuffle(*objects, seed=0): 25 | # #Slower than shuffle, but uses no extra memory 26 | # rand = randomgen.xoroshiro128.Xoroshiro128(seed).generator 27 | # for x in objects: 28 | # rand.seed(seed) 29 | # rand.shuffle(x) 30 | # 31 | # def inplace_shuffle_threaded(*objects, threads= 0, seed=0): 32 | # #Faster than inplace for very large array sizes, > 10000000 33 | # if threads== 0: threads= min(len(objects), multiprocessing.cpu_count()) 34 | # with ThreadPool(processes=threads) as pool: 35 | # pool.map(partial(inplace_shuffle, seed=seed), objects) 36 | 37 | def indlist2csrmatrix(indlist, datalist= None, shape= None): 38 | #Convert a list of indicator lists to a scipy.sparse.csr_matrix 39 | indptr= [0] 40 | c= 0 41 | for x in indlist: 42 | c+= len(x) 43 | indptr.append(c) 44 | indices = list(itertools.chain.from_iterable(indlist)) 45 | if datalist is not None: 46 | data= 
list(itertools.chain.from_iterable(datalist)) 47 | else: 48 | data= np.ones(len(indices), dtype=np.float64) 49 | if shape==None: shape= (len(indlist), max(indices)+1) 50 | X= ssp.csr_matrix((data, indices, indptr), shape=shape) 51 | return X 52 | 53 | # x= np.array(range(10000000)) 54 | # y= np.array(range(10000000)) 55 | # 56 | # print(x) 57 | # print(y) 58 | # 59 | # with timer('shuffle'): 60 | # for z in range(10): 61 | # x, y= shuffle(x,y) 62 | # print(x) 63 | # print(y) 64 | # 65 | # with timer('inplace_shuffle'): 66 | # for z in range(10): 67 | # inplace_shuffle(x,y) 68 | # print(x) 69 | # print(y) 70 | # 71 | # with timer('inplace_shuffle_threaded'): 72 | # for z in range(10): 73 | # inplace_shuffle_threaded(x,y) 74 | # print(x) 75 | # print(y) 76 | # 77 | # from sklearn.utils import shuffle as shuffle2 78 | # with timer('sklearn_shuffle'): 79 | # for z in range(10): 80 | # x, y= shuffle2(x,y) 81 | # print(x) 82 | # print(y) 83 | -------------------------------------------------------------------------------- /wordbatch/extractors/MurmurHash3.cpp: -------------------------------------------------------------------------------- 1 | //----------------------------------------------------------------------------- 2 | // MurmurHash3 was written by Austin Appleby, and is placed in the public 3 | // domain. The author hereby disclaims copyright to this source code. 4 | 5 | // Note - The x86 and x64 versions do _not_ produce the same results, as the 6 | // algorithms are optimized for their respective platforms. You can still 7 | // compile and run any of them on any platform, but your performance with the 8 | // non-native version will be less than optimal. 9 | 10 | #include "MurmurHash3.h" 11 | 12 | //----------------------------------------------------------------------------- 13 | // Platform-specific functions and macros 14 | 15 | // Microsoft Visual Studio 16 | 17 | #if defined(_MSC_VER) 18 | 19 | #define FORCE_INLINE __forceinline 20 | 21 | #include 22 | 23 | #define ROTL32(x,y) _rotl(x,y) 24 | #define ROTL64(x,y) _rotl64(x,y) 25 | 26 | #define BIG_CONSTANT(x) (x) 27 | 28 | // Other compilers 29 | 30 | #else // defined(_MSC_VER) 31 | 32 | #if defined(GNUC) && ((GNUC > 4) || (GNUC == 4 && GNUC_MINOR >= 4)) 33 | 34 | /* gcc version >= 4.4 4.1 = RHEL 5, 4.4 = RHEL 6. 
35 | * Don't inline for RHEL 5 gcc which is 4.1 */ 36 | #define FORCE_INLINE attribute((always_inline)) 37 | 38 | #else 39 | 40 | #define FORCE_INLINE 41 | 42 | #endif 43 | 44 | 45 | inline uint32_t rotl32 ( uint32_t x, int8_t r ) 46 | { 47 | return (x << r) | (x >> (32 - r)); 48 | } 49 | 50 | inline uint64_t rotl64 ( uint64_t x, int8_t r ) 51 | { 52 | return (x << r) | (x >> (64 - r)); 53 | } 54 | 55 | #define ROTL32(x,y) rotl32(x,y) 56 | #define ROTL64(x,y) rotl64(x,y) 57 | 58 | #define BIG_CONSTANT(x) (x##LLU) 59 | 60 | #endif // !defined(_MSC_VER) 61 | 62 | //----------------------------------------------------------------------------- 63 | // Block read - if your platform needs to do endian-swapping or can only 64 | // handle aligned reads, do the conversion here 65 | 66 | FORCE_INLINE uint32_t getblock ( const uint32_t * p, int i ) 67 | { 68 | return p[i]; 69 | } 70 | 71 | FORCE_INLINE uint64_t getblock ( const uint64_t * p, int i ) 72 | { 73 | return p[i]; 74 | } 75 | 76 | //----------------------------------------------------------------------------- 77 | // Finalization mix - force all bits of a hash block to avalanche 78 | 79 | FORCE_INLINE uint32_t fmix ( uint32_t h ) 80 | { 81 | h ^= h >> 16; 82 | h *= 0x85ebca6b; 83 | h ^= h >> 13; 84 | h *= 0xc2b2ae35; 85 | h ^= h >> 16; 86 | 87 | return h; 88 | } 89 | 90 | //---------- 91 | 92 | FORCE_INLINE uint64_t fmix ( uint64_t k ) 93 | { 94 | k ^= k >> 33; 95 | k *= BIG_CONSTANT(0xff51afd7ed558ccd); 96 | k ^= k >> 33; 97 | k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53); 98 | k ^= k >> 33; 99 | 100 | return k; 101 | } 102 | 103 | //----------------------------------------------------------------------------- 104 | 105 | void MurmurHash3_x86_32 ( const void * key, int len, 106 | uint32_t seed, void * out ) 107 | { 108 | const uint8_t * data = (const uint8_t*)key; 109 | const int nblocks = len / 4; 110 | 111 | uint32_t h1 = seed; 112 | 113 | uint32_t c1 = 0xcc9e2d51; 114 | uint32_t c2 = 0x1b873593; 115 | 116 | //---------- 117 | // body 118 | 119 | const uint32_t * blocks = (const uint32_t *)(data + nblocks*4); 120 | 121 | for(int i = -nblocks; i; i++) 122 | { 123 | uint32_t k1 = getblock(blocks,i); 124 | 125 | k1 *= c1; 126 | k1 = ROTL32(k1,15); 127 | k1 *= c2; 128 | 129 | h1 ^= k1; 130 | h1 = ROTL32(h1,13); 131 | h1 = h1*5+0xe6546b64; 132 | } 133 | 134 | //---------- 135 | // tail 136 | 137 | const uint8_t * tail = (const uint8_t*)(data + nblocks*4); 138 | 139 | uint32_t k1 = 0; 140 | 141 | switch(len & 3) 142 | { 143 | case 3: k1 ^= tail[2] << 16; 144 | case 2: k1 ^= tail[1] << 8; 145 | case 1: k1 ^= tail[0]; 146 | k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; 147 | }; 148 | 149 | //---------- 150 | // finalization 151 | 152 | h1 ^= len; 153 | 154 | h1 = fmix(h1); 155 | 156 | *(uint32_t*)out = h1; 157 | } 158 | 159 | //----------------------------------------------------------------------------- 160 | 161 | void MurmurHash3_x86_128 ( const void * key, const int len, 162 | uint32_t seed, void * out ) 163 | { 164 | const uint8_t * data = (const uint8_t*)key; 165 | const int nblocks = len / 16; 166 | 167 | uint32_t h1 = seed; 168 | uint32_t h2 = seed; 169 | uint32_t h3 = seed; 170 | uint32_t h4 = seed; 171 | 172 | uint32_t c1 = 0x239b961b; 173 | uint32_t c2 = 0xab0e9789; 174 | uint32_t c3 = 0x38b34ae5; 175 | uint32_t c4 = 0xa1e38b93; 176 | 177 | //---------- 178 | // body 179 | 180 | const uint32_t * blocks = (const uint32_t *)(data + nblocks*16); 181 | 182 | for(int i = -nblocks; i; i++) 183 | { 184 | uint32_t k1 = getblock(blocks,i*4+0); 185 
| uint32_t k2 = getblock(blocks,i*4+1); 186 | uint32_t k3 = getblock(blocks,i*4+2); 187 | uint32_t k4 = getblock(blocks,i*4+3); 188 | 189 | k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; 190 | 191 | h1 = ROTL32(h1,19); h1 += h2; h1 = h1*5+0x561ccd1b; 192 | 193 | k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; 194 | 195 | h2 = ROTL32(h2,17); h2 += h3; h2 = h2*5+0x0bcaa747; 196 | 197 | k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; 198 | 199 | h3 = ROTL32(h3,15); h3 += h4; h3 = h3*5+0x96cd1c35; 200 | 201 | k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; 202 | 203 | h4 = ROTL32(h4,13); h4 += h1; h4 = h4*5+0x32ac3b17; 204 | } 205 | 206 | //---------- 207 | // tail 208 | 209 | const uint8_t * tail = (const uint8_t*)(data + nblocks*16); 210 | 211 | uint32_t k1 = 0; 212 | uint32_t k2 = 0; 213 | uint32_t k3 = 0; 214 | uint32_t k4 = 0; 215 | 216 | switch(len & 15) 217 | { 218 | case 15: k4 ^= tail[14] << 16; 219 | case 14: k4 ^= tail[13] << 8; 220 | case 13: k4 ^= tail[12] << 0; 221 | k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; 222 | 223 | case 12: k3 ^= tail[11] << 24; 224 | case 11: k3 ^= tail[10] << 16; 225 | case 10: k3 ^= tail[ 9] << 8; 226 | case 9: k3 ^= tail[ 8] << 0; 227 | k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; 228 | 229 | case 8: k2 ^= tail[ 7] << 24; 230 | case 7: k2 ^= tail[ 6] << 16; 231 | case 6: k2 ^= tail[ 5] << 8; 232 | case 5: k2 ^= tail[ 4] << 0; 233 | k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; 234 | 235 | case 4: k1 ^= tail[ 3] << 24; 236 | case 3: k1 ^= tail[ 2] << 16; 237 | case 2: k1 ^= tail[ 1] << 8; 238 | case 1: k1 ^= tail[ 0] << 0; 239 | k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; 240 | }; 241 | 242 | //---------- 243 | // finalization 244 | 245 | h1 ^= len; h2 ^= len; h3 ^= len; h4 ^= len; 246 | 247 | h1 += h2; h1 += h3; h1 += h4; 248 | h2 += h1; h3 += h1; h4 += h1; 249 | 250 | h1 = fmix(h1); 251 | h2 = fmix(h2); 252 | h3 = fmix(h3); 253 | h4 = fmix(h4); 254 | 255 | h1 += h2; h1 += h3; h1 += h4; 256 | h2 += h1; h3 += h1; h4 += h1; 257 | 258 | ((uint32_t*)out)[0] = h1; 259 | ((uint32_t*)out)[1] = h2; 260 | ((uint32_t*)out)[2] = h3; 261 | ((uint32_t*)out)[3] = h4; 262 | } 263 | 264 | //----------------------------------------------------------------------------- 265 | 266 | void MurmurHash3_x64_128 ( const void * key, const int len, 267 | const uint32_t seed, void * out ) 268 | { 269 | const uint8_t * data = (const uint8_t*)key; 270 | const int nblocks = len / 16; 271 | 272 | uint64_t h1 = seed; 273 | uint64_t h2 = seed; 274 | 275 | uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5); 276 | uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f); 277 | 278 | //---------- 279 | // body 280 | 281 | const uint64_t * blocks = (const uint64_t *)(data); 282 | 283 | for(int i = 0; i < nblocks; i++) 284 | { 285 | uint64_t k1 = getblock(blocks,i*2+0); 286 | uint64_t k2 = getblock(blocks,i*2+1); 287 | 288 | k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; 289 | 290 | h1 = ROTL64(h1,27); h1 += h2; h1 = h1*5+0x52dce729; 291 | 292 | k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; 293 | 294 | h2 = ROTL64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5; 295 | } 296 | 297 | //---------- 298 | // tail 299 | 300 | const uint8_t * tail = (const uint8_t*)(data + nblocks*16); 301 | 302 | uint64_t k1 = 0; 303 | uint64_t k2 = 0; 304 | 305 | switch(len & 15) 306 | { 307 | case 15: k2 ^= uint64_t(tail[14]) << 48; 308 | case 14: k2 ^= uint64_t(tail[13]) << 40; 309 | case 13: k2 ^= uint64_t(tail[12]) << 32; 310 | case 12: k2 ^= uint64_t(tail[11]) << 24; 311 | case 11: k2 ^= 
uint64_t(tail[10]) << 16; 312 | case 10: k2 ^= uint64_t(tail[ 9]) << 8; 313 | case 9: k2 ^= uint64_t(tail[ 8]) << 0; 314 | k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; 315 | 316 | case 8: k1 ^= uint64_t(tail[ 7]) << 56; 317 | case 7: k1 ^= uint64_t(tail[ 6]) << 48; 318 | case 6: k1 ^= uint64_t(tail[ 5]) << 40; 319 | case 5: k1 ^= uint64_t(tail[ 4]) << 32; 320 | case 4: k1 ^= uint64_t(tail[ 3]) << 24; 321 | case 3: k1 ^= uint64_t(tail[ 2]) << 16; 322 | case 2: k1 ^= uint64_t(tail[ 1]) << 8; 323 | case 1: k1 ^= uint64_t(tail[ 0]) << 0; 324 | k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; 325 | }; 326 | 327 | //---------- 328 | // finalization 329 | 330 | h1 ^= len; h2 ^= len; 331 | 332 | h1 += h2; 333 | h2 += h1; 334 | 335 | h1 = fmix(h1); 336 | h2 = fmix(h2); 337 | 338 | h1 += h2; 339 | h2 += h1; 340 | 341 | ((uint64_t*)out)[0] = h1; 342 | ((uint64_t*)out)[1] = h2; 343 | } 344 | 345 | //----------------------------------------------------------------------------- 346 | 347 | -------------------------------------------------------------------------------- /wordbatch/extractors/MurmurHash3.h: -------------------------------------------------------------------------------- 1 | //----------------------------------------------------------------------------- 2 | // MurmurHash3 was written by Austin Appleby, and is placed in the public 3 | // domain. The author hereby disclaims copyright to this source code. 4 | 5 | #ifndef _MURMURHASH3_H_ 6 | #define _MURMURHASH3_H_ 7 | 8 | //----------------------------------------------------------------------------- 9 | // Platform-specific functions and macros 10 | 11 | // Microsoft Visual Studio 12 | 13 | #if defined(_MSC_VER) 14 | 15 | typedef unsigned char uint8_t; 16 | typedef unsigned long uint32_t; 17 | typedef unsigned __int64 uint64_t; 18 | 19 | // Other compilers 20 | 21 | #else // defined(_MSC_VER) 22 | 23 | #include 24 | 25 | #endif // !defined(_MSC_VER) 26 | 27 | //----------------------------------------------------------------------------- 28 | #ifdef __cplusplus 29 | extern "C" { 30 | #endif 31 | 32 | 33 | void MurmurHash3_x86_32 ( const void * key, int len, uint32_t seed, void * out ); 34 | 35 | void MurmurHash3_x86_128 ( const void * key, int len, uint32_t seed, void * out ); 36 | 37 | void MurmurHash3_x64_128 ( const void * key, int len, uint32_t seed, void * out ); 38 | 39 | #ifdef __cplusplus 40 | } 41 | #endif 42 | 43 | //----------------------------------------------------------------------------- 44 | 45 | #endif // _MURMURHASH3_H_ 46 | -------------------------------------------------------------------------------- /wordbatch/extractors/__init__.py: -------------------------------------------------------------------------------- 1 | from .extractors import * 2 | -------------------------------------------------------------------------------- /wordbatch/extractors/extractors.pyx: -------------------------------------------------------------------------------- 1 | #!python 2 | #cython: boundscheck=False, infer_types=True, wraparound=False, cdivision=True 3 | from __future__ import with_statement 4 | from __future__ import division 5 | from __future__ import absolute_import 6 | from __future__ import print_function 7 | from sklearn.utils.murmurhash import murmurhash3_32 8 | from sklearn.feature_extraction.text import HashingVectorizer 9 | #from nltk.metrics import edit_distance 10 | import scipy.sparse as ssp 11 | import scipy as sp 12 | import numpy as np 13 | import gzip 14 | import lz4framed 15 | import array 16 | from 
wordbatch.data_utils import indlist2csrmatrix 17 | from cpython cimport array 18 | cimport cython 19 | from libc.stdlib cimport abs 20 | from libc.math cimport log, fabs 21 | cimport numpy as np 22 | 23 | np.import_array() 24 | 25 | cdef extern: 26 | void MurmurHash3_x86_32(void *key, int len, np.uint32_t seed, void *out) 27 | 28 | cpdef np.int32_t murmurhash3_bytes_s32(bytes key, unsigned int seed= 0): 29 | cdef np.int32_t out 30 | MurmurHash3_x86_32( key, len(key), seed, &out) 31 | return out 32 | 33 | def save_to_lz4(file, input, dtype, level= 0): 34 | with open(file, 'wb') as f: f.write(lz4framed.compress(np.array(input, dtype=dtype).tostring(), level)) 35 | 36 | def load_from_lz4(file, dtype): 37 | with open(file, 'rb') as f: input= np.fromstring(lz4framed.decompress(f.read()), dtype=dtype) 38 | return input 39 | 40 | def csr_to_lz4(file, features): 41 | save_to_lz4(file, features.indptr, dtype=int) 42 | save_to_lz4(file+".i", features.indices, dtype=int) 43 | save_to_lz4(file+".d", features.data, dtype=np.float64) 44 | 45 | def lz4_to_csr(file): 46 | indptr= load_from_lz4(file, int) 47 | indices= load_from_lz4(file+".i", int) 48 | data= load_from_lz4(file+".d", np.float64) 49 | return ssp.csr_matrix((data, indices, indptr)) 50 | 51 | def batch_transform(args): 52 | return args[1].batch_transform(args[0]) 53 | 54 | cdef class TextRow: 55 | cdef list indices, data 56 | cdef dict fea_weights 57 | 58 | def __init__(self): 59 | self.indices= [] 60 | self.data= [] 61 | self.fea_weights= {} 62 | 63 | cdef append(self, int index, int value, float weight): 64 | self.indices.append(index) 65 | self.data.append(value) 66 | self.fea_weights[index]= weight 67 | 68 | class WordBag: 69 | def __init__(self, *args, **kwargs): 70 | self.dictionary= kwargs.get('dictionary', None) 71 | kwargs.setdefault("norm", 'l2') 72 | kwargs.setdefault("tf", 'log') 73 | kwargs.setdefault("idf", 0.0) 74 | kwargs.setdefault("hash_ngrams", 0) 75 | kwargs.setdefault("hash_ngrams_weights", None) 76 | kwargs.setdefault("hash_size", 10000000) 77 | kwargs.setdefault("hash_polys_window", 0) 78 | kwargs.setdefault("hash_polys_mindf", 5) 79 | kwargs.setdefault("hash_polys_maxdf", 0.5) 80 | kwargs.setdefault("hash_polys_weight", 0.1) 81 | kwargs.setdefault("seed", 0) 82 | for key, value in kwargs.items(): setattr(self, key.lower(), value) 83 | if self.hash_ngrams_weights is None: self.hash_ngrams_weights= [1.0 for _ in range(self.hash_ngrams)] 84 | 85 | def transform_single(self, text): 86 | dft= self.dictionary.dft 87 | word2id= self.dictionary.word2id 88 | cdef int fc_hash_ngrams= self.hash_ngrams, word_id, df= 1, df2, hashed, doc_count= self.dictionary.doc_count, \ 89 | use_idf= 0, seed= self.seed 90 | cdef float idf_lift= 0.0, idf= 1.0, weight, norm= 1.0, norm_idf= 1.0 91 | if self.idf is not None: 92 | use_idf= True 93 | idf_lift= self.idf 94 | norm_idf= 1.0 / log(max(1.0, idf_lift + doc_count)) 95 | cdef int fc_hash_size= self.hash_size 96 | if self.hash_ngrams == 0: hash_size = self.dictionary.max_words 97 | fc_hash_ngrams_weights= self.hash_ngrams_weights 98 | fc_tf= self.tf 99 | fc_norm= self.norm 100 | cdef int fc_hash_polys_window= self.hash_polys_window, fc_hash_polys_mindf= self.hash_polys_mindf 101 | cdef float fc_hash_polys_maxdf= self.hash_polys_maxdf, fc_hash_polys_weight= self.hash_polys_weight 102 | 103 | text= text.split(" ") 104 | cdef TextRow textrow= TextRow() 105 | for x in range(len(text)): 106 | word= text[x] 107 | if word2id is not None: 108 | word_id = word2id.get(word, -1) 109 | if word_id == -1 or 
word_id>= fc_hash_size: continue 110 | df= dft.get(word, 0) 111 | if use_idf: 112 | if df == 0: continue 113 | idf= log(max(1.0, idf_lift + doc_count / df)) * norm_idf 114 | #print(word, idf, df, log(max(1.0, idf_lift + doc_count / df)), norm_idf) 115 | if idf== 0.0: continue 116 | 117 | if fc_hash_ngrams==0: textrow.append(word_id, 1, idf) 118 | 119 | for y in range(min(fc_hash_ngrams, x+1)): 120 | hashed= murmurhash3_bytes_s32((" ".join(text[x-y:x+1])).encode("utf-8"), seed) 121 | weight= fc_hash_ngrams_weights[y] 122 | if weight < 0: weight*= -idf 123 | textrow.append(abs(hashed) % fc_hash_size, (hashed >= 0) * 2 - 1, weight) 124 | 125 | if fc_hash_polys_window!=0: 126 | if doc_count!=0: 127 | if df< fc_hash_polys_mindf or float(df)/self.dictionary.doc_count> fc_hash_polys_maxdf: continue 128 | #for y from max(1, fc_hash_ngrams) <= y < min(fc_hash_polys_window, x+1): 129 | for y in range(1, min(fc_hash_polys_window, x+1)): 130 | word2= text[x-y] 131 | if doc_count!=0: 132 | df2= dft[word2] 133 | if df2< fc_hash_polys_mindf or float(df2)/self.dictionary.doc_count> fc_hash_polys_maxdf: 134 | continue 135 | hashed= murmurhash3_bytes_s32((word+"#"+word2).encode("utf-8"), seed) if word= 0) * 2 - 1, weight 141 | textrow.append(abs(hashed) % fc_hash_size, (hashed >= 0) * 2 - 1, weight) 142 | 143 | cdef np.int32_t size= len(textrow.data) 144 | wordbag= ssp.csr_matrix((textrow.data, textrow.indices, array.array("i", ([0, size]))), 145 | shape=(1, fc_hash_size), dtype=np.float64) 146 | wordbag.sum_duplicates() 147 | 148 | if fc_tf== 'log': wordbag.data= np.log(1.0+np.abs(wordbag.data)) *np.sign(wordbag.data) 149 | elif fc_tf== 'binary': np.sign(wordbag.data, out=wordbag.data) 150 | elif type(fc_tf)== type(1.0): 151 | wordbag.data= ((fc_tf+1.0)*np.abs(wordbag.data))/(fc_tf+np.abs(wordbag.data))*np.sign(wordbag.data) 152 | 153 | size= wordbag.data.shape[0] 154 | fea_weights= textrow.fea_weights 155 | cdef int [:] indices_view= wordbag.indices 156 | cdef double [:] data_view= wordbag.data 157 | 158 | for x in range(size): data_view[x]*= fea_weights[indices_view[x]] 159 | 160 | if fc_norm== 'l0': norm= size 161 | elif fc_norm== 'l1': norm= np.sum(np.abs(data_view)) 162 | elif fc_norm== 'l2': norm= np.sqrt(np.sum([w*w for w in data_view])) 163 | if norm != 0.0: norm= 1.0 / norm 164 | if fc_norm is not None: wordbag.data*= norm 165 | return wordbag 166 | 167 | def transform(self, texts, y= None):#input_split= False, merge_output= True, batcher= None): 168 | return ssp.vstack([self.transform_single(text) for text in texts]) 169 | 170 | def fit(self, texts, y= None): 171 | return self 172 | 173 | def fit_transform(self, texts, y=None): 174 | return self.transform(texts, y=None) 175 | 176 | def save_features(self, file, features): 177 | csr_to_lz4(file, features) 178 | 179 | def load_features(self, file): 180 | return lz4_to_csr(file) 181 | 182 | class WordHash: 183 | def __init__(self, *args, **kwargs): 184 | if "dictionary" in kwargs: kwargs.pop("dictionary") 185 | self.hv = HashingVectorizer(*args, **kwargs) 186 | 187 | def transform(self, texts, y=None): 188 | return self.hv.transform(texts) 189 | 190 | def fit(self, texts, y=None): 191 | return self 192 | 193 | def fit_transform(self, texts, y=None): 194 | return self.transform(texts, y=None) 195 | 196 | def save_features(self, file, features): 197 | csr_to_lz4(file, features) 198 | 199 | def load_features(self, file): 200 | return lz4_to_csr(file) 201 | 202 | 203 | class WordSeq: 204 | def __init__(self, *args, **kwargs): 205 | self.dictionary = 
kwargs.get('dictionary', None) 206 | kwargs.setdefault("seq_maxlen", None) 207 | kwargs.setdefault("seq_padstart", True) 208 | kwargs.setdefault("seq_truncstart", True) 209 | kwargs.setdefault("remove_oovs", False) 210 | kwargs.setdefault("pad_id", 0) 211 | for key, value in kwargs.items(): setattr(self, key.lower(), value) 212 | 213 | def transform_single(self, text): 214 | word2id= self.dictionary.word2id 215 | oov_id= self.dictionary.max_words+1 216 | if self.remove_oovs: wordseq= [word2id[word] for word in text.split(" ") if word in word2id] 217 | else: wordseq= [word2id.get(word, oov_id) for word in text.split(" ")] 218 | if self.seq_maxlen is not None: 219 | if len(wordseq) > self.seq_maxlen: 220 | if self.seq_truncstart: wordseq= wordseq[-self.seq_maxlen:] 221 | else: wordseq= wordseq[:self.seq_maxlen] 222 | else: 223 | if self.seq_padstart== True: wordseq= [self.pad_id] * (self.seq_maxlen - len(wordseq)) + wordseq 224 | else: wordseq+= [self.pad_id] * (self.seq_maxlen - len(wordseq)) 225 | return wordseq 226 | 227 | def transform(self, texts, y= None): 228 | return [self.transform_single(text) for text in texts] 229 | 230 | def fit(self, texts, y=None): 231 | return self 232 | 233 | def fit_transform(self, texts, y=None): 234 | return self.transform(texts, y=None) 235 | 236 | def save_features(self, file, features): 237 | save_to_lz4(file, features, dtype=int) 238 | i= 0 239 | indices= [] 240 | for x in features: 241 | i+= len(x) 242 | indices.append(i) 243 | save_to_lz4(file + ".i", indices, dtype=int) 244 | 245 | def load_features(self, file): 246 | words= load_from_lz4(file, int).tolist() 247 | indices= [0]+load_from_lz4(file + ".i", int).tolist() 248 | return [words[indices[i]:indices[i+1]] for i in range(len(indices)-1)] 249 | 250 | 251 | class WordVec: 252 | def __init__(self, *args, **kwargs): 253 | self.dictionary = kwargs.get('dictionary', None) 254 | kwargs.setdefault("normalize_text", None) 255 | kwargs.setdefault("stemmer", None) 256 | kwargs.setdefault("merge_dict", True) 257 | kwargs.setdefault("normalize_dict", False) 258 | kwargs.setdefault("verbose", 0) 259 | kwargs.setdefault("merge_vectors", "mean") 260 | kwargs.setdefault("normalize_merged", "l2") 261 | kwargs.setdefault("encoding", "utf8") 262 | kwargs.setdefault("shrink_model_transform", True) 263 | kwargs.setdefault("w2v_dim", None) 264 | for key, value in kwargs.items(): setattr(self, key.lower(), value) 265 | if "w2v_model" in kwargs: self.w2v= kwargs["w2v_model"] 266 | else: self.w2v= self.load_w2v(kwargs["wordvec_file"], kwargs['encoding'], kwargs['w2v_dim']) 267 | self.w2v_dim= len(list(self.w2v.values())[0]) 268 | 269 | def load_w2v(self, w2v_file, encoding= "ISO-8859-1", w2v_dim= None): 270 | w2v= {} 271 | from collections import Counter 272 | w2v_counts= Counter() 273 | opn= gzip.open if w2v_file.endswith(".gz") else open 274 | for line in opn(w2v_file, 'rb'): 275 | line= line.decode(encoding).strip().split(" ", 1) 276 | vec= np.array([np.float64(x) for x in line[1].split(" ")]) 277 | if len(vec)<2: continue 278 | if w2v_dim is not None and len(vec)!=w2v_dim: 279 | print("Wrong vector length", len(vec),", should be:", w2v_dim, ":", line) 280 | continue 281 | word= line[0] 282 | if self.normalize_text is not None: word= self.normalize_text(word) 283 | if self.stemmer is not None: word= self.stemmer.stem(word) 284 | if not(self.merge_dict): w2v[word]= vec 285 | else: 286 | w2v_counts[word] += 1 287 | if word in w2v: 288 | w2v[word]+= (vec - w2v[word]) / w2v_counts[word] 289 | if self.verbose>0: 290 | 
print("Merged entry:", word, w2v_counts[word]) 291 | else: w2v[word]= vec 292 | if self.normalize_dict!=False: 293 | for word in w2v: 294 | if self.normalize_dict=="l1": 295 | norm= sum(np.abs(w2v[word])) 296 | else: 297 | norm = np.sqrt(sum(w2v[word] **2)) 298 | if norm!=0: 299 | w2v[word]/= norm 300 | return w2v 301 | 302 | def transform_single(self, text): 303 | text= text.split(" ") 304 | if len(text)==0: return np.zeros(self.w2v_dim) 305 | w2v= self.w2v 306 | vecs= [] 307 | for word in text: 308 | if word in w2v: vecs.append(w2v[word]) 309 | else: vecs.append(np.zeros(self.w2v_dim)) 310 | if self.merge_vectors is not None: #Merge word vectors to a per-document vector 311 | if self.merge_vectors=="mean": #Currently only mean vector suppported, could do max, median, etc. 312 | vec= np.mean(vecs, axis=0) 313 | if self.normalize_merged is not None: #l1 and l2 normalization supported 314 | if self.normalize_merged == "l1": 315 | norm = sum(np.abs(vec)) 316 | else: 317 | norm = np.sqrt(sum(vec ** 2)) 318 | if norm != 0: 319 | vec /= norm 320 | return vec 321 | return vecs 322 | 323 | def transform(self, texts, y=None): 324 | #if batcher is None: return batch_transform(texts) 325 | # if self.shrink_model_transform == True: 326 | # #Send only word vectors occurring in texts to parallel processes. 327 | # #Use to reduce memory footprint with big embedding files. 328 | # d= wordbatch.transformers.dictionary.Dictionary(verbose=0, encode=False).fit(texts, input_split=input_split) 329 | # w2v_model2= {x:self.w2v[x] for x in [z for z in self.w2v.keys() if z in d.dft]} 330 | # fea_cfg2= self.fea_cfg 331 | # fea_cfg2['w2v_model']= w2v_model2 332 | # self_shrunk= WordVec(dictionary=None, fea_cfg=fea_cfg2) 333 | # else: self_shrunk= self 334 | #return batcher.process_batches(batch_transform, texts, [self], input_split=input_split, 335 | # merge_output=merge_output) 336 | return [self.transform_single(text) for text in texts] 337 | 338 | def fit(self, texts, y=None): 339 | return self 340 | 341 | def fit_transform(self, texts, y=None): 342 | return self.transform(texts, y) 343 | 344 | 345 | class Hstack: 346 | def __init__(self, extractors): 347 | self.extractors= extractors 348 | 349 | def transform(self, texts, y= None): 350 | return sp.hstack([x.transform(texts) for x in self.extractors]) 351 | 352 | def fit(self, texts, y=None): 353 | return self 354 | 355 | def fit_transform(self, texts, y=None): 356 | return self.transform(texts, y) 357 | 358 | 359 | class PandasHash: 360 | def __init__(self, *args, **kwargs): 361 | self.col_salt= None 362 | self.col_weight= None 363 | self.col_pick= [] 364 | self.dtype_specific= False 365 | for key, value in kwargs.items(): setattr(self, key.lower(), value) 366 | if self.col_salt is None or len(self.col_salt) == 0: 367 | self.col_salt = ["".join([z[0] for z in x.replace(" ", "_").replace("|", "_").split("_")]) 368 | for x in self.col_pick] 369 | if self.col_weight is None or len(self.col_weight) == 0: self.col_weight = np.ones(len(self.col_pick)) 370 | 371 | def transform(self, df, y= None): 372 | D= self.n_features 373 | col_pick= self.col_pick 374 | col_salt= self.col_salt 375 | col_weight= self.col_weight 376 | if not (self.dtype_specific): 377 | return indlist2csrmatrix( 378 | #indlist=np.array([np.vectorize(lambda x: hash(x) % D)(df[col].astype(str)+y) 379 | # for col, y in zip(col_pick, col_salt)]).T, 380 | indlist= np.array([[murmurhash3_32(x + y) % D for x in df[col].astype(str)] 381 | for col, y in zip(col_pick, col_salt)]).T, 382 | datalist= 
[col_weight] * len(df), 383 | shape= (len(df), D)) 384 | return indlist2csrmatrix( 385 | indlist = np.array([ 386 | [murmurhash3_32(y) % D] * len(df) 387 | if df[col].dtypes.name == 'bool' or np.issubdtype(df[col].dtypes, np.floating) else 388 | [murmurhash3_32(x + y) % D for x in df[col].astype(str)] 389 | for col, y in zip(col_pick, col_salt)]).T, 390 | datalist= col_weight * (np.array([ 391 | df[col] if df[col].dtypes.name == 'bool' or np.issubdtype(df[col].dtypes, np.floating) else 392 | np.ones(len(df)) 393 | for col, y in zip(col_pick, col_salt)]).T), 394 | shape = (len(df), D)) 395 | 396 | def fit(self, texts, y=None): 397 | return self 398 | 399 | def fit_transform(self, texts, y=None): 400 | return self.transform(texts, y) 401 | 402 | 403 | class CategoricalEncoder: 404 | def __init__(self, *args, **kwargs): 405 | self.dictionary= kwargs.get('dictionary', None) 406 | 407 | def transform(self, data, y= None): 408 | return [self.dictionary.word2id.get(x, self.dictionary.max_words+1) for x in data] 409 | 410 | def fit(self, data, y=None): 411 | self.dictionary.prune_dictionary(re_encode=True, prune_dfs=False) 412 | return self 413 | 414 | def fit_transform(self, data, y=None): 415 | self.fit(data) 416 | return self.transform(data, y) -------------------------------------------------------------------------------- /wordbatch/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .ftrl import FTRL 2 | from .ftrl32 import FTRL32 3 | from .fm_ftrl import FM_FTRL 4 | from .nn_relu_h1 import NN_ReLU_H1 5 | from .nn_relu_h2 import NN_ReLU_H2 6 | -------------------------------------------------------------------------------- /wordbatch/models/avx_ext.c: -------------------------------------------------------------------------------- 1 | #include "avx_ext.h" 2 | 3 | #define USE_AVX2 4 | #define USE_OMP 5 | 6 | #ifdef USE_OMP 7 | #include 8 | #endif 9 | 10 | #ifdef USE_AVX2 11 | 12 | #include 13 | 14 | #endif 15 | 16 | //#include //Deprecated 17 | #include 18 | #include 19 | 20 | double predict_fm_ftrl_avx(const int* inds, double* vals, int lenn, double L1, double baL2, double ialpha, double beta, 21 | double* w, double* z, double* n, double* w_fm, double* z_fm, double* n_fm, double weight_fm, int D_fm, 22 | int bias_term, int n_threads) { 23 | double e = 0.0; 24 | double e2 = 0.0; 25 | if (bias_term) 26 | if (*z!=0.0) e += *w = -*z / ((beta + sqrt(*n)) * ialpha); 27 | else *w = 0.0; 28 | int k, ii; 29 | 30 | /* 31 | #ifdef USE_OMP 32 | #pragma omp parallel for 33 | #endif 34 | */ 35 | for (ii = 0; ii < lenn; ii++) { 36 | const int i = inds[ii]+1; 37 | const double zi = z[i]; 38 | const double sign = (zi < 0) ? 
-1.0 : 1.0; 39 | if (sign * zi > L1) { 40 | const double wi = (sign * L1 - zi) / (sqrt(n[i]) * ialpha + baL2); 41 | w[i] = wi; 42 | e += wi * vals[ii]; 43 | } else w[i] = 0.0; 44 | } 45 | 46 | int num_thread = 1; 47 | #ifdef USE_OMP 48 | if (n_threads <= 0) num_thread = omp_get_max_threads(); 49 | else num_thread = n_threads; 50 | #endif 51 | 52 | double* acwfmk = (double*)malloc(sizeof(double) * D_fm * num_thread); 53 | #ifdef USE_OMP 54 | #pragma omp parallel for num_threads(n_threads) private(k) 55 | #endif 56 | for (k = 0; k < D_fm * num_thread; k++) acwfmk[k] = 0.0; 57 | 58 | double* wi2_acc = (double*)malloc(sizeof(double) * num_thread * 4); 59 | 60 | double wi2 = 0.0; 61 | #ifdef USE_OMP 62 | #pragma omp parallel for num_threads(num_thread) private(k) 63 | #endif 64 | for (k = 0; k < num_thread * 4; k++) wi2_acc[k] = 0.0; 65 | 66 | #ifdef USE_OMP 67 | #pragma omp parallel for num_threads(num_thread) private(ii) 68 | #endif 69 | for (ii = 0; ii < lenn; ii++) { 70 | 71 | #ifdef USE_OMP 72 | const int i_thread = omp_get_thread_num(); 73 | #else 74 | const int i_thread = 0; 75 | #endif 76 | 77 | double* pAcwfmk = acwfmk + i_thread * D_fm; 78 | double* wi2_acck = wi2_acc + i_thread * 4; 79 | const int i = inds[ii]+1; 80 | double v = vals[ii]; 81 | int k = 0; 82 | double* z_fmik = z_fm + (i-1) * D_fm; 83 | double* w_fmk = pAcwfmk; 84 | 85 | #ifdef USE_AVX2 86 | __m256d v256 = _mm256_set1_pd(v); 87 | __m256d w2_256 = _mm256_loadu_pd(wi2_acck); 88 | while (k + 3 < D_fm) { 89 | __m256d d = _mm256_mul_pd(_mm256_loadu_pd(z_fmik), v256); 90 | _mm256_storeu_pd(w_fmk, _mm256_add_pd(_mm256_loadu_pd(w_fmk), d)); 91 | w2_256 = _mm256_add_pd(w2_256, _mm256_mul_pd(d, d)); 92 | k += 4; 93 | z_fmik += 4; 94 | w_fmk += 4; 95 | } 96 | _mm256_storeu_pd(wi2_acck, w2_256); 97 | #endif 98 | 99 | // Tail end 100 | double d; 101 | while(k < D_fm) { 102 | pAcwfmk[k++] += d = *z_fmik++ * v; 103 | wi2 += d*d; 104 | } 105 | } 106 | 107 | for (k = 0; k < D_fm; k++) { 108 | double wfmk = 0.0; 109 | for (int i_thread = 0; i_thread < num_thread;) wfmk += acwfmk[i_thread++ * D_fm + k]; 110 | *w_fm++ = wfmk; 111 | e2 += wfmk* wfmk; 112 | } 113 | 114 | for (k = 0; k < num_thread * 4;) wi2 += wi2_acc[k++]; 115 | 116 | free(acwfmk); 117 | free(wi2_acc); 118 | e2 = (e2 - wi2) * 0.5 * weight_fm; 119 | return e + e2; 120 | } 121 | 122 | void update_fm_ftrl_avx(const int* inds, double* vals, int lenn, const double e, double ialpha, double* w, double* z, 123 | double* n, double alpha_fm, const double L2_fm, double* w_fm, double* z_fm, double* n_fm, int D_fm, int bias_term, 124 | int n_threads) { 125 | 126 | #ifdef USE_OMP 127 | int num_thread; 128 | if (n_threads <= 0) num_thread = omp_get_max_threads(); 129 | else num_thread = n_threads; 130 | #endif 131 | 132 | const double e_sq = e * e; 133 | 134 | if (bias_term) { 135 | *z += e - ((sqrt(*n + e_sq) - sqrt(*n)) * ialpha) * *w; 136 | *n += e_sq; 137 | } 138 | const double L2_fme = L2_fm / e; 139 | 140 | int ii; 141 | #ifdef USE_OMP 142 | #pragma omp parallel for num_threads(num_thread) private(ii) 143 | #endif 144 | for (ii = 0; ii < lenn; ii++) { 145 | const int i = inds[ii]+1; 146 | const double v = vals[ii]; 147 | const double g = e * v; 148 | const double g2 = g * g; 149 | const double ni = n[i]; 150 | 151 | z[i] += g - ((sqrt(ni + g2) - sqrt(ni)) * ialpha) * w[i]; 152 | n[i] += g2; 153 | 154 | double* z_fmik = z_fm + (i-1) * D_fm; 155 | double* w_fmk = w_fm; 156 | const double lr = g* alpha_fm / (sqrt(n_fm[i]) + 1.0); 157 | const double reg = v - L2_fme; 158 | 159 | int k 
= 0; 160 | #ifdef USE_AVX2 161 | __m256d reg2 = _mm256_set1_pd(reg); 162 | __m256d lr2 = _mm256_set1_pd(lr); 163 | while (k + 3 < D_fm) { 164 | __m256d z0 = _mm256_loadu_pd(z_fmik); 165 | _mm256_storeu_pd(z_fmik, 166 | _mm256_sub_pd(z0, _mm256_mul_pd(lr2, 167 | _mm256_sub_pd(_mm256_loadu_pd(w_fmk), 168 | _mm256_mul_pd(z0, reg2))))); 169 | w_fmk+= 4; 170 | z_fmik+= 4; 171 | k+= 4; 172 | } 173 | #endif 174 | while (k++ < D_fm) *z_fmik++ -= lr * (*w_fmk++ - *z_fmik * reg); // Tail end 175 | 176 | n_fm[i] += e_sq; 177 | } 178 | } 179 | -------------------------------------------------------------------------------- /wordbatch/models/avx_ext.h: -------------------------------------------------------------------------------- 1 | #ifndef _AVX_EXT_H 2 | #define _AVX_EXT_H 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | double predict_fm_ftrl_avx(const int* inds, double* vals, int lenn, double L1, double baL2, double ialpha, double beta, 9 | double* w, double* z, double* n, double* w_fm, double* z_fm, double* n_fm, double weight_fm, int D_fm, 10 | int bias_term, int nThreads); 11 | 12 | int halfer_EXT_INT(int d); 13 | 14 | void update_fm_ftrl_avx(const int* inds, double* vals, int lenn, const double e, double ialpha, double* w, double* z, 15 | double* n, double alpha_fm, const double L2_fm, double* w_fm, double* z_fm, double* n_fm, int D_fm, int bias_term, 16 | int nThreads); 17 | 18 | int doubler_EXT(int d); 19 | 20 | #ifdef __cplusplus 21 | } 22 | #endif 23 | 24 | #endif // _AVX_EXT_H 25 | -------------------------------------------------------------------------------- /wordbatch/models/fm_ftrl.pyx: -------------------------------------------------------------------------------- 1 | # cython: boundscheck=False, wraparound=False, cdivision=True 2 | import numpy as np 3 | import gzip 4 | cimport cython 5 | from cpython cimport array 6 | import scipy.sparse as ssp 7 | cimport numpy as np 8 | from cython.parallel import prange 9 | from libc.math cimport exp, log, fmax, fmin, sqrt, fabs 10 | import multiprocessing 11 | import sys 12 | import randomgen 13 | 14 | np.import_array() 15 | 16 | cdef extern from "avx_ext.h":# nogil: 17 | void update_fm_ftrl_avx(const int* inds, double* vals, int lenn, const double e, double ialpha, double* w, 18 | double* z, double* n, double alpha_fm, const double L2_fm, double* w_fm, double* z_fm, 19 | double* n_fm, int D_fm, int bias_term, int nThreads); 20 | double predict_fm_ftrl_avx(const int* inds, double* vals, int lenn, double L1, double baL2, double ialpha, 21 | double beta, double* w, double* z, double* n, double* w_fm, double* z_fm, double* n_fm, 22 | double weight_fm, int D_fm, int bias_term, int nThreads); 23 | 24 | 25 | cdef double inv_link_f(double e, int inv_link) nogil: 26 | if inv_link==1: return 1.0 / (1.0 + exp(-fmax(fmin(e, 35.0), -35.0))) #Sigmoid + logloss 27 | return e 28 | 29 | cdef double predict_single(int* inds, double* vals, int lenn, double L1, double baL2, double ialpha, double beta, 30 | double* w, double* z, double* n, double* w_fm, double* z_fm, double* n_fm, double weight_fm, 31 | int D_fm, bint bias_term, int threads) nogil: 32 | cdef int i, ii, k 33 | cdef double sign, zi, d, wi, wi2, wfmk, e= 0.0, e2= 0.0 34 | 35 | if bias_term: 36 | if z[0] != 0: 37 | wi = w[0] = -z[0] / ((beta + sqrt(n[0])) * ialpha) 38 | e += wi 39 | else: w[0] = 0.0 40 | 41 | for ii in prange(lenn, nogil=True, num_threads= threads): 42 | i= inds[ii]+1 43 | zi= z[i] 44 | sign= -1.0 if zi < 0 else 1.0 45 | if sign * zi > L1: 46 | w[i]= wi= (sign * L1 - 
zi) / (sqrt(n[i]) * ialpha + baL2) 47 | e+= wi * vals[ii] 48 | else: w[ii+1] = 0.0 49 | 50 | wi2= 0.0 51 | for k in prange(D_fm, nogil=True, num_threads=threads): 52 | wfmk= 0.0 53 | for ii in range(lenn): 54 | d= z_fm[inds[ii] * D_fm + k] * vals[ii] 55 | wfmk= wfmk+d 56 | wi2+= d **2 57 | e2+= wfmk **2 58 | w_fm[k]= wfmk 59 | e2= (e2- wi2)* 0.5 *weight_fm 60 | return e+e2 61 | 62 | cdef void update_single(int* inds, double* vals, int lenn, double e, double ialpha, double* w, double* z, double* n, 63 | double alpha_fm, double L2_fm, double* w_fm, double* z_fm, double* n_fm, 64 | int D_fm, bint bias_term, int threads) nogil: 65 | cdef int i, ii, k 66 | cdef double g, g2, ni, v, lr, e2= e**2, reg, L2_fme= L2_fm / e 67 | cdef double *z_fmi 68 | if bias_term: #Update bias with FTRL-proximal 69 | g2= e ** 2 70 | ni= n[0] 71 | z[0]+= e - ((sqrt(ni + g2) - sqrt(ni)) * ialpha) * w[0] 72 | n[0]+= g2 73 | 74 | for ii in prange(lenn, nogil=True, num_threads= threads): 75 | #for ii in range(lenn): 76 | i= inds[ii]+1 77 | v= vals[ii] 78 | #Update 1st order model with FTRL-proximal 79 | g= e * v 80 | g2= g * g 81 | ni= n[i] 82 | z[i]+= g - ((sqrt(ni + g2) - sqrt(ni)) * ialpha) * w[i] 83 | n[i]+= g2 84 | 85 | #Update FM with adaptive regularized SGD 86 | z_fmi= z_fm+ (i-1) * D_fm 87 | lr= g* alpha_fm / (sqrt(n_fm[i])+1.0) 88 | reg= v - L2_fme 89 | for k in range(D_fm): z_fmi[k]-= lr * (w_fm[k] - z_fmi[k] * reg) 90 | n_fm[i] += e2 91 | 92 | cdef class FM_FTRL: 93 | cdef const double[:] w 94 | cdef const double[:] z 95 | cdef const double[:] n 96 | cdef const double[:] w_fm 97 | cdef const double[:] z_fm 98 | cdef const double[:] n_fm 99 | 100 | cdef unsigned int threads 101 | cdef unsigned int iters 102 | cdef unsigned int D 103 | cdef unsigned int D_fm 104 | cdef double L1 105 | cdef double L2 106 | cdef double alpha 107 | cdef double beta 108 | cdef double alpha_fm 109 | cdef double L2_fm 110 | cdef double weight_fm 111 | cdef double init_fm 112 | cdef double e_noise 113 | cdef double e_clip 114 | cdef int inv_link 115 | cdef bint bias_term 116 | cdef int use_avx 117 | cdef int seed 118 | cdef int verbose 119 | 120 | def __init__(self, 121 | double alpha=0.02, 122 | double beta=0.01, # ~ alpha/2 123 | double L1=0.0001, 124 | double L2=0.1, 125 | unsigned int D=0, 126 | double alpha_fm=0.03, 127 | double L2_fm= 0.005, 128 | double init_fm= 0.01, 129 | unsigned int D_fm=20, 130 | double weight_fm= 10.0, 131 | double e_noise= 0.0001, 132 | double e_clip= 1.0, 133 | unsigned int iters=5, 134 | inv_link= "identity", 135 | bint bias_term=1, 136 | int threads= 0, 137 | int use_avx=1, 138 | int seed= 0, 139 | int verbose=1): 140 | 141 | self.alpha= alpha 142 | self.beta= beta 143 | self.L1= L1 144 | self.L2= L2 145 | self.D= D 146 | self.alpha_fm= alpha_fm 147 | self.L2_fm= L2_fm 148 | self.init_fm= init_fm 149 | self.D_fm= D_fm 150 | self.weight_fm= weight_fm 151 | self.e_noise= e_noise 152 | self.e_clip= e_clip 153 | self.iters= iters 154 | if threads==0: threads= multiprocessing.cpu_count()-1 155 | self.threads= threads 156 | if inv_link=="sigmoid": self.inv_link= 1 157 | if inv_link=="identity": self.inv_link= 0 158 | self.bias_term= bias_term 159 | self.use_avx = use_avx 160 | self.seed = seed 161 | self.verbose= verbose 162 | self.reset() 163 | 164 | def reset(self): 165 | D= self.D 166 | D_fm= self.D_fm 167 | self.w = np.ones((D+1), dtype=np.float64) 168 | self.z = np.zeros((D+1), dtype=np.float64) 169 | self.n = np.zeros((D+1), dtype=np.float64) 170 | self.w_fm = np.zeros(D_fm, dtype=np.float64) 171 | 
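# FM latent factors: z_fm holds D_fm latent weights per feature, initialized uniformly in
# [-init_fm/2, init_fm/2) from the seeded Xoroshiro128 generator below; w_fm is per-row scratch
# space reused during prediction, and n_fm accumulates squared errors for the adaptive FM learning rate.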
rand= np.random.Generator(randomgen.xoroshiro128.Xoroshiro128(seed= self.seed)) 172 | self.z_fm = (rand.random(D * D_fm) - 0.5) * self.init_fm 173 | self.n_fm = np.zeros(D+1, dtype=np.float64) 174 | 175 | def predict(self, X, int threads= 0): 176 | if threads==0: threads= self.threads 177 | if type(X) != ssp.csr.csr_matrix: X= ssp.csr_matrix(X, dtype=np.float64) 178 | if X.shape[1] != self.D: 179 | print("Dimension mismatch! self.D=", self.D, "X.shape[1]=", X.shape[1]) 180 | # return self.predict_f(np.ascontiguousarray(X.data), np.ascontiguousarray(X.indices), 181 | # np.ascontiguousarray(X.indptr), threads) 182 | return self.predict_f(X.data, X.indices, X.indptr, threads) 183 | 184 | def predict_f(self, np.ndarray[double, ndim=1, mode='c'] X_data, 185 | np.ndarray[int, ndim=1, mode='c'] X_indices, 186 | np.ndarray[int, ndim=1, mode='c'] X_indptr, int threads): 187 | cdef double ialpha= 1.0/self.alpha, L1= self.L1, beta= self.beta, baL2= beta * ialpha + self.L2, \ 188 | weight_fm= self.weight_fm 189 | cdef double *w= &self.w[0], *z= &self.z[0], *n= &self.n[0], *n_fm= &self.n_fm[0], \ 190 | *z_fm= &self.z_fm[0], *w_fm= &self.w_fm[0] 191 | cdef unsigned int D_fm= self.D_fm, k 192 | p= np.zeros(X_indptr.shape[0]-1, dtype= np.float64) 193 | cdef double[:] pp= p 194 | cdef unsigned int lenn, row_count= X_indptr.shape[0]-1, row, ptr 195 | cdef bint bias_term= self.bias_term 196 | for row in range(row_count): 197 | ptr= X_indptr[row] 198 | lenn= X_indptr[row + 1] - ptr 199 | inds= X_indices.data + ptr 200 | vals= X_data.data + ptr 201 | 202 | if self.use_avx == 1: 203 | pp[row]= inv_link_f(predict_fm_ftrl_avx(inds, vals, lenn, 204 | L1, baL2, ialpha, beta, w, z, n, 205 | w_fm, z_fm, n_fm, weight_fm, 206 | D_fm, bias_term, threads), self.inv_link) 207 | else: 208 | pp[row]= inv_link_f(predict_single(inds, vals, lenn, 209 | L1, baL2, ialpha, beta, w, z, n, 210 | w_fm, z_fm, n_fm, weight_fm, 211 | D_fm, bias_term, threads), self.inv_link) 212 | return p 213 | 214 | 215 | def partial_fit(self, X, y, sample_weight= None, int threads = 0, int seed = 0): 216 | return self.fit(X, y, sample_weight= sample_weight, threads = threads, seed = seed, reset= False) 217 | 218 | def fit(self, X, y, sample_weight= None, int threads= 0, int seed= 0, reset= True): 219 | if threads == 0: threads= self.threads 220 | if type(X) != ssp.csr.csr_matrix: X = ssp.csr_matrix(X, dtype=np.float64) 221 | if reset or self.D==0: 222 | self.D= X.shape[1] 223 | self.reset() 224 | elif X.shape[1] != self.D: 225 | print("Dimension mismatch! 
self.D=", self.D, "X.shape[1]=", X.shape[1]) 226 | #if type(y) != np.array: y = np.array(y, dtype=np.float64) 227 | y= np.ascontiguousarray(y, dtype=np.float64) 228 | if sample_weight is not None and type(sample_weight) != np.array: 229 | sample_weight= np.array(sample_weight, dtype=np.float64) 230 | return self.fit_f(X.data, X.indices, X.indptr, y, sample_weight, threads, seed) 231 | 232 | def fit_f(self, np.ndarray[double, ndim=1, mode='c'] X_data, 233 | np.ndarray[int, ndim=1, mode='c'] X_indices, 234 | np.ndarray[int, ndim=1, mode='c'] X_indptr, 235 | np.ndarray[double, ndim=1, mode='c'] y, 236 | sample_weight, 237 | int threads, int seed): 238 | cdef double ialpha= 1.0/self.alpha, L1= self.L1, beta= self.beta, baL2= beta * ialpha + self.L2, \ 239 | alpha_fm= self.alpha_fm, weight_fm= self.weight_fm, L2_fm= self.L2_fm, e, e_total= 0, zfmi, \ 240 | e_noise= self.e_noise, e_clip= self.e_clip, abs_e 241 | cdef double *w= &self.w[0], *z= &self.z[0], *n= &self.n[0], *n_fm= &self.n_fm[0], \ 242 | *z_fm= &self.z_fm[0], *w_fm= &self.w_fm[0], *ys= y.data 243 | cdef int D_fm= self.D_fm, lenn, ptr, row_count= X_indptr.shape[0]-1, row, inv_link= self.inv_link 244 | cdef bint bias_term= self.bias_term 245 | cdef int* inds, indptr 246 | cdef double* vals 247 | rand= np.random.Generator(randomgen.xoroshiro128.Xoroshiro128(seed= self.seed)) 248 | for iter in range(self.iters): 249 | e_total= 0.0 250 | for row in range(row_count): 251 | ptr= X_indptr[row] 252 | lenn= X_indptr[row+1]-ptr 253 | inds= X_indices.data+ptr 254 | vals= X_data.data+ptr 255 | 256 | if self.use_avx == 1: 257 | e = inv_link_f(predict_fm_ftrl_avx(inds, vals, lenn, 258 | L1, baL2, ialpha, beta, w, z, n, 259 | w_fm, z_fm, n_fm, weight_fm, 260 | D_fm, bias_term, threads), inv_link) - ys[row] 261 | else: 262 | e= inv_link_f(predict_single(inds, vals, lenn, 263 | L1, baL2, ialpha, beta, w, z, n, 264 | w_fm, z_fm, n_fm, weight_fm, 265 | D_fm, bias_term, threads), inv_link) -ys[row] 266 | 267 | abs_e= fabs(e) 268 | e_total+= abs_e 269 | e += (rand.random() - 0.5) * e_noise 270 | if abs_e> e_clip: 271 | if e>0: e= e_clip 272 | else: e= -e_clip 273 | if sample_weight is not None: 274 | e*= sample_weight[row] 275 | 276 | if self.use_avx == 1: 277 | update_fm_ftrl_avx(inds, vals, lenn, e, ialpha, w, z, n, alpha_fm, L2_fm, w_fm, z_fm, n_fm, D_fm, 278 | bias_term, threads) 279 | else: 280 | update_single(inds, vals, lenn, e, ialpha, w, z, n, alpha_fm, L2_fm, w_fm, z_fm, n_fm, D_fm, 281 | bias_term, threads) 282 | 283 | if self.verbose>0: print "Total e:", e_total 284 | return self 285 | 286 | def __getstate__(self): 287 | return (self.alpha, 288 | self.beta, 289 | self.L1, 290 | self.L2, 291 | self.alpha_fm, 292 | self.L2_fm, 293 | self.e_noise, 294 | self.e_clip, 295 | self.weight_fm, 296 | self.init_fm, 297 | self.D, 298 | self.D_fm, 299 | self.iters, 300 | np.asarray(self.w), 301 | np.asarray(self.z), 302 | np.asarray(self.n), 303 | np.asarray(self.w_fm), 304 | np.asarray(self.z_fm), 305 | np.asarray(self.n_fm), 306 | self.inv_link, 307 | self.seed, 308 | self.use_avx, 309 | self.bias_term, 310 | self.threads, 311 | self.verbose) 312 | 313 | def __setstate__(self, params): 314 | (self.alpha, 315 | self.beta, 316 | self.L1, 317 | self.L2, 318 | self.alpha_fm, 319 | self.L2_fm, 320 | self.e_noise, 321 | self.e_clip, 322 | self.weight_fm, 323 | self.init_fm, 324 | self.D, 325 | self.D_fm, 326 | self.iters, 327 | self.w, 328 | self.z, 329 | self.n, 330 | self.w_fm, 331 | self.z_fm, 332 | self.n_fm, 333 | self.inv_link, 334 | self.seed, 335 | 
self.use_avx, 336 | self.bias_term, 337 | self.threads, 338 | self.verbose)= params 339 | -------------------------------------------------------------------------------- /wordbatch/models/ftrl.pyx: -------------------------------------------------------------------------------- 1 | # cython: boundscheck=False, wraparound=False, cdivision=True 2 | import numpy as np 3 | cimport cython 4 | from cpython cimport array 5 | import scipy.sparse as ssp 6 | cimport numpy as np 7 | from cython.parallel import prange 8 | from libc.math cimport exp, log, fmax, fmin, sqrt, fabs 9 | import multiprocessing 10 | import randomgen 11 | 12 | np.import_array() 13 | 14 | cdef double inv_link_f(double e, int inv_link) nogil: 15 | if inv_link==1: return 1.0 / (1.0 + exp(-fmax(fmin(e, 35.0), -35.0))) #Sigmoid + logloss 16 | return e 17 | 18 | cdef double predict_single_finalized(int* inds, double* vals, int lenn, double* w, bint bias_term, int threads) nogil: 19 | cdef int ii 20 | cdef double e= 0.0 21 | if bias_term: 22 | e += w[0] 23 | for ii in prange(lenn, nogil=True, num_threads= threads): 24 | e+= w[inds[ii]+1] * vals[ii] 25 | return e 26 | 27 | cdef double predict_single(int* inds, double* vals, int lenn, double L1, double baL2, double ialpha, double beta, 28 | double* w, double* z, double* n, bint bias_term, int threads) nogil: 29 | cdef int i, ii 30 | cdef double sign, zi, wi 31 | cdef double e= 0.0 32 | if bias_term: 33 | if z[0] != 0: 34 | wi = w[0] = -z[0] / ((beta + sqrt(n[0])) * ialpha) 35 | e += wi 36 | else: w[0] = 0.0 37 | 38 | for ii in prange(lenn, nogil=True, num_threads= threads): 39 | i= inds[ii]+1 40 | zi= z[i] 41 | sign = -1.0 if zi < 0 else 1.0 42 | if sign * zi > L1: 43 | w[i]= wi= (sign * L1 - zi) / (sqrt(n[i]) * ialpha + baL2) 44 | e+= wi * vals[ii] 45 | else: w[i]= 0.0 46 | return e 47 | 48 | cdef void update_single(int* inds, double* vals, int lenn, double e, double ialpha, double* w, double* z, 49 | double* n, bint bias_term, int threads) nogil: 50 | cdef int i, ii 51 | cdef double g, g2, ni 52 | if bias_term: 53 | g2= e ** 2 54 | ni= n[0] 55 | z[0]+= e - ((sqrt(ni + g2) - sqrt(ni)) * ialpha) * w[0] 56 | n[0]+= g2 57 | 58 | for ii in prange(lenn, nogil=True, num_threads= threads): 59 | i= inds[ii]+1 60 | g= e * vals[ii] 61 | g2= g ** 2 62 | ni= n[i] 63 | #z[i]+= g - ((sqrt(ni + g2) - sqrt(ni)) * ialpha) * w[ii+1] 64 | z[i]+= g - ((sqrt(ni + g2) - sqrt(ni)) * ialpha) * w[i] 65 | n[i]+= g2 66 | 67 | cdef class FTRL: 68 | cdef const double[:] w 69 | cdef const double[:] z 70 | cdef const double[:] n 71 | 72 | cdef unsigned int threads 73 | cdef unsigned int iters 74 | cdef unsigned int D 75 | cdef double L1 76 | cdef double L2 77 | cdef double alpha 78 | cdef double beta 79 | cdef double init 80 | cdef double e_clip 81 | cdef int inv_link 82 | cdef bint bias_term 83 | cdef int seed 84 | cdef int verbose 85 | cdef bint model_finalized 86 | 87 | def __init__(self, 88 | double alpha=0.1, 89 | double beta=1.0, 90 | double L1=1.0, 91 | double L2=1.0, 92 | unsigned int D=0, 93 | double init= 0.0, 94 | unsigned int iters=10, 95 | double e_clip= 1.0, 96 | int threads= 0, 97 | inv_link= "sigmoid", 98 | bint bias_term=1, 99 | int seed= 0, 100 | int verbose=1): 101 | 102 | self.alpha= alpha 103 | self.beta= beta 104 | self.L1= L1 105 | self.L2= L2 106 | self.init= init 107 | self.e_clip= e_clip 108 | self.D= D 109 | self.iters= iters 110 | if threads==0: threads= multiprocessing.cpu_count()-1 111 | self.threads= threads 112 | if inv_link=="sigmoid": self.inv_link= 1 113 | if 
inv_link=="identity": self.inv_link= 0 114 | self.bias_term= bias_term 115 | self.seed = seed 116 | self.verbose= verbose 117 | self.model_finalized= False 118 | self.reset() 119 | 120 | def reset(self): 121 | D= self.D 122 | self.w = np.zeros((D+1,), dtype=np.float64) 123 | if self.init==0: 124 | self.z = np.zeros((D+1,), dtype=np.float64) 125 | else: 126 | rand= randomgen.xoroshiro128.Xoroshiro128(seed= self.seed).generator 127 | self.z = (rand.random_sample(D+1) - 0.5) * self.init 128 | self.n = np.zeros((D+1,), dtype=np.float64) 129 | self.model_finalized= False 130 | 131 | def predict(self, X, int threads= 0): 132 | if threads==0: threads= self.threads 133 | if type(X) != ssp.csr.csr_matrix: X= ssp.csr_matrix(X, dtype=np.float64) 134 | if X.shape[1] != self.D: 135 | print("Dimension mismatch! self.D=", self.D, "X.shape[1]=", X.shape[1]) 136 | # return self.predict_f(X, np.ascontiguousarray(X.data), np.ascontiguousarray(X.indices), 137 | # np.ascontiguousarray(X.indptr), threads) 138 | return self.predict_f(X.data, X.indices, X.indptr, threads) 139 | 140 | def predict_f(self, np.ndarray[double, ndim=1, mode='c'] X_data, 141 | np.ndarray[int, ndim=1, mode='c'] X_indices, 142 | np.ndarray[int, ndim=1, mode='c'] X_indptr, int threads): 143 | cdef double ialpha= 1.0/self.alpha, L1= self.L1, beta= self.beta, baL2= beta * ialpha + self.L2 144 | p= np.zeros(X_indptr.shape[0]-1, dtype= np.float64) 145 | cdef double *w= &self.w[0], *z= &self.z[0], *n= &self.n[0] 146 | cdef double[:] pp= p 147 | cdef unsigned lenn, row_count= X_indptr.shape[0]-1, row, ptr 148 | cdef bint bias_term= self.bias_term 149 | for row in range(row_count): 150 | ptr= X_indptr[row] 151 | lenn= X_indptr[row + 1] - ptr 152 | inds= X_indices.data + ptr 153 | vals= X_data.data + ptr 154 | if self.model_finalized: 155 | pp[row]= inv_link_f(predict_single_finalized(inds, vals, lenn, w, bias_term, threads), self.inv_link) 156 | else: 157 | pp[row]= inv_link_f(predict_single(inds, vals, lenn, L1, baL2, ialpha, beta, w, z, n, 158 | bias_term, threads), self.inv_link) 159 | return p 160 | 161 | def partial_fit(self, X, y, sample_weight= None, int threads = 0): 162 | return self.fit(X, y, sample_weight= sample_weight, threads = threads, reset= False) 163 | 164 | def fit(self, X, y, sample_weight= None, int threads= 0, reset= True): 165 | if threads == 0: threads= self.threads 166 | if type(X) != ssp.csr.csr_matrix: X = ssp.csr_matrix(X, dtype=np.float64) 167 | if reset or self.D==0: 168 | self.D= X.shape[1] 169 | self.reset() 170 | elif X.shape[1] != self.D: 171 | print("Dimension mismatch! 
self.D=", self.D, "X.shape[1]=", X.shape[1]) 172 | #if type(y) != np.array: y = np.array(y, dtype=np.float64) 173 | y= np.ascontiguousarray(y, dtype=np.float64) 174 | if sample_weight is not None and type(sample_weight) != np.array: 175 | sample_weight= np.array(sample_weight, dtype=np.float64) 176 | # self.fit_f(X, np.ascontiguousarray(X.data), np.ascontiguousarray(X.indices), 177 | # np.ascontiguousarray(X.indptr), y, threads) 178 | return self.fit_f(X.data, X.indices, X.indptr, y, sample_weight, threads) 179 | 180 | def fit_f(self, np.ndarray[double, ndim=1, mode='c'] X_data, 181 | np.ndarray[int, ndim=1, mode='c'] X_indices, 182 | np.ndarray[int, ndim=1, mode='c'] X_indptr, 183 | np.ndarray[double, ndim=1, mode='c'] y, 184 | sample_weight, int threads): 185 | cdef double ialpha= 1.0/self.alpha, L1= self.L1, beta= self.beta, baL2= beta * ialpha + self.L2, e, e_total= 0,\ 186 | e_clip= self.e_clip, abs_e 187 | cdef double *w= &self.w[0], *z= &self.z[0], *n= &self.n[0], *ys= y.data 188 | cdef int lenn, ptr, row_count= X_indptr.shape[0]-1, row, inv_link= self.inv_link, j=0, jj 189 | cdef bint bias_term= self.bias_term 190 | cdef int* inds, indptr 191 | cdef double* vals 192 | for iter in range(self.iters): 193 | e_total= 0.0 194 | for row in range(row_count): 195 | ptr= X_indptr[row] 196 | lenn= X_indptr[row+1]-ptr 197 | inds= X_indices.data+ptr 198 | vals= X_data.data+ptr 199 | e= inv_link_f(predict_single(inds, vals, lenn, L1, baL2, ialpha, beta, w, z, n, bias_term, threads), 200 | inv_link)-ys[row] 201 | abs_e= fabs(e) 202 | e_total+= abs_e 203 | if abs_e > e_clip: 204 | if e > 0: e= e_clip 205 | else: e= -e_clip 206 | if sample_weight is not None: 207 | e*= sample_weight[row] 208 | update_single(inds, vals, lenn, e, ialpha, w, z, n, bias_term, threads) 209 | if self.verbose > 0: print "Total e:", e_total 210 | return self 211 | 212 | def finalize_model(self): 213 | D= self.D 214 | indices = np.arange(start=0, stop=D, step=1, dtype=np.int32) 215 | indptr= np.array([0, D], dtype=np.int32) 216 | data = np.zeros(D, dtype=np.float64) 217 | self.predict_f(data, indices, indptr, threads= self.threads) 218 | del(indices, indptr, data) 219 | self.z = np.zeros(0, dtype=np.float64) 220 | self.n = np.zeros(0, dtype=np.float64) 221 | self.model_finalized= True 222 | 223 | def __getstate__(self): 224 | return (self.alpha, self.beta, self.L1, self.L2, self.e_clip, self.D, self.init, self.seed, self.iters, 225 | np.asarray(self.w), np.asarray(self.z), np.asarray(self.n), self.inv_link, self.threads, self.bias_term, 226 | self.model_finalized, self.verbose) 227 | 228 | def __setstate__(self, params): 229 | (self.alpha, self.beta, self.L1, self.L2, self.e_clip, self.D, self.init, self.seed, self.iters, self.w, 230 | self.z, self.n, self.inv_link, self.threads, self.bias_term, self.model_finalized, self.verbose)= params 231 | -------------------------------------------------------------------------------- /wordbatch/models/ftrl32.pyx: -------------------------------------------------------------------------------- 1 | # cython: boundscheck=False, wraparound=False, cdivision=True 2 | import numpy as np 3 | cimport cython 4 | from cpython cimport array 5 | import scipy.sparse as ssp 6 | cimport numpy as np 7 | from cython.parallel import prange 8 | from libc.math cimport exp, log, fmax, fmin, sqrt, fabs 9 | import multiprocessing 10 | import randomgen 11 | 12 | np.import_array() 13 | 14 | cdef double inv_link_f(double e, int inv_link) nogil: 15 | if inv_link==1: return 1.0 / (1.0 + exp(-fmax(fmin(e, 35.0), 
-35.0))) #Sigmoid + logloss 16 | return e 17 | 18 | cdef double predict_single_finalized(int* inds, double* vals, int lenn, float* w, bint bias_term, int threads) nogil: 19 | cdef int ii 20 | cdef double e= 0.0 21 | if bias_term: 22 | e += w[0] 23 | for ii in prange(lenn, nogil=True, num_threads= threads): 24 | e+= w[inds[ii]+1] * vals[ii] 25 | return e 26 | 27 | cdef double predict_single(int* inds, double* vals, int lenn, double L1, double baL2, double ialpha, double beta, 28 | float* w, float* z, float* n, bint bias_term, int threads) nogil: 29 | cdef int i, ii 30 | cdef double sign, zi, wi 31 | cdef double e= 0.0 32 | if bias_term: 33 | if z[0] != 0: 34 | wi = w[0] = -z[0] / ((beta + sqrt(n[0])) * ialpha) 35 | e += wi 36 | else: w[0] = 0.0 37 | 38 | for ii in prange(lenn, nogil=True, num_threads= threads): 39 | i= inds[ii]+1 40 | zi= z[i] 41 | sign = -1.0 if zi < 0 else 1.0 42 | if sign * zi > L1: 43 | w[i]= wi= (sign * L1 - zi) / (sqrt(n[i]) * ialpha + baL2) 44 | e+= wi * vals[ii] 45 | else: w[i]= 0.0 46 | return e 47 | 48 | cdef void update_single(int* inds, double* vals, int lenn, double e, double ialpha, float* w, float* z, 49 | float* n, bint bias_term, int threads) nogil: 50 | cdef int i, ii 51 | cdef double g, g2, ni 52 | if bias_term: 53 | g2= e ** 2 54 | ni= n[0] 55 | z[0]+= e - ((sqrt(ni + g2) - sqrt(ni)) * ialpha) * w[0] 56 | n[0]+= g2 57 | 58 | for ii in prange(lenn, nogil=True, num_threads= threads): 59 | i= inds[ii]+1 60 | g= e * vals[ii] 61 | g2= g ** 2 62 | ni= n[i] 63 | #z[i]+= g - ((sqrt(ni + g2) - sqrt(ni)) * ialpha) * w[ii+1] 64 | z[i]+= g - ((sqrt(ni + g2) - sqrt(ni)) * ialpha) * w[i] 65 | n[i]+= g2 66 | 67 | cdef class FTRL32: 68 | cdef const float[:] w 69 | cdef const float[:] z 70 | cdef const float[:] n 71 | 72 | cdef unsigned int threads 73 | cdef unsigned int iters 74 | cdef unsigned int D 75 | cdef double L1 76 | cdef double L2 77 | cdef double alpha 78 | cdef double beta 79 | cdef double init 80 | cdef double e_clip 81 | cdef int inv_link 82 | cdef bint bias_term 83 | cdef int seed 84 | cdef int verbose 85 | cdef bint model_finalized 86 | 87 | def __init__(self, 88 | double alpha=0.1, 89 | double beta=1.0, 90 | double L1=1.0, 91 | double L2=1.0, 92 | unsigned int D=0, 93 | double init= 0.0, 94 | unsigned int iters=10, 95 | double e_clip= 1.0, 96 | int threads= 0, 97 | inv_link= "sigmoid", 98 | bint bias_term=1, 99 | int seed= 0, 100 | int verbose=1): 101 | 102 | self.alpha= alpha 103 | self.beta= beta 104 | self.L1= L1 105 | self.L2= L2 106 | self.init= init 107 | self.e_clip= e_clip 108 | self.D= D 109 | self.iters= iters 110 | if threads==0: threads= multiprocessing.cpu_count()-1 111 | self.threads= threads 112 | if inv_link=="sigmoid": self.inv_link= 1 113 | if inv_link=="identity": self.inv_link= 0 114 | self.bias_term= bias_term 115 | self.seed = seed 116 | self.verbose= verbose 117 | self.model_finalized= False 118 | self.reset() 119 | 120 | def reset(self): 121 | D= self.D 122 | self.w = np.zeros((D+1,), dtype=np.float32) 123 | if self.init==0: 124 | self.z = np.zeros((D+1,), dtype=np.float32) 125 | else: 126 | rand= randomgen.xoroshiro128.Xoroshiro128(seed= self.seed).generator 127 | self.z = np.float32((rand.random_sample(D+1) - 0.5) * self.init) 128 | self.n = np.zeros((D+1,), dtype=np.float32) 129 | self.model_finalized= False 130 | 131 | def predict(self, X, int threads= 0): 132 | if threads==0: threads= self.threads 133 | if type(X) != ssp.csr.csr_matrix: X= ssp.csr_matrix(X, dtype=np.float64) 134 | if X.shape[1] != self.D: 135 | 
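# Note: a feature-dimension mismatch only prints a warning here; prediction still proceeds
# using the model's stored self.D.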
print("Dimension mismatch! self.D=", self.D, "X.shape[1]=", X.shape[1]) 136 | # return self.predict_f(X, np.ascontiguousarray(X.data), np.ascontiguousarray(X.indices), 137 | # np.ascontiguousarray(X.indptr), threads) 138 | return self.predict_f(X.data, X.indices, X.indptr, threads) 139 | 140 | def predict_f(self, np.ndarray[double, ndim=1, mode='c'] X_data, 141 | np.ndarray[int, ndim=1, mode='c'] X_indices, 142 | np.ndarray[int, ndim=1, mode='c'] X_indptr, int threads): 143 | cdef double ialpha= 1.0/self.alpha, L1= self.L1, beta= self.beta, baL2= beta * ialpha + self.L2 144 | p= np.zeros(X_indptr.shape[0]-1, dtype= np.float64) 145 | cdef float *w= &self.w[0], *z= &self.z[0], *n= &self.n[0] 146 | cdef double[:] pp= p 147 | cdef unsigned lenn, row_count= X_indptr.shape[0]-1, row, ptr 148 | cdef bint bias_term= self.bias_term 149 | for row in range(row_count): 150 | ptr= X_indptr[row] 151 | lenn= X_indptr[row + 1] - ptr 152 | inds= X_indices.data + ptr 153 | vals= X_data.data + ptr 154 | if self.model_finalized: 155 | pp[row]= inv_link_f(predict_single_finalized(inds, vals, lenn, w, bias_term, threads), self.inv_link) 156 | else: 157 | pp[row]= inv_link_f(predict_single(inds, vals, lenn, L1, baL2, ialpha, beta, w, z, n, 158 | bias_term, threads), self.inv_link) 159 | return p 160 | 161 | def partial_fit(self, X, y, sample_weight= None, int threads = 0): 162 | return self.fit(X, y, sample_weight= sample_weight, threads = threads, reset= False) 163 | 164 | def fit(self, X, y, sample_weight= None, int threads= 0, reset= True): 165 | if threads == 0: threads= self.threads 166 | if type(X) != ssp.csr.csr_matrix: X = ssp.csr_matrix(X, dtype=np.float64) 167 | if reset or self.D==0: 168 | self.D= X.shape[1] 169 | self.reset() 170 | elif X.shape[1] != self.D: 171 | print("Dimension mismatch! 
self.D=", self.D, "X.shape[1]=", X.shape[1]) 172 | #if type(y) != np.array: y = np.array(y, dtype=np.float64) 173 | y= np.ascontiguousarray(y, dtype=np.float64) 174 | if sample_weight is not None and type(sample_weight) != np.array: 175 | sample_weight= np.array(sample_weight, dtype=np.float64) 176 | # self.fit_f(X, np.ascontiguousarray(X.data), np.ascontiguousarray(X.indices), 177 | # np.ascontiguousarray(X.indptr), y, threads) 178 | return self.fit_f(X.data, X.indices, X.indptr, y, sample_weight, threads) 179 | 180 | def fit_f(self, np.ndarray[double, ndim=1, mode='c'] X_data, 181 | np.ndarray[int, ndim=1, mode='c'] X_indices, 182 | np.ndarray[int, ndim=1, mode='c'] X_indptr, 183 | np.ndarray[double, ndim=1, mode='c'] y, 184 | sample_weight, int threads): 185 | cdef double ialpha= 1.0/self.alpha, L1= self.L1, beta= self.beta, baL2= beta * ialpha + self.L2, e, e_total= 0,\ 186 | e_clip= self.e_clip, abs_e 187 | cdef float *w= &self.w[0], *z= &self.z[0], *n= &self.n[0] 188 | cdef double *ys= y.data 189 | cdef int lenn, ptr, row_count= X_indptr.shape[0]-1, row, inv_link= self.inv_link, j=0, jj 190 | cdef bint bias_term= self.bias_term 191 | cdef int* inds, indptr 192 | cdef double* vals 193 | for iter in range(self.iters): 194 | e_total= 0.0 195 | for row in range(row_count): 196 | ptr= X_indptr[row] 197 | lenn= X_indptr[row+1]-ptr 198 | inds= X_indices.data+ptr 199 | vals= X_data.data+ptr 200 | e= inv_link_f(predict_single(inds, vals, lenn, L1, baL2, ialpha, beta, w, z, n, bias_term, threads), 201 | inv_link)-ys[row] 202 | abs_e= fabs(e) 203 | e_total+= abs_e 204 | if abs_e > e_clip: 205 | if e > 0: e= e_clip 206 | else: e= -e_clip 207 | if sample_weight is not None: 208 | e*= sample_weight[row] 209 | update_single(inds, vals, lenn, e, ialpha, w, z, n, bias_term, threads) 210 | if self.verbose > 0: print "Total e:", e_total 211 | return self 212 | 213 | def finalize_model(self): 214 | D= self.D 215 | indices = np.arange(start=0, stop=D, step=1, dtype=np.int32) 216 | indptr= np.array([0, D], dtype=np.int32) 217 | data = np.zeros(D, dtype=np.float64) 218 | self.predict_f(data, indices, indptr, threads= self.threads) 219 | del(indices, indptr, data) 220 | self.z = np.zeros(0, dtype=np.float32) 221 | self.n = np.zeros(0, dtype=np.float32) 222 | self.model_finalized= True 223 | 224 | def __getstate__(self): 225 | return (self.alpha, self.beta, self.L1, self.L2, self.e_clip, self.D, self.init, self.seed, self.iters, 226 | np.asarray(self.w), np.asarray(self.z), np.asarray(self.n), self.inv_link, self.threads, self.bias_term, 227 | self.model_finalized, self.verbose) 228 | 229 | def __setstate__(self, params): 230 | (self.alpha, self.beta, self.L1, self.L2, self.e_clip, self.D, self.init, self.seed, self.iters, self.w, 231 | self.z, self.n, self.inv_link, self.threads, self.bias_term, self.model_finalized, self.verbose)= params 232 | -------------------------------------------------------------------------------- /wordbatch/models/nn_relu_h1.pyx: -------------------------------------------------------------------------------- 1 | # cython: boundscheck=False, wraparound=False, cdivision=True 2 | import numpy as np 3 | import gzip 4 | cimport cython 5 | from cpython cimport array 6 | import scipy.sparse as ssp 7 | cimport numpy as np 8 | from cython.parallel import prange 9 | from libc.math cimport exp, log, fmax, fmin, sqrt, fabs 10 | import multiprocessing 11 | import sys 12 | import randomgen 13 | 14 | np.import_array() 15 | 16 | cdef double inv_link_f(double e, int inv_link) nogil: 17 | if 
inv_link==1: return 1.0 / (1.0 + exp(-fmax(fmin(e, 35.0), -35.0))) #Sigmoid + logloss 18 | return e 19 | 20 | cdef double predict_single(int* inds, double* vals, int lenn, int D, int D_nn, 21 | double* w0, double* w1, double* z, int threads) nogil: 22 | cdef int j, i, ii, DD_nn= D*D_nn 23 | cdef double p, v, zj 24 | p= w1[D_nn] 25 | for j in prange(D_nn, nogil=True, num_threads= threads): 26 | zj= w0[DD_nn+j] 27 | for ii in range(lenn): 28 | zj= zj+ w0[inds[ii]*D_nn+j] * vals[ii] 29 | if zj<=0: z[j]= 0 30 | else: 31 | z[j]= zj 32 | p+= w1[j] * zj 33 | return p 34 | 35 | cdef void update_single(int* inds, double* vals, int lenn, int D, int D_nn, double e, double alpha, 36 | double L2, double* w0, double* w1, double* z, double* c0, double* c1, int threads) nogil: 37 | cdef int i, ii, j, DD_nn= D*D_nn, iDnnj 38 | cdef double dldy= e, dldz, dldw1, dldw0 39 | w1[D_nn]-= (dldy+ L2 * w1[D_nn]) * alpha 40 | #for j in prange(D_nn, nogil=True, num_threads=threads): 41 | for j in range(D_nn): 42 | if z[j]==0: continue 43 | dldw1= dldy * z[j] 44 | w1[j]-= (dldw1 + L2 * w1[j]) * alpha/(sqrt(c1[j])+1) 45 | dldz= dldy * w1[j] 46 | w0[DD_nn+j]-= (dldz+ L2 *w0[DD_nn+j]) * alpha/(sqrt(c1[j])+1) 47 | 48 | #for ii in range(lenn): 49 | for ii in prange(lenn, nogil=True, num_threads=threads): 50 | i= inds[ii] 51 | dldw0= dldz * vals[ii] 52 | w0[i * D_nn + j]-= (dldw0 + L2 *w0[i * D_nn + j]) * alpha/(sqrt(c0[i])+1) 53 | c0[i]+= fabs(dldw0) 54 | c1[j]+= fabs(dldw1) 55 | 56 | cdef class NN_ReLU_H1: 57 | cdef const double[:] w0 58 | cdef const double[:] w1 59 | cdef const double[:] z 60 | cdef const double[:] c0 61 | cdef const double[:] c1 62 | 63 | cdef unsigned int threads 64 | cdef unsigned int iters 65 | cdef int D 66 | cdef int D_nn 67 | cdef double init_nn 68 | 69 | cdef double L2 70 | cdef double alpha 71 | cdef double e_noise 72 | cdef double e_clip 73 | cdef int inv_link 74 | cdef int seed 75 | cdef int verbose 76 | 77 | def __init__(self, 78 | double alpha=0.1, 79 | double L2=0.001, 80 | int D=0, 81 | int D_nn=30, 82 | double init_nn=0.01, 83 | double e_noise=0.0001, 84 | double e_clip=1.0, 85 | unsigned int iters=4, 86 | inv_link= "identity", 87 | int threads= 0, 88 | int seed= 0, 89 | int verbose=1): 90 | 91 | self.alpha= alpha 92 | self.L2= L2 93 | self.D= D 94 | self.D_nn= D_nn 95 | self.init_nn= init_nn 96 | self.e_noise= e_noise 97 | self.e_clip= e_clip 98 | self.iters= iters 99 | if threads==0: threads= multiprocessing.cpu_count()-1 100 | self.threads= threads 101 | if inv_link=="sigmoid": self.inv_link= 1 102 | if inv_link=="identity": self.inv_link= 0 103 | self.seed = seed 104 | self.verbose= verbose 105 | self.reset() 106 | 107 | def reset(self): 108 | init_nn= self.init_nn 109 | D= self.D 110 | D_nn= self.D_nn 111 | rand= np.random.Generator(randomgen.xoroshiro128.Xoroshiro128(seed= self.seed)) 112 | self.w0 = (rand.random((D + 1) * D_nn) - 0.5) * init_nn 113 | self.w1 = (rand.random(D_nn + 1) - 0.5) * init_nn 114 | self.z = np.zeros((D_nn,), dtype=np.float64) 115 | self.c0 = np.zeros((D,), dtype=np.float64) 116 | self.c1 = np.zeros((D_nn,), dtype=np.float64) 117 | 118 | def predict(self, X, int threads= 0): 119 | if threads==0: threads= self.threads 120 | if type(X) != ssp.csr.csr_matrix: X= ssp.csr_matrix(X, dtype=np.float64) 121 | if X.shape[1] != self.D: 122 | print("Dimension mismatch! 
self.D=", self.D, "X.shape[1]=", X.shape[1]) 123 | return self.predict_f(X.data, X.indices, X.indptr, threads) 124 | 125 | def predict_f(self, np.ndarray[double, ndim=1, mode='c'] X_data, 126 | np.ndarray[int, ndim=1, mode='c'] X_indices, 127 | np.ndarray[int, ndim=1, mode='c'] X_indptr, int threads): 128 | cdef double alpha= self.alpha, L2= self.L2 129 | p= np.zeros(X_indptr.shape[0]-1, dtype= np.float64) 130 | cdef double *w0= &self.w0[0], *w1= &self.w1[0], *z= &self.z[0] 131 | cdef double[:] pp= p 132 | cdef int lenn, D= self.D, D_nn= self.D_nn, row_count= X_indptr.shape[0]-1, row, ptr 133 | for row in range(row_count): 134 | ptr= X_indptr[row] 135 | lenn= X_indptr[row + 1] - ptr 136 | inds= X_indices.data + ptr 137 | vals= X_data.data + ptr 138 | pp[row]= inv_link_f(predict_single(inds, vals, lenn, D, D_nn, w0, w1, z, threads), self.inv_link) 139 | return p 140 | 141 | def partial_fit(self, X, y, sample_weight= None, int threads = 0, int seed = 0): 142 | return self.fit(X, y, sample_weight= sample_weight, threads = threads, seed = seed, reset= False) 143 | 144 | def fit(self, X, y, sample_weight= None, int threads= 0, int seed= 0, reset= True): 145 | if threads == 0: threads= self.threads 146 | if type(X) != ssp.csr.csr_matrix: X = ssp.csr_matrix(X, dtype=np.float64) 147 | if reset or self.D==0: 148 | self.D= X.shape[1] 149 | self.reset() 150 | elif X.shape[1] != self.D: 151 | print("Dimension mismatch! self.D=", self.D, "X.shape[1]=", X.shape[1]) 152 | if type(y) != np.array: y = np.array(y, dtype=np.float64) 153 | # self.fit_f(X, np.ascontiguousarray(X.data), np.ascontiguousarray(X.indices), 154 | # np.ascontiguousarray(X.indptr), y, threads) 155 | if sample_weight is not None and type(sample_weight) != np.array: 156 | sample_weight= np.array(sample_weight, dtype=np.float64) 157 | return self.fit_f(X.data, X.indices, X.indptr, y, sample_weight, threads, seed) 158 | 159 | def fit_f(self, np.ndarray[double, ndim=1, mode='c'] X_data, 160 | np.ndarray[int, ndim=1, mode='c'] X_indices, 161 | np.ndarray[int, ndim=1, mode='c'] X_indptr, 162 | np.ndarray[double, ndim=1, mode='c'] y, 163 | sample_weight, 164 | int threads, int seed): 165 | cdef double alpha= self.alpha, L2= self.L2, e_noise= self.e_noise, e, e_total= 0, e_clip= self.e_clip, abs_e 166 | cdef double *w0= &self.w0[0], *w1= &self.w1[0], *z= &self.z[0], *c0= &self.c0[0], *c1= &self.c1[0] 167 | cdef double *ys= y.data 168 | cdef unsigned int lenn, D= self.D, D_nn= self.D_nn, ptr, row_count= X_indptr.shape[0]-1, row, \ 169 | inv_link= self.inv_link, j=0, jj 170 | cdef int* inds, indptr 171 | cdef double* vals 172 | rand= np.random.Generator(randomgen.xoroshiro128.Xoroshiro128(seed= self.seed)) 173 | for iter in range(self.iters): 174 | e_total= 0.0 175 | for row in range(row_count): 176 | ptr= X_indptr[row] 177 | lenn= X_indptr[row+1]-ptr 178 | inds= X_indices.data+ptr 179 | vals= X_data.data+ptr 180 | e= inv_link_f(predict_single(inds, vals, lenn, D, D_nn, w0, w1, z, threads), self.inv_link) -ys[row] 181 | abs_e= fabs(e) 182 | e_total+= abs_e 183 | e += (rand.random() - 0.5) * e_noise 184 | if abs_e> e_clip: 185 | if e>0: e= e_clip 186 | else: e= -e_clip 187 | if sample_weight is not None: 188 | e*= sample_weight[row] 189 | update_single(inds, vals, lenn, D, D_nn, e, alpha, L2, w0, w1, z, c0, c1, threads) 190 | if self.verbose > 0: print "Total e:", e_total 191 | return self 192 | 193 | def predict_layer(self, X, int layer, int threads= 0): 194 | if threads==0: threads= self.threads 195 | if type(X) != ssp.csr.csr_matrix: X= 
ssp.csr_matrix(X, dtype=np.float64) 196 | return self.predict_layer_f(X.data, X.indices, X.indptr, layer, threads) 197 | 198 | def predict_layer_f(self, np.ndarray[double, ndim=1, mode='c'] X_data, 199 | np.ndarray[int, ndim=1, mode='c'] X_indices, 200 | np.ndarray[int, ndim=1, mode='c'] X_indptr, int layer, int threads): 201 | cdef double alpha= self.alpha, L2= self.L2 202 | p = np.zeros(((X_indptr.shape[0] - 1), self.D_nn), dtype=np.float64) 203 | cdef double *w0= &self.w0[0], *w1= &self.w1[0], *z= &self.z[0] 204 | cdef double[:,:] pp= p 205 | cdef unsigned int lenn, D= self.D, D_nn= self.D_nn, row_count= X_indptr.shape[0]-1, row, ptr 206 | for row in range(row_count): 207 | ptr= X_indptr[row] 208 | lenn= X_indptr[row + 1] - ptr 209 | inds= X_indices.data + ptr 210 | vals= X_data.data + ptr 211 | predict_single(inds, vals, lenn, D, D_nn, w0, w1, &pp[row][0], threads) 212 | return p 213 | 214 | def __getstate__(self): 215 | return (self.alpha, 216 | self.L2, 217 | self.e_noise, 218 | self.e_clip, 219 | self.init_nn, 220 | self.D, 221 | self.D_nn, 222 | self.iters, 223 | self.threads, 224 | np.asarray(self.w0), 225 | np.asarray(self.w1), 226 | np.asarray(self.z), 227 | np.asarray(self.c0), 228 | np.asarray(self.c1), 229 | self.inv_link, 230 | self.seed, 231 | self.verbose) 232 | 233 | def __setstate__(self, params): 234 | (self.alpha, 235 | self.L2, 236 | self.e_noise, 237 | self.e_clip, 238 | self.init_nn, 239 | self.D, 240 | self.D_nn, 241 | self.iters, 242 | self.threads, 243 | self.w0, 244 | self.w1, 245 | self.z, 246 | self.c0, 247 | self.c1, 248 | self.inv_link, 249 | self.seed, 250 | self.verbose)= params 251 | -------------------------------------------------------------------------------- /wordbatch/models/nn_relu_h2.pyx: -------------------------------------------------------------------------------- 1 | # cython: boundscheck=False, wraparound=False, cdivision=True 2 | import numpy as np 3 | import gzip 4 | cimport cython 5 | from cpython cimport array 6 | import scipy.sparse as ssp 7 | cimport numpy as np 8 | from cython.parallel import prange 9 | from libc.math cimport exp, log, fmax, fmin, sqrt, fabs 10 | import multiprocessing 11 | import sys 12 | import randomgen 13 | 14 | np.import_array() 15 | 16 | cdef double inv_link_f(double e, int inv_link) nogil: 17 | if inv_link==1: return 1.0 / (1.0 + exp(-fmax(fmin(e, 35.0), -35.0))) #Sigmoid + logloss 18 | return e 19 | 20 | cdef double predict_single(int* inds, double* vals, int lenn, int D, int D_nn, int D_nn2, 21 | double* w0, double* w1, double* w2, double* z1, double* z2, int threads) nogil: 22 | cdef int i, ii, j, k, DD_nn= D*D_nn, DD_nn2= D_nn*D_nn2 23 | cdef double p, v, z1j, z2k 24 | p= w2[D_nn2] 25 | for k in prange(D_nn2, nogil=True, num_threads= threads): 26 | #for k in range(D_nn2): 27 | z2k= w1[DD_nn2+k] 28 | for j in range(D_nn): 29 | z1j= w0[DD_nn+j] 30 | for ii in range(lenn): z1j= z1j+ w0[inds[ii]*D_nn+j] * vals[ii] 31 | if z1j<0: z1[j]= 0 32 | else: 33 | z1[j] = z1j 34 | z2k= z2k+ w1[j * D_nn2+k] * z1j 35 | if z2k<0: z2[k]= 0 36 | else: 37 | z2[k]= z2k 38 | p+= w2[k] * z2k 39 | return p 40 | 41 | cdef void update_single(int* inds, double* vals, int lenn, int D, int D_nn, int D_nn2, 42 | double e, double alpha, double L2, double* w0, double* w1, double* w2, double* z1, 43 | double* z2, double* c0, double* c1, double* c2, int threads) nogil: 44 | cdef int i, ii, j, k, DD_nn= D*D_nn, DD_nn2= D_nn*D_nn2 45 | cdef double dldy= e, dldz1, dldz2, dldw0, dldw1, dldw2 46 | w2[D_nn2]-= (dldy + L2 *w2[D_nn2]) * alpha 47 | 
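# Backpropagate through both hidden layers: each weight takes an L2-regularized gradient step
# scaled by an adaptive learning rate alpha/(sqrt(c)+1), where c0/c1/c2 accumulate absolute
# gradients per input feature and per hidden unit; units with zero (ReLU-clipped) activations are skipped.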
for k in range(D_nn2): 48 | if z2[k]==0: continue 49 | dldw2= dldy * z2[k] 50 | w2[k]-= (dldw2 + L2 * w2[k]) * alpha / (sqrt(c2[k])+1) 51 | dldz2= dldy * w2[k] 52 | w1[DD_nn2+k]-= (dldz2 + L2 * w1[DD_nn2 + k]) * alpha / (sqrt(c2[k])+1) 53 | for j in range(D_nn): 54 | if z1[j]==0: continue 55 | dldw1= dldz2 * z1[j] 56 | w1[j*D_nn2+k]-= (dldw1 + L2 * w1[j]) * alpha / (sqrt(c1[j])+1) 57 | dldz1= dldz2 * w1[j*D_nn2+k] 58 | w0[DD_nn+j]-= (dldz1 + L2 * w0[DD_nn+j]) * alpha / (sqrt(c1[j])+1) 59 | for ii in prange(lenn, nogil=True, num_threads= threads): 60 | i= inds[ii] 61 | dldw0= dldz1 * vals[ii] 62 | w0[i*D_nn+j]-= (dldw0 + L2 * w0[i * D_nn + j]) * alpha/(sqrt(c0[i])+1) 63 | c0[i] += fabs(dldw0) 64 | c1[j] += fabs(dldw1) 65 | c2[k] += fabs(dldw2) 66 | 67 | cdef class NN_ReLU_H2: 68 | cdef const double[:] w0 69 | cdef const double[:] w1 70 | cdef const double[:] w2 71 | cdef const double[:] z1 72 | cdef const double[:] z2 73 | cdef const double[:] c0 74 | cdef const double[:] c1 75 | cdef const double[:] c2 76 | 77 | cdef unsigned int threads 78 | cdef unsigned int iters 79 | cdef int D 80 | cdef int D_nn 81 | cdef int D_nn2 82 | cdef double init_nn 83 | 84 | cdef double L2 85 | cdef double alpha 86 | cdef double e_noise 87 | cdef double e_clip 88 | cdef int inv_link 89 | cdef int seed 90 | cdef int verbose 91 | 92 | def __init__(self, 93 | double alpha=0.1, 94 | double L2=0.00001, 95 | int D=0, 96 | int D_nn=12, 97 | int D_nn2=4, 98 | double init_nn=0.01, 99 | double e_noise=0.001, 100 | double e_clip=1.0, 101 | unsigned int iters=3, 102 | inv_link= "identity", 103 | int threads= 0, 104 | int seed= 0, 105 | int verbose=1): 106 | 107 | self.alpha= alpha 108 | self.L2= L2 109 | self.e_noise= e_noise 110 | self.D= D 111 | self.D_nn= D_nn 112 | self.D_nn2= D_nn2 113 | self.init_nn= init_nn 114 | self.e_noise = e_noise 115 | self.e_clip = e_clip 116 | self.iters= iters 117 | if threads==0: threads= multiprocessing.cpu_count()-1 118 | self.threads= threads 119 | if inv_link=="sigmoid": self.inv_link= 1 120 | if inv_link=="identity": self.inv_link= 0 121 | self.seed = seed 122 | self.verbose = verbose 123 | self.reset() 124 | 125 | def reset(self): 126 | init_nn= self.init_nn 127 | D= self.D 128 | D_nn = self.D_nn 129 | D_nn2 = self.D_nn2 130 | rand= np.random.Generator(randomgen.xoroshiro128.Xoroshiro128(seed= self.seed)) 131 | self.w0 = (rand.random((D + 1) * D_nn) - 0.5) * init_nn 132 | self.w1 = (rand.random((D_nn + 1) * D_nn2) - 0.3) * init_nn 133 | self.w2 = (rand.rand(D_nn2 + 1) - 0.5) * init_nn 134 | self.z1= np.zeros((D_nn,), dtype=np.float64) 135 | self.z2= np.zeros((D_nn2,), dtype=np.float64) 136 | self.c0= np.zeros((D,), dtype=np.float64) 137 | self.c1= np.zeros((D_nn,), dtype=np.float64) 138 | self.c2= np.zeros((D_nn2,), dtype=np.float64) 139 | 140 | def predict(self, X, int threads= 0): 141 | if threads==0: threads= self.threads 142 | if type(X) != ssp.csr.csr_matrix: X= ssp.csr_matrix(X, dtype=np.float64) 143 | if X.shape[1] != self.D: 144 | print("Dimension mismatch! 
self.D=", self.D, "X.shape[1]=", X.shape[1]) 145 | return self.predict_f(X.data, X.indices, X.indptr, threads) 146 | 147 | def predict_f(self, np.ndarray[double, ndim=1, mode='c'] X_data, 148 | np.ndarray[int, ndim=1, mode='c'] X_indices, 149 | np.ndarray[int, ndim=1, mode='c'] X_indptr, int threads): 150 | cdef double alpha= self.alpha, L2= self.L2 151 | p= np.zeros(X_indptr.shape[0]-1, dtype= np.float64) 152 | cdef double *w0= &self.w0[0], *w1= &self.w1[0], *w2= &self.w2[0], *z1= &self.z1[0], *z2= &self.z2[0] 153 | cdef double[:] pp= p 154 | cdef int lenn, D= self.D, D_nn= self.D_nn, D_nn2= self.D_nn2, row_count= X_indptr.shape[0]-1, row, ptr 155 | for row in range(row_count): 156 | ptr= X_indptr[row] 157 | lenn= X_indptr[row + 1] - ptr 158 | inds= X_indices.data + ptr 159 | vals= X_data.data + ptr 160 | pp[row]= inv_link_f(predict_single(inds, vals, lenn, D, D_nn, D_nn2, w0, w1, w2, z1, z2, threads), \ 161 | self.inv_link) 162 | return p 163 | 164 | def partial_fit(self, X, y, int threads = 0, int seed = 0): 165 | return self.fit(X, y, threads=threads, seed=seed, reset=False) 166 | 167 | def fit(self, X, y, int threads= 0, int seed= 0, reset=True): 168 | if threads == 0: threads= self.threads 169 | if type(X) != ssp.csr.csr_matrix: X = ssp.csr_matrix(X, dtype=np.float64) 170 | if reset or self.D==0: 171 | self.D= X.shape[1] 172 | self.reset() 173 | elif X.shape[1] != self.D: 174 | print("Dimension mismatch! self.D=", self.D, "X.shape[1]=", X.shape[1]) 175 | if type(y) != np.array: y = np.array(y, dtype=np.float64) 176 | return self.fit_f(X.data, X.indices, X.indptr, y, threads, seed) 177 | 178 | def fit_f(self, np.ndarray[double, ndim=1, mode='c'] X_data, 179 | np.ndarray[int, ndim=1, mode='c'] X_indices, 180 | np.ndarray[int, ndim=1, mode='c'] X_indptr, 181 | np.ndarray[double, ndim=1, mode='c'] y, int threads, int seed): 182 | cdef double alpha= self.alpha, L2= self.L2, e_noise= self.e_noise, e, e_total= 0, e_clip= self.e_clip, abs_e 183 | cdef double *w0= &self.w0[0], *w1= &self.w1[0], *w2= &self.w2[0], *z1= &self.z1[0], *z2= &self.z2[0], \ 184 | *c0= &self.c0[0], *c1= &self.c1[0], *c2= &self.c2[0] 185 | cdef double *ys= y.data 186 | cdef unsigned int lenn, D= self.D, D_nn= self.D_nn, D_nn2= self.D_nn2, ptr, row_count= X_indptr.shape[0]-1, \ 187 | row, inv_link= self.inv_link, j=0, jj 188 | cdef int* inds, indptr 189 | cdef double* vals 190 | rand= np.random.Generator(randomgen.xoroshiro128.Xoroshiro128(seed= self.seed)) 191 | for iter in range(self.iters): 192 | e_total= 0.0 193 | for row in range(row_count): 194 | ptr= X_indptr[row] 195 | lenn= X_indptr[row+1]-ptr 196 | inds= X_indices.data+ptr 197 | vals= X_data.data+ptr 198 | e= inv_link_f(predict_single(inds, vals, lenn, D, D_nn, D_nn2, w0, w1, w2, z1, z2, threads), \ 199 | self.inv_link) -ys[row] 200 | abs_e= fabs(e) 201 | e_total+= abs_e 202 | e += (rand.rand() - 0.5) * e_noise 203 | if abs_e> e_clip: 204 | if e>0: e= e_clip 205 | else: e= -e_clip 206 | update_single(inds, vals, lenn, D, D_nn, D_nn2, e, alpha, L2, w0, w1, w2, z1, z2, c0, c1, c2, threads) 207 | if self.verbose > 0: print "Total e:", e_total 208 | return self 209 | 210 | def predict_layer(self, X, int layer, int threads= 0): 211 | if threads==0: threads= self.threads 212 | if type(X) != ssp.csr.csr_matrix: X= ssp.csr_matrix(X, dtype=np.float64) 213 | return self.predict_layer_f(X.data, X.indices, X.indptr, layer, threads) 214 | 215 | def predict_layer_f(self, np.ndarray[double, ndim=1, mode='c'] X_data, 216 | np.ndarray[int, ndim=1, mode='c'] X_indices, 217 | 
np.ndarray[int, ndim=1, mode='c'] X_indptr, int layer, int threads): 218 | cdef double alpha= self.alpha, L2= self.L2 219 | cdef double *w0= &self.w0[0], *w1= &self.w1[0], *w2= &self.w2[0], *z1= &self.z1[0], *z2= &self.z2[0] 220 | cdef unsigned int lenn, D= self.D, D_nn= self.D_nn, D_nn2= self.D_nn2, row_count= X_indptr.shape[0]-1, row, ptr 221 | if layer==1: p= np.zeros(((X_indptr.shape[0]-1),self.D_nn), dtype= np.float64) 222 | else: p= np.zeros(((X_indptr.shape[0]-1),self.D_nn2), dtype= np.float64) 223 | cdef double[:,:] pp= p 224 | for row in range(row_count): 225 | ptr= X_indptr[row] 226 | lenn= X_indptr[row + 1] - ptr 227 | inds= X_indices.data + ptr 228 | vals= X_data.data + ptr 229 | if layer==1: predict_single(inds, vals, lenn, D, D_nn, D_nn2, w0, w1, w2, &pp[row][0], z2, threads) 230 | else: predict_single(inds, vals, lenn, D, D_nn, D_nn2, w0, w1, w2, z1, &pp[row][0], threads) 231 | return p 232 | 233 | def __getstate__(self): 234 | return (self.alpha, 235 | self.L2, 236 | self.e_noise, 237 | self.e_clip, 238 | self.init_nn, 239 | self.D, 240 | self.D_nn, 241 | self.D_nn2, 242 | self.iters, 243 | self.threads, 244 | np.asarray(self.w1), 245 | np.asarray(self.w2), 246 | np.asarray(self.z1), 247 | np.asarray(self.z2), 248 | np.asarray(self.c0), 249 | np.asarray(self.c1), 250 | np.asarray(self.c2), 251 | self.inv_link, 252 | self.seed, 253 | self.verbose) 254 | 255 | def __setstate__(self, params): 256 | (self.alpha, 257 | self.L2, 258 | self.e_noise, 259 | self.e_clip, 260 | self.init_nn, 261 | self.D, 262 | self.D_nn, 263 | self.D_nn2, 264 | self.iters, 265 | self.threads, 266 | self.w1, 267 | self.w2, 268 | self.z1, 269 | self.z2, 270 | self.c0, 271 | self.c1, 272 | self.c2, 273 | self.inv_link, 274 | self.seed, 275 | self.verbose) = params 276 | -------------------------------------------------------------------------------- /wordbatch/pipelines/__init__.py: -------------------------------------------------------------------------------- 1 | from .apply import Apply, decorator_apply 2 | from .apply_batch import ApplyBatch, decorator_apply_batch 3 | from .apply_groupby import ApplyGroupBy, decorator_apply_groupby 4 | from .batch_transformer import BatchTransformer 5 | from .feature_union import FeatureUnion 6 | from .wordbatch import WordBatch -------------------------------------------------------------------------------- /wordbatch/pipelines/apply.py: -------------------------------------------------------------------------------- 1 | #!python 2 | from __future__ import with_statement 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from __future__ import print_function 6 | import pandas as pd 7 | import wordbatch.batcher 8 | 9 | def decorator_apply(func, batcher=None, cache=None, vectorize=None): 10 | def wrapper_func(*args, **kwargs): 11 | return Apply(func, args=args[1:], kwargs=kwargs, batcher=batcher, cache=cache, vectorize=vectorize)\ 12 | .transform(args[0]) 13 | return wrapper_func 14 | 15 | def batch_transform(args): 16 | f= args[1] 17 | f_args= args[2] 18 | f_kwargs= args[3] 19 | if args[5] is not None: 20 | from numba import vectorize 21 | return vectorize(args[5], fastmath=True)(f)(*zip(*args[0])) 22 | if args[4] is not None: 23 | from functools import lru_cache 24 | f= lru_cache(maxsize=args[4])(f) 25 | #Applying per DataFrame row is very slow, use ApplyBatch instead 26 | if isinstance(args[0], pd.DataFrame): return args[0].apply(lambda x: f(x, *f_args, **f_kwargs), axis=1) 27 | return [f(row, *f_args, **f_kwargs) for row in 
args[0]] 28 | 29 | class Apply(object): 30 | #Applies a function to each row of a minibatch 31 | def __init__(self, function, batcher=None, args=[], kwargs={}, cache=None, vectorize=None): 32 | if batcher is None: self.batcher= wordbatch.batcher.Batcher() 33 | else: self.batcher= batcher 34 | self.function= function 35 | self.args= [args] 36 | self.kwargs= [kwargs] 37 | self.cache = [cache] 38 | self.vectorize = [vectorize] 39 | 40 | def fit(self, data, input_split= False, batcher= None): 41 | return self 42 | 43 | def fit_transform(self, data, input_split=False, merge_output=True, minibatch_size=None, batcher=None): 44 | return self.transform(data, input_split, merge_output, minibatch_size, batcher) 45 | 46 | def transform(self, data, input_split=False, merge_output=True, minibatch_size=None, batcher=None): 47 | if batcher is None: batcher = self.batcher 48 | return batcher.process_batches(batch_transform, data, 49 | [self.function] + self.args + self.kwargs + self.cache + self.vectorize, 50 | input_split=input_split, merge_output=merge_output, 51 | minibatch_size= minibatch_size) 52 | 53 | # import wordbatch.batcher as batcher 54 | # b= batcher.Batcher(minibatch_size=2)#, method="serial") 55 | # import numpy as np 56 | # a= Apply(np.power, b, [2],{}) 57 | # print(a.transform([1, 2, 3, 4])) -------------------------------------------------------------------------------- /wordbatch/pipelines/apply_batch.py: -------------------------------------------------------------------------------- 1 | #!python 2 | from __future__ import with_statement 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from __future__ import print_function 6 | import wordbatch.batcher 7 | 8 | def decorator_apply_batch(func, batcher=None): 9 | def wrapper_func(*args, **kwargs): 10 | return ApplyBatch(func, args=args[1:], kwargs= kwargs, batcher= batcher).transform(args[0]) 11 | return wrapper_func 12 | 13 | def batch_transform(args): 14 | f= args[1] 15 | f_args= args[2] 16 | f_kwargs= args[3] 17 | return f(args[0], *f_args, **f_kwargs) 18 | 19 | class ApplyBatch(object): 20 | #Applies a function to the entire minibatch. Use this for example on Pandas dataframes, to avoid per-row overhead. 21 | #Function needs to be applicable to the array/list of values! 
22 | #If not, modify/wrap the function to process a list, or use Apply 23 | def __init__(self, function, batcher=None, args=[], kwargs={}): 24 | if batcher is None: self.batcher = wordbatch.batcher.Batcher() 25 | else: self.batcher = batcher 26 | self.function= function 27 | self.args= [args] 28 | self.kwargs= [kwargs] 29 | 30 | def fit(self, data, input_split= False): 31 | return self 32 | 33 | def fit_transform(self, data, input_split=False, merge_output=True, minibatch_size=None, batcher=None): 34 | return self.transform(data, input_split, merge_output, minibatch_size, batcher) 35 | 36 | def transform(self, data, input_split=False, merge_output=True, minibatch_size=None, batcher=None): 37 | if batcher is None: batcher = self.batcher 38 | return batcher.process_batches(batch_transform, data, [self.function] + self.args + self.kwargs, 39 | input_split=input_split, merge_output=merge_output, 40 | minibatch_size= minibatch_size) 41 | 42 | # import wordbatch.batcher as batcher 43 | # b= batcher.Batcher(minibatch_size=2)#, method="serial") 44 | # import numpy as np 45 | # a= ApplyBatch(np.power, b, [2],{}) 46 | # print(a.transform([1, 2, 3, 4])) -------------------------------------------------------------------------------- /wordbatch/pipelines/apply_groupby.py: -------------------------------------------------------------------------------- 1 | #!python 2 | import pandas as pd 3 | from wordbatch.pipelines import Apply 4 | import wordbatch.batcher 5 | 6 | def decorator_apply_groupby(func, group, batcher=None, rows_per_bin=200, cache=None, vectorize=None): 7 | def wrapper_func(*args, **kwargs): 8 | return ApplyGroupBy(func, args=args[1:], kwargs=kwargs, group=group, rows_per_bin=rows_per_bin, 9 | batcher=batcher, cache=cache, vectorize=vectorize).transform(args[0]) 10 | return wrapper_func 11 | 12 | class ApplyGroupBy(object): 13 | def __init__(self, function, group, batcher=None, rows_per_bin= 200, cache=None, vectorize=None, args=[], 14 | kwargs={}): 15 | if batcher is None: self.batcher= wordbatch.batcher.Batcher() 16 | else: self.batcher= batcher 17 | self.function= function 18 | self.group= group 19 | self.rows_per_bin = rows_per_bin 20 | self.cache= cache 21 | self.vectorize= vectorize 22 | self.args= [args] 23 | self.kwargs= [kwargs] 24 | 25 | def fit(self, data, input_split= False): 26 | return self 27 | 28 | def fit_transform(self, data, input_split= False, merge_output= True): 29 | return self.transform(data, input_split, merge_output) 30 | 31 | def transform(self, data, input_split= False, merge_output= True): 32 | bin_ids = data[self.group].unique() 33 | group_bins= {x:1 for x in bin_ids} if len(bin_ids) <= self.rows_per_bin else \ 34 | {x[0]: x[1] for x in zip(bin_ids, pd.qcut(bin_ids, len(bin_ids) // self.rows_per_bin))} 35 | group_bin_col = data[self.group].map(group_bins) 36 | bin_ids, groups = zip(*data.groupby(group_bin_col, as_index=False)) 37 | t= [x for x in Apply(self.function, self.batcher, *self.args, *self.kwargs, self.cache, 38 | self.vectorize).transform(groups, input_split, merge_output) 39 | if len(x) > 0] 40 | try: 41 | t= pd.concat(t, sort=False) # t is Series or DataFrame 42 | except: 43 | t= [item for sublist in t for item in sublist] # t is some iterable 44 | return t -------------------------------------------------------------------------------- /wordbatch/pipelines/batch_transformer.py: -------------------------------------------------------------------------------- 1 | #!python 2 | from __future__ import with_statement 3 | from __future__ import division 
4 | from __future__ import absolute_import 5 | from __future__ import print_function 6 | import wordbatch.batcher 7 | 8 | def batch_transform(args): 9 | return args[1].transform(args[0]) 10 | 11 | class BatchTransformer(object): 12 | def __init__(self, transformer, call_fit=True, batcher=None): 13 | if batcher is None: self.batcher = wordbatch.batcher.Batcher() 14 | else: self.batcher = batcher 15 | self.transformer= transformer 16 | self.call_fit= call_fit 17 | 18 | def fit(self, data, input_split=False, batcher=None): 19 | if batcher is None: batcher = self.batcher 20 | if self.call_fit: 21 | if input_split: self.transformer.fit(batcher.merge_batches(batcher.collect_batches(data))) 22 | else: self.transformer.fit(data) 23 | return self 24 | 25 | def fit_transform(self, data, input_split=False, merge_output=True, minibatch_size=None, batcher=None): 26 | if self.call_fit: self.fit(data, input_split= input_split) 27 | return self.transform(data, input_split, merge_output, minibatch_size= minibatch_size, batcher=batcher) 28 | 29 | def transform(self, data, input_split=False, merge_output=True, minibatch_size=None, batcher=None): 30 | if batcher is None: batcher = self.batcher 31 | return batcher.process_batches(batch_transform, data, [self.transformer], 32 | input_split=input_split, merge_output=merge_output, 33 | minibatch_size= minibatch_size) 34 | -------------------------------------------------------------------------------- /wordbatch/pipelines/feature_union.py: -------------------------------------------------------------------------------- 1 | from multiprocessing.pool import Pool 2 | 3 | import numpy as np 4 | from scipy import sparse 5 | from sklearn.base import TransformerMixin 6 | from sklearn.pipeline import FeatureUnion, _fit_one, _fit_transform_one, _transform_one, _name_estimators 7 | from sklearn.utils.metaestimators import _BaseComposition 8 | from wordbatch.pipelines import Apply 9 | 10 | def fit_one(args): 11 | X, y, transformer, fit_params = args 12 | return transformer.fit(X, y, **fit_params) 13 | 14 | def transform_one(args): 15 | X, transformer= args 16 | return transformer.transform(X) 17 | 18 | def fit_transform_one(args): 19 | X, y, transformer, fit_params = args 20 | return transformer.fit_transform(X, y, **fit_params), transformer 21 | 22 | class FeatureUnion(_BaseComposition, TransformerMixin): 23 | """Concatenates results of multiple transformer objects. 24 | 25 | This estimator applies a list of transformer objects in parallel to the 26 | input data, then concatenates the results. This is useful to combine 27 | several feature extraction mechanisms into a single transformer. 28 | 29 | Parameters of the transformers may be set using its name and the parameter 30 | name separated by a '__'. A transformer may be replaced entirely by 31 | setting the parameter with its name to another transformer, 32 | or removed by setting to 'drop' or ``None``. 33 | 34 | Read more in the :ref:`User Guide `. 35 | 36 | Parameters 37 | ---------- 38 | transformer_list : list of (string, transformer) tuples 39 | List of transformer objects to be applied to the data. The first 40 | half of each tuple is the name of the transformer. 41 | 42 | n_jobs : int or None, optional (default=None) 43 | Number of jobs to run in parallel. 44 | ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. 45 | ``-1`` means using all processors. See :term:`Glossary ` 46 | for more details. 
47 | 48 | transformer_weights : dict, optional 49 | Multiplicative weights for features per transformer. 50 | Keys are transformer names, values the weights. 51 | 52 | verbose : boolean, optional(default=False) 53 | If True, the time elapsed while fitting each transformer will be 54 | printed as it is completed. 55 | 56 | See also 57 | -------- 58 | sklearn.pipeline.make_union : convenience function for simplified 59 | feature union construction. 60 | 61 | Examples 62 | -------- 63 | >>> from sklearn.pipeline import FeatureUnion 64 | >>> from sklearn.decomposition import PCA, TruncatedSVD 65 | >>> union = FeatureUnion([("pca", PCA(n_components=1)), 66 | ... ("svd", TruncatedSVD(n_components=2))]) 67 | >>> X = [[0., 1., 3], [2., 2., 5]] 68 | >>> union.fit_transform(X) 69 | array([[ 1.5 , 3.0..., 0.8...], 70 | [-1.5 , 5.7..., -0.4...]]) 71 | """ 72 | _required_parameters = ["transformer_list"] 73 | 74 | def __init__(self, transformer_list, transformer_weights=None, batcher=None, concatenate=True): 75 | self.transformer_list = transformer_list 76 | self.transformer_weights = transformer_weights 77 | self._validate_transformers() 78 | self.batcher = batcher 79 | self.concatenate= concatenate 80 | 81 | def get_params(self, deep=True): 82 | """Get parameters for this estimator. 83 | 84 | Parameters 85 | ---------- 86 | deep : boolean, optional 87 | If True, will return the parameters for this estimator and 88 | contained subobjects that are estimators. 89 | 90 | Returns 91 | ------- 92 | params : mapping of string to any 93 | Parameter names mapped to their values. 94 | """ 95 | return self._get_params('transformer_list', deep=deep) 96 | 97 | def set_params(self, **kwargs): 98 | """Set the parameters of this estimator. 99 | 100 | Valid parameter keys can be listed with ``get_params()``. 101 | 102 | Returns 103 | ------- 104 | self 105 | """ 106 | self._set_params('transformer_list', **kwargs) 107 | return self 108 | 109 | def _validate_transformers(self): 110 | names, transformers = zip(*self.transformer_list) 111 | 112 | # validate names 113 | self._validate_names(names) 114 | 115 | # validate estimators 116 | for t in transformers: 117 | if t is None or t == 'drop': 118 | continue 119 | if (not (hasattr(t, "fit") or hasattr(t, "fit_transform")) or not 120 | hasattr(t, "transform")): 121 | raise TypeError("All estimators should implement fit and " 122 | "transform. '%s' (type %s) doesn't" % 123 | (t, type(t))) 124 | 125 | def _iter(self): 126 | """ 127 | Generate (name, trans, weight) tuples excluding None and 128 | 'drop' transformers. 129 | """ 130 | get_weight = (self.transformer_weights or {}).get 131 | return ((name, trans, get_weight(name)) 132 | for name, trans in self.transformer_list 133 | if trans is not None and trans != 'drop') 134 | 135 | def get_feature_names(self): 136 | """Get feature names from all transformers. 137 | 138 | Returns 139 | ------- 140 | feature_names : list of strings 141 | Names of the features produced by transform. 142 | """ 143 | feature_names = [] 144 | for name, trans, weight in self._iter(): 145 | if not hasattr(trans, 'get_feature_names'): 146 | raise AttributeError("Transformer %s (type %s) does not " 147 | "provide get_feature_names." 148 | % (str(name), type(trans).__name__)) 149 | feature_names.extend([name + "__" + f for f in 150 | trans.get_feature_names()]) 151 | return feature_names 152 | 153 | def fit(self, X, y=None, **fit_params): 154 | """Fit all transformers using X. 
155 | 156 | Parameters 157 | ---------- 158 | X : iterable or array-like, depending on transformers 159 | Input data, used to fit transformers. 160 | 161 | y : array-like, shape (n_samples, ...), optional 162 | Targets for supervised learning. 163 | 164 | Returns 165 | ------- 166 | self : FeatureUnion 167 | This estimator 168 | """ 169 | self.transformer_list = list(self.transformer_list) 170 | self._validate_transformers() 171 | paral_params = [[X[t['col_pick']] if hasattr(t, 'col_pick') else X, y, t, fit_params] 172 | for _, t, _ in self._iter()] 173 | transformers= Apply(fit_one, self.batcher).transform(paral_params) 174 | # with Pool(self.n_jobs) as pool: 175 | # transformers = pool.starmap(_fit_one, 176 | # ((trans, X[trans['col_pick']] if hasattr(trans, 'col_pick') else X, y) for _, trans, _ in self._iter())) 177 | self._update_transformer_list(transformers) 178 | return self 179 | 180 | def fit_transform(self, X, y=None, **fit_params): 181 | """Fit all transformers, transform the data and concatenate results. 182 | 183 | Parameters 184 | ---------- 185 | X : iterable or array-like, depending on transformers 186 | Input data to be transformed. 187 | 188 | y : array-like, shape (n_samples, ...), optional 189 | Targets for supervised learning. 190 | 191 | Returns 192 | ------- 193 | X_t : array-like or sparse matrix, shape (n_samples, sum_n_components) 194 | hstack of results of transformers. sum_n_components is the 195 | sum of n_components (output dimension) over transformers. 196 | """ 197 | self._validate_transformers() 198 | paral_params = [[X[t['col_pick']] if hasattr(t, 'col_pick') else X, y, t, fit_params] 199 | for _, t, _ in self._iter()] 200 | result = Apply(fit_transform_one, self.batcher).transform(paral_params) 201 | if not result: 202 | # All transformers are None 203 | return np.zeros((X.shape[0], 0)) 204 | Xs, transformers = zip(*result) 205 | self._update_transformer_list(transformers) 206 | if self.concatenate: 207 | if any(sparse.issparse(f) for f in Xs): 208 | Xs = sparse.hstack(Xs).tocsr() 209 | else: 210 | Xs = np.hstack(Xs) 211 | return Xs 212 | 213 | def transform(self, X): 214 | """Transform X separately by each transformer, concatenate results. 215 | 216 | Parameters 217 | ---------- 218 | X : iterable or array-like, depending on transformers 219 | Input data to be transformed. 220 | 221 | Returns 222 | ------- 223 | X_t : array-like or sparse matrix, shape (n_samples, sum_n_components) 224 | hstack of results of transformers. sum_n_components is the 225 | sum of n_components (output dimension) over transformers. 226 | """ 227 | paral_params = [[X[t['col_pick']] if hasattr(t, 'col_pick') else X, t] for _, t, _ in self._iter()] 228 | Xs = Apply(transform_one, self.batcher).transform(paral_params) 229 | if not Xs: 230 | # All transformers are None 231 | return np.zeros((X.shape[0], 0)) 232 | if self.concatenate: 233 | if any(sparse.issparse(f) for f in Xs): 234 | Xs = sparse.hstack(Xs).tocsr() 235 | else: 236 | Xs = np.hstack(Xs) 237 | return Xs 238 | 239 | def _update_transformer_list(self, transformers): 240 | transformers = iter(transformers) 241 | self.transformer_list[:] = [(name, None if old is None or old == 'drop' else next(transformers)) 242 | for name, old in self.transformer_list 243 | ] 244 | 245 | 246 | def make_union(*transformers, **kwargs): 247 | """Construct a FeatureUnion from the given transformers. 248 | 249 | This is a shorthand for the FeatureUnion constructor; it does not require, 250 | and does not permit, naming the transformers. 
Instead, they will be given 251 | names automatically based on their types. It also does not allow weighting. 252 | 253 | Parameters 254 | ---------- 255 | *transformers : list of estimators 256 | 257 | n_jobs : int, optional 258 | Number of jobs to run in parallel (default 1). 259 | 260 | Returns 261 | ------- 262 | f : FeatureUnion 263 | 264 | Examples 265 | -------- 266 | >>> from sklearn.decomposition import PCA, TruncatedSVD 267 | >>> from sklearn.pipeline import make_union 268 | >>> make_union(PCA(), TruncatedSVD()) # doctest: +NORMALIZE_WHITESPACE 269 | FeatureUnion(n_jobs=1, 270 | transformer_list=[('pca', 271 | PCA(copy=True, iterated_power='auto', 272 | n_components=None, random_state=None, 273 | svd_solver='auto', tol=0.0, whiten=False)), 274 | ('truncatedsvd', 275 | TruncatedSVD(algorithm='randomized', 276 | n_components=2, n_iter=5, 277 | random_state=None, tol=0.0))], 278 | transformer_weights=None) 279 | """ 280 | n_jobs = kwargs.pop('n_jobs', 1) 281 | concatenate = kwargs.pop('concatenate', True) 282 | if kwargs: 283 | # We do not currently support `transformer_weights` as we may want to 284 | # change its type spec in make_union 285 | raise TypeError('Unknown keyword arguments: "{}"' 286 | .format(list(kwargs.keys())[0])) 287 | return FeatureUnion(_name_estimators(transformers), n_jobs= n_jobs, concatenate= concatenate) 288 | -------------------------------------------------------------------------------- /wordbatch/pipelines/wordbatch.py: -------------------------------------------------------------------------------- 1 | #!python 2 | from __future__ import with_statement 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from __future__ import print_function 6 | import os 7 | import wordbatch.batcher 8 | import wordbatch.pipelines 9 | import wordbatch.transformers 10 | 11 | class WordBatch(object): 12 | def __init__(self, normalize_text="new", dictionary="new", tokenizer=None, extractor=None, 13 | freeze=False, fit_extractor= False, batcher=None, verbose=0): 14 | self.verbose= verbose 15 | if batcher is None: self.batcher= wordbatch.batcher.Batcher(verbose=verbose) 16 | else: self.batcher= batcher 17 | 18 | if normalize_text is None: self.normalize_text= None 19 | elif normalize_text== "new": self.normalize_text= wordbatch.transformers.text_normalizer.TextNormalizer() 20 | elif callable(normalize_text): 21 | self.normalize_text= wordbatch.pipelines.apply.Apply(normalize_text, batcher=batcher) 22 | else: self.normalize_text= normalize_text 23 | 24 | if tokenizer is None: self.tokenizer= None 25 | else: self.tokenizer= tokenizer 26 | 27 | if dictionary is None: self.dictionary= None 28 | elif dictionary== "new": self.dictionary= wordbatch.transformers.dictionary.Dictionary() 29 | else: self.dictionary= dictionary 30 | 31 | if extractor is None: self.extractor= None 32 | else: self.extractor= wordbatch.pipelines.batch_transformer.BatchTransformer(extractor, batcher=batcher, 33 | call_fit=fit_extractor) 34 | if self.extractor is not None: 35 | if hasattr(self.extractor.transformer, "dictionary"): 36 | self.extractor.transformer.dictionary = self.dictionary 37 | self.freeze= freeze 38 | 39 | def reset(self): 40 | if self.dictionary is not None: self.dictionary.reset() 41 | return self 42 | 43 | def process(self, data, y=None, input_split=False, reset=True, update=True, minibatch_size=None, batcher= None): 44 | if batcher is None: batcher = self.batcher 45 | if reset: self.reset() 46 | if self.freeze: update= False 47 | 48 | if 
self.normalize_text is not None: 49 | if self.verbose > 0: print("Normalize text") 50 | data= self.normalize_text.transform(data, input_split=input_split, merge_output=False, 51 | minibatch_size= minibatch_size, batcher= self.batcher) 52 | input_split= True 53 | if self.tokenizer is not None: 54 | if self.verbose > 0: print("Tokenize text") 55 | if update: data= self.tokenizer.fit_transform(data, y=y, input_split=input_split, merge_output=False, 56 | reset=reset, minibatch_size= minibatch_size, 57 | batcher=self.batcher) 58 | else: data= self.tokenizer.transform(data, y=y, input_split=input_split, merge_output=False, 59 | minibatch_size= minibatch_size, batcher=self.batcher) 60 | input_split= True 61 | if self.dictionary is not None: 62 | if self.verbose > 0: print("Use dictionary") 63 | if update: data= self.dictionary.fit_transform(data, y=y, input_split=input_split, merge_output=False, 64 | reset=reset, minibatch_size= minibatch_size, batcher=self.batcher) 65 | else: data= self.dictionary.transform(data, y=y, input_split=input_split, merge_output=False, 66 | minibatch_size= minibatch_size, batcher=self.batcher) 67 | if self.verbose> 2: print("len(self.dictionary.dft):", len(self.dictionary.dft)) 68 | return data 69 | 70 | def fit(self, data, y=None, input_split=False, reset=True, minibatch_size=None, batcher= None): 71 | if batcher is None: batcher = self.batcher 72 | self.process(data, y, input_split, reset=reset, update= True, minibatch_size= minibatch_size, batcher= batcher) 73 | if self.extractor is not None: 74 | self.extractor.fit(data, input_split=input_split) 75 | return self 76 | 77 | def transform(self, data, y=None, cache_features=None, input_split=False, reset=False, update=False, 78 | minibatch_size=None, batcher= None): 79 | if batcher is None: batcher = self.batcher 80 | if cache_features is not None: 81 | if self.extractor is not None and os.path.exists(cache_features) and \ 82 | hasattr(self.extractor.transformer, "load_features"): 83 | return self.extractor.transformer.load_features(cache_features) 84 | if not(input_split): data= batcher.split_batches(data, minibatch_size= minibatch_size) 85 | data= self.process(data, y=y, input_split=True, reset=reset, update=update) 86 | if self.extractor is not None: 87 | if self.verbose > 0: print("Extract features") 88 | if update: data= self.extractor.fit_transform(data, input_split=True, merge_output=True) 89 | else: data= self.extractor.transform(data, input_split=True, merge_output=True) 90 | if cache_features is not None and hasattr(self.extractor.transformer, "load_features"): 91 | self.extractor.transformer.save_features(cache_features, data) 92 | return data 93 | else: 94 | return batcher.merge_batches(data, batcher.backend) 95 | 96 | def partial_fit(self, data, y=None, input_split=False, minibatch_size=None, batcher= None): 97 | if batcher is None: batcher = self.batcher 98 | return self.fit(data, y, input_split, reset=False, minibatch_size= minibatch_size, batcher=batcher) 99 | 100 | def fit_transform(self, data, y=None, cache_features=None, input_split=False, reset=True, minibatch_size=None, 101 | batcher=None): 102 | if batcher is None: batcher = self.batcher 103 | return self.transform(data, y, cache_features, input_split, reset, update=True, minibatch_size= minibatch_size, 104 | batcher=batcher) 105 | 106 | def partial_fit_transform(self, data, y=None, cache_features=None, input_split=False, minibatch_size=None, 107 | batcher= None): 108 | if batcher is None: batcher = self.batcher 109 | return self.transform(data, 
y, cache_features, input_split, reset=False, update=True, 110 | minibatch_size= minibatch_size, batcher=batcher) 111 | 112 | def __getstate__(self): 113 | return dict((k, v) for (k, v) in self.__dict__.items()) 114 | 115 | def __setstate__(self, params): 116 | for key in params: setattr(self, key, params[key]) 117 | -------------------------------------------------------------------------------- /wordbatch/transformers/__init__.py: -------------------------------------------------------------------------------- 1 | from .dictionary import Dictionary 2 | from .tokenizer import Tokenizer 3 | from .text_normalizer import TextNormalizer 4 | -------------------------------------------------------------------------------- /wordbatch/transformers/dictionary.py: -------------------------------------------------------------------------------- 1 | #!python 2 | from __future__ import with_statement 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from __future__ import print_function 6 | from collections import defaultdict 7 | import operator 8 | 9 | def batch_get_dfs(args): 10 | dft= defaultdict(int) 11 | for text in args[0]: 12 | if type(text)==str: 13 | for word in set(text.split(" ")): dft[word]+= 1 14 | else: 15 | dft[str(text)]+= 1 16 | return [dict(dft), len(args[0])] 17 | 18 | class Dictionary(object): 19 | def __init__(self, min_df=0, max_df=1.0, max_words= 10000000000000, freeze= False, encode=True, verbose=0): 20 | self.verbose = verbose 21 | self.freeze = freeze 22 | self.max_words = max_words 23 | self.min_df = min_df 24 | self.max_df = max_df 25 | self.encode= encode 26 | self.word2id= None 27 | self.reset() 28 | 29 | def reset(self): 30 | if self.encode: self.word2id = {} 31 | self.dft = {} 32 | self.doc_count = 0 33 | return self 34 | 35 | def get_pruning_dft(self, dft): 36 | sorted_dft = sorted(list(dft.items()), key=operator.itemgetter(1,0), reverse=True) 37 | if type(self.min_df) == type(1): min_df2 = self.min_df 38 | else: min_df2 = self.doc_count * self.min_df 39 | if type(self.max_df) == type(1): max_df2 = self.max_df 40 | else: max_df2 = self.doc_count * self.max_df 41 | return sorted_dft, min_df2, max_df2 42 | 43 | def prune_dictionary(self, max_words=None, min_df=None, max_df=None, re_encode= False, prune_dfs= True, 44 | set_max_words= True): 45 | #Prune dictionary. 
Optionally prune document frequency table as well 46 | if max_words is not None: self.max_words= max_words 47 | if min_df is not None: self.min_df= min_df 48 | if max_df is not None: self.max_df= max_df 49 | max_words= self.max_words 50 | word2id = self.word2id 51 | dft = self.dft 52 | sorted_dft, min_df2, max_df2 = self.get_pruning_dft(dft) 53 | c= 0 54 | #print(len(sorted_dft), len(self.word2id), len(self.raw_dft)) 55 | for word, df in sorted_dft: 56 | if word2id is not None: 57 | if word not in word2id: 58 | if re_encode: word2id[word]= -1 59 | else: continue 60 | c+= 1 61 | if c > max_words or df < min_df2 or df > max_df2: 62 | if prune_dfs: dft.pop(word) 63 | if word2id is not None: word2id.pop(word) 64 | elif re_encode: 65 | if word2id is not None: word2id[word]= c 66 | if set_max_words and word2id is not None: self.max_words= len(word2id) 67 | 68 | def fit(self, data, y=None, input_split= False, reset= False, minibatch_size=None, batcher= None): 69 | if reset: self.reset() 70 | if self.word2id is None: 71 | self.word2id = {} 72 | word2id= self.word2id 73 | if batcher is None: dfts, doc_counts= zip(*batch_get_dfs(data)) 74 | else: 75 | # import wordbatch.pipelines 76 | # dfts, doc_counts = zip(*batcher.collect_batches( 77 | # wordbatch.pipelines.apply_batch.ApplyBatch(get_dfs, batcher=batcher).transform( 78 | # data, input_split=input_split, merge_output=False) 79 | # )) 80 | dfts, doc_counts= zip(*batcher.collect_batches( 81 | batcher.process_batches(batch_get_dfs, data, [], input_split= input_split, merge_output=False, 82 | minibatch_size=minibatch_size))) 83 | self.doc_count += sum(doc_counts) 84 | dft = defaultdict(int, self.dft) 85 | for dft2 in dfts: 86 | for k, v in dft2.items(): dft[k] += v 87 | if word2id is not None: 88 | #Add entries. 
Online pruning only used to prevent inclusion into dictionary 89 | sorted_dft, min_df2, max_df2 = self.get_pruning_dft(dft) 90 | for word, df in sorted_dft: 91 | if len(word2id)>= self.max_words: break 92 | if dfmax_df2: continue 93 | if word in word2id: continue 94 | word2id[word] = len(word2id)+1 95 | if self.verbose>2: print("Add word to dictionary:", word, dft[word], word2id[word]) 96 | self.dft= dict(dft) 97 | return self 98 | 99 | def partial_fit(self, data, y=None, input_split=False, minibatch_size=None, batcher=None): 100 | return self.fit(data, y, input_split, reset=False, minibatch_size=minibatch_size, batcher=batcher) 101 | 102 | def fit_transform(self, data, y=None, input_split= False, merge_output= True, reset= True, minibatch_size=None, 103 | batcher= None): 104 | self.fit(data, y=y, input_split= input_split, reset=reset, minibatch_size= minibatch_size, batcher=batcher) 105 | return self.transform(data, y=y, input_split= input_split, merge_output= merge_output, batcher= None) 106 | 107 | def partial_fit_transform(self, data, y=None, input_split=False, minibatch_size=None, 108 | batcher=None): 109 | return self.transform(data, y, input_split, reset=False, update=True, batcher=batcher) 110 | 111 | def transform(self, data, y=None, input_split= False, merge_output= True, minibatch_size=None, 112 | batcher= None): 113 | if input_split and merge_output and batcher is not None: data= batcher.merge_batches(data) 114 | return data -------------------------------------------------------------------------------- /wordbatch/transformers/text_normalizer.py: -------------------------------------------------------------------------------- 1 | #!python 2 | from __future__ import with_statement 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from __future__ import print_function 6 | import re 7 | import wordbatch.batcher 8 | import wordbatch.pipelines 9 | import wordbatch.transformers 10 | 11 | def batch_transform(args): 12 | return args[1].batch_transform(args[0]) 13 | 14 | non_alphanums= re.compile(u'[^A-Za-z0-9]+') 15 | def default_normalize_text(text): 16 | return u" ".join([x for x in [y for y in non_alphanums.sub(' ', text).lower().strip().split(" ")] if len(x)>1]) 17 | 18 | class TextNormalizer(object): 19 | def __init__(self, normalize_text= default_normalize_text, freeze= False, verbose= 1): 20 | self.verbose= verbose 21 | self.freeze= freeze 22 | self.non_alphanums = re.compile(u'[^A-Za-z0-9]+') 23 | self.normalize_text= normalize_text 24 | self.reset() 25 | 26 | def reset(self): 27 | return self 28 | 29 | def batch_transform(self, data): return [self.normalize_text(text) for text in data] 30 | 31 | def transform(self, data, input_split=False, merge_output=True, minibatch_size= None, batcher=None): 32 | if batcher is None: batcher = wordbatch.batcher.Batcher() 33 | return batcher.process_batches(batch_transform, data, [self], input_split=input_split, 34 | merge_output=merge_output, minibatch_size= minibatch_size) 35 | 36 | def fit(self, data, y=None, input_split=False, merge_output=True, minibatch_size= None, batcher=None): 37 | return self 38 | 39 | def fit_transform(self, data, y=None, input_split=False, merge_output=True, 40 | minibatch_size= None, batcher=None): 41 | return self.transform(data, input_split, merge_output, minibatch_size, batcher) -------------------------------------------------------------------------------- /wordbatch/transformers/tokenizer.py: -------------------------------------------------------------------------------- 1 
| #!python 2 | from __future__ import with_statement 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from __future__ import print_function 6 | #from nltk.metrics import edit_distance 7 | import Levenshtein #python-Levenshtein 8 | from collections import defaultdict 9 | 10 | def batch_get_dfs(args): 11 | dft= defaultdict(int) 12 | for text in args[0]: 13 | for word in set(text.split(" ")): dft[word]+= 1 14 | return [dict(dft), len(args[0])] 15 | 16 | def correct_spelling(word, dft, corrections_index, spellcor_count, spellcor_dist): 17 | #T. Bocek, E. Hunt, B. Stiller: Fast Similarity Search in Large Dictionaries, 2007 18 | if dft.get(word, 0)>spellcor_count or len(word)<3: return word 19 | max_df= -100000000000000 20 | max_word= word 21 | spell_suggestions= get_deletions(word, spellcor_dist) 22 | candidates= {} 23 | for x in spell_suggestions: 24 | if x in corrections_index: 25 | for y in corrections_index[x]: candidates[y]= 1 26 | #for word2 in list(candidates.keys()): 27 | for word2 in candidates: 28 | #score= edit_distance(word, word2, True) 29 | score= Levenshtein.distance(word, word2) 30 | if score>spellcor_dist: continue 31 | #score = float(dft[word2]) / score 32 | score= dft[word2] 33 | #score = Levenshtein.jaro_winkler(word, word2) 34 | #score= dft[word2]*Levenshtein.jaro_winkler(word, word2) 35 | if score > max_df: 36 | max_df= score 37 | max_word= word2 38 | return max_word 39 | 40 | def batch_correct_spellings(args): 41 | corrs= args[1] 42 | return [u" ".join([corrs.get(word, word) for word in text.split(" ")]) for text in args[0]] 43 | 44 | def get_deletions(word, order): 45 | stack = {word: order} 46 | results = {} 47 | while len(stack) > 0: 48 | stack2 = {} 49 | for word2 in stack: 50 | order2 = stack[word2] - 1 51 | for x in range(len(word2)): 52 | if order2 != 0: stack2[word2[:x] + word2[x + 1:]] = order2 53 | results[word2[:x] + word2[x + 1:]] = 1 54 | stack = stack2 55 | return list(results.keys()) 56 | 57 | def make_corrections_index(dft, spellcor_count, spellcor_dist): 58 | dft2 = {w[0]: w[1] for w in dft.items() if w[1] > spellcor_count} 59 | corrections_index = defaultdict(list) 60 | for word in dft2: 61 | if len(word) > 15: continue 62 | for word2 in get_deletions(word, spellcor_dist): 63 | corrections_index[word2].append(word) 64 | return corrections_index 65 | 66 | class Tokenizer(object): 67 | def __init__(self, spellcor_count=0, spellcor_dist=2, stemmer= None, freeze= False, verbose= 0): 68 | self.verbose= verbose 69 | self.freeze= freeze 70 | if spellcor_count == 0: 71 | spellcor_dist = 0 72 | elif spellcor_dist == 0: 73 | spellcor_count = 0 74 | self.spellcor_count = spellcor_count 75 | self.spellcor_dist = spellcor_dist 76 | self.stemmer = stemmer 77 | self.reset() 78 | 79 | def reset(self): 80 | self.dft = {} 81 | self.doc_count = 0 82 | return self 83 | 84 | def fit(self, data, y= None, input_split= False, reset= True, minibatch_size=None, batcher= None): 85 | if reset: self.reset() 86 | if self.freeze: return self 87 | if batcher is None: dfts, doc_counts= zip(*batch_get_dfs(data)) 88 | else: 89 | dfts, doc_counts= zip(*batcher.collect_batches( 90 | batcher.process_batches(batch_get_dfs, data, [], input_split= input_split, merge_output=False, 91 | minibatch_size=minibatch_size))) 92 | self.doc_count += sum(doc_counts) 93 | dft = defaultdict(int, self.dft) 94 | for dft2 in dfts: 95 | for k, v in dft2.items(): dft[k] += v 96 | self.dft= dict(dft) 97 | return self 98 | 99 | def partial_fit(self, data, y=None, 
input_split=False, minibatch_size=None, batcher=None): 100 | return self.fit(data, y, input_split, reset=False, minibatch_size= minibatch_size, batcher=batcher) 101 | 102 | def fit_transform(self, data, y=None, input_split= False, merge_output= True, reset= True, 103 | minibatch_size=None, batcher= None): 104 | self.fit(data, y=y, input_split= input_split, reset=reset, minibatch_size= minibatch_size, batcher=batcher) 105 | return self.transform(data, y=y, input_split=input_split, merge_output=merge_output, 106 | minibatch_size=minibatch_size, batcher=batcher) 107 | 108 | def partial_fit_transform(self, data, y=None, input_split=False, minibatch_size=None, batcher=None): 109 | return self.transform(data, y, input_split, reset=False, update=True, minibatch_size=minibatch_size, 110 | batcher=batcher) 111 | 112 | def transform(self, X, y=None, input_split= False, merge_output= True, minibatch_size=None, batcher= None): 113 | if self.verbose > 0: print("Make word normalization dictionary") 114 | do_corrections= 1 if (self.spellcor_count > 0) and (self.spellcor_dist>0) else 0 115 | if not(do_corrections) and self.stemmer is None: return X 116 | if do_corrections: 117 | corrections_index= make_corrections_index(self.dft, self.spellcor_count, self.spellcor_dist) 118 | if self.stemmer is not None: 119 | if do_corrections: 120 | corrs = {word: self.stemmer.stem(correct_spelling( 121 | word, self.dft, corrections_index, self.spellcor_count, self.spellcor_dist)) for word in self.dft} 122 | else: corrs = {word: self.stemmer.stem(word) for word in self.dft} 123 | elif do_corrections: 124 | corrs = {word: correct_spelling( 125 | word, self.dft, corrections_index, self.spellcor_count, self.spellcor_dist) for word in self.dft} 126 | corrs = {key: value for key, value in corrs.items() if key != value} 127 | if self.verbose > 0: print("Make word normalizations") 128 | if batcher is None: return batch_correct_spellings(X) 129 | return batcher.process_batches(batch_correct_spellings, X, [corrs], input_split=input_split, 130 | merge_output=merge_output, minibatch_size= minibatch_size) 131 | --------------------------------------------------------------------------------
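The commented-out snippets at the bottom of apply.py and apply_batch.py sketch the intended entry points for the pipeline classes. The following is a minimal usage sketch along those lines, illustrative only and not a file in this repository; it assumes the package installs as wordbatch, so that wordbatch.batcher.Batcher and the Apply/ApplyBatch re-exports from wordbatch/pipelines/__init__.py import exactly as laid out above.

# Minimal usage sketch (illustrative only; mirrors the commented examples in apply.py / apply_batch.py).
import numpy as np
import wordbatch.batcher as batcher
from wordbatch.pipelines import Apply, ApplyBatch

b = batcher.Batcher(minibatch_size=2)   # split the input into minibatches of two rows
a = Apply(np.power, b, [2], {})         # calls np.power(row, 2) on each row of a minibatch
print(a.transform([1, 2, 3, 4]))        # element-wise squares; the exact container depends on the Batcher merge step

ab = ApplyBatch(np.power, b, [2], {})   # calls np.power(batch, 2) once per minibatch
print(ab.transform([1, 2, 3, 4]))       # same squares, with far less per-row overhead

As the comment in apply.py notes, applying a function per DataFrame row is very slow, so ApplyBatch (or the numba vectorize option of Apply) is the better fit whenever the function already accepts a whole array or frame.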