├── .circleci └── config.yml ├── .gitignore ├── .readthedocs.yml ├── LICENSE ├── MANIFEST.in ├── README.md ├── docs ├── Makefile ├── codes.rst ├── conf.py ├── faq.rst ├── images │ ├── nnlib_clstm.png │ └── nnlib_cnn.png ├── index.rst ├── install.rst ├── intro.rst ├── links.rst ├── news.rst ├── refs.rst ├── requirements.txt ├── scripts.rst ├── tutorial.rst ├── tutorial_charbaseonehot.rst ├── tutorial_charbaseseq2seq.rst ├── tutorial_dataprep.rst ├── tutorial_dtm.rst ├── tutorial_maxent.rst ├── tutorial_metrics.rst ├── tutorial_nnlib.rst ├── tutorial_spell.rst ├── tutorial_stacking.rst ├── tutorial_sumvec.rst ├── tutorial_textpreprocessing.rst ├── tutorial_topic.rst └── tutorial_wordembed.rst ├── pyproject.toml ├── shorttext ├── __init__.py ├── classifiers │ ├── __init__.py │ ├── bow │ │ ├── __init__.py │ │ ├── maxent │ │ │ ├── MaxEntClassification.py │ │ │ └── __init__.py │ │ └── topic │ │ │ ├── SkLearnClassification.py │ │ │ ├── TopicVectorDistanceClassification.py │ │ │ └── __init__.py │ └── embed │ │ ├── __init__.py │ │ ├── nnlib │ │ ├── VarNNEmbedVecClassification.py │ │ ├── __init__.py │ │ └── frameworks.py │ │ └── sumvec │ │ ├── SumEmbedVecClassification.py │ │ ├── VarNNSumEmbedVecClassification.py │ │ ├── __init__.py │ │ └── frameworks.py ├── cli │ ├── __init__.py │ ├── categorization.py │ └── wordembedsim.py ├── data │ ├── __init__.py │ ├── data_retrieval.py │ └── shorttext_exampledata.csv ├── generators │ ├── __init__.py │ ├── bow │ │ ├── AutoEncodingTopicModeling.py │ │ ├── GensimTopicModeling.py │ │ ├── LatentTopicModeling.py │ │ └── __init__.py │ ├── charbase │ │ ├── __init__.py │ │ └── char2vec.py │ └── seq2seq │ │ ├── __init__.py │ │ ├── charbaseS2S.py │ │ └── s2skeras.py ├── metrics │ ├── __init__.py │ ├── dynprog │ │ ├── __init__.py │ │ ├── dldist.py │ │ ├── jaccard.py │ │ └── lcp.py │ ├── embedfuzzy │ │ ├── __init__.py │ │ └── jaccard.py │ ├── transformers │ │ ├── __init__.py │ │ └── bertscore.py │ └── wasserstein │ │ ├── __init__.py │ │ └── wordmoverdist.py ├── smartload.py ├── spell │ ├── __init__.py │ ├── basespellcorrector.py │ ├── binarize.py │ ├── editor.py │ ├── norvig.py │ └── sakaguchi.py ├── stack │ ├── __init__.py │ └── stacking.py └── utils │ ├── __init__.py │ ├── classification_exceptions.py │ ├── compactmodel_io.py │ ├── dtm.py │ ├── gensim_corpora.py │ ├── kerasmodel_io.py │ ├── misc.py │ ├── nonneg_stopwords.txt │ ├── stopwords.txt │ ├── textpreprocessing.py │ ├── transformers.py │ └── wordembed.py └── test ├── __init__.py ├── test_charonehot.py ├── test_dtm.py ├── test_fuzzylogic.py ├── test_norvigspell.py ├── test_sakaguchispell.py ├── test_stacking.py ├── test_textpreprocessing.py ├── test_var_nn_embedded_vec_classifier.py └── test_wmd.py /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | 4 | shared: &shared 5 | working_directory: ~/shorttext 6 | 7 | steps: 8 | - checkout 9 | 10 | - run: 11 | name: Apt Install 12 | command: | 13 | sudo apt-get update 14 | sudo apt-get install libc6 15 | sudo apt-get install python3-dev 16 | sudo apt-get install -y g++ 17 | 18 | - run: 19 | name: Installing Miniconda and Packages 20 | command: | 21 | pip install --upgrade --user pip 22 | pip install --upgrade --user google-compute-engine 23 | pip install --user . 
24 | 25 | - run: 26 | name: Run Unit Tests 27 | command: | 28 | pip install --user .[test] 29 | pytest 30 | 31 | 32 | jobs: 33 | py39: 34 | <<: *shared 35 | docker: 36 | - image: cimg/python:3.9 37 | 38 | py310: 39 | <<: *shared 40 | docker: 41 | - image: cimg/python:3.10 42 | 43 | py311: 44 | <<: *shared 45 | docker: 46 | - image: cimg/python:3.11 47 | 48 | py312: 49 | <<: *shared 50 | docker: 51 | - image: cimg/python:3.12 52 | 53 | 54 | workflows: 55 | version: 2 56 | build: 57 | jobs: 58 | - py39 59 | - py310 60 | - py311 61 | - py312 62 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Build documentation in the docs/ directory with Sphinx 9 | sphinx: 10 | configuration: docs/conf.py 11 | 12 | build: 13 | os: ubuntu-22.04 14 | tools: 15 | python: "3.10" 16 | 17 | # Build documentation with MkDocs 18 | #mkdocs: 19 | # configuration: mkdocs.yml 20 | 21 | # Optionally build your docs in additional formats such as PDF and ePub 22 | formats: all 23 | 24 | # Optionally set the version of Python and requirements required to build your docs 25 | python: 26 | install: 27 | - requirements: docs/requirements.txt 28 | 29 | # conda environment 30 | #conda: 31 | # environment: environment.yml -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2016 Kwan Yuet Stephen Ho 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include LICENSE 3 | include pyproject.toml 4 | include shorttext/data/shorttext_exampledata.csv 5 | include shorttext/utils/stopwords.txt 6 | include shorttext/utils/nonneg_stopwords.txt 7 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 
5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 21 | 22 | .PHONY: help 23 | help: 24 | @echo "Please use \`make ' where is one of" 25 | @echo " html to make standalone HTML files" 26 | @echo " dirhtml to make HTML files named index.html in directories" 27 | @echo " singlehtml to make a single large HTML file" 28 | @echo " pickle to make pickle files" 29 | @echo " json to make JSON files" 30 | @echo " htmlhelp to make HTML files and a HTML help project" 31 | @echo " qthelp to make HTML files and a qthelp project" 32 | @echo " applehelp to make an Apple Help Book" 33 | @echo " devhelp to make HTML files and a Devhelp project" 34 | @echo " epub to make an epub" 35 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 36 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 37 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 38 | @echo " text to make text files" 39 | @echo " man to make manual pages" 40 | @echo " texinfo to make Texinfo files" 41 | @echo " info to make Texinfo files and run them through makeinfo" 42 | @echo " gettext to make PO message catalogs" 43 | @echo " changes to make an overview of all changed/added/deprecated items" 44 | @echo " xml to make Docutils-native XML files" 45 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 46 | @echo " linkcheck to check all external links for integrity" 47 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 48 | @echo " coverage to run coverage check of the documentation (if enabled)" 49 | 50 | .PHONY: clean 51 | clean: 52 | rm -rf $(BUILDDIR)/* 53 | 54 | .PHONY: html 55 | html: 56 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 57 | @echo 58 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 59 | 60 | .PHONY: dirhtml 61 | dirhtml: 62 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 63 | @echo 64 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 65 | 66 | .PHONY: singlehtml 67 | singlehtml: 68 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 69 | @echo 70 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 71 | 72 | .PHONY: pickle 73 | pickle: 74 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 75 | @echo 76 | @echo "Build finished; now you can process the pickle files." 77 | 78 | .PHONY: json 79 | json: 80 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 81 | @echo 82 | @echo "Build finished; now you can process the JSON files." 
83 | 84 | .PHONY: htmlhelp 85 | htmlhelp: 86 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 87 | @echo 88 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 89 | ".hhp project file in $(BUILDDIR)/htmlhelp." 90 | 91 | .PHONY: qthelp 92 | qthelp: 93 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 94 | @echo 95 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 96 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 97 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/shorttext.qhcp" 98 | @echo "To view the help file:" 99 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/shorttext.qhc" 100 | 101 | .PHONY: applehelp 102 | applehelp: 103 | $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp 104 | @echo 105 | @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." 106 | @echo "N.B. You won't be able to view it unless you put it in" \ 107 | "~/Library/Documentation/Help or install it in your application" \ 108 | "bundle." 109 | 110 | .PHONY: devhelp 111 | devhelp: 112 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 113 | @echo 114 | @echo "Build finished." 115 | @echo "To view the help file:" 116 | @echo "# mkdir -p $$HOME/.local/share/devhelp/shorttext" 117 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/shorttext" 118 | @echo "# devhelp" 119 | 120 | .PHONY: epub 121 | epub: 122 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 123 | @echo 124 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 125 | 126 | .PHONY: latex 127 | latex: 128 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 129 | @echo 130 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 131 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 132 | "(use \`make latexpdf' here to do that automatically)." 133 | 134 | .PHONY: latexpdf 135 | latexpdf: 136 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 137 | @echo "Running LaTeX files through pdflatex..." 138 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 139 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 140 | 141 | .PHONY: latexpdfja 142 | latexpdfja: 143 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 144 | @echo "Running LaTeX files through platex and dvipdfmx..." 145 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 146 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 147 | 148 | .PHONY: text 149 | text: 150 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 151 | @echo 152 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 153 | 154 | .PHONY: man 155 | man: 156 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 157 | @echo 158 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 159 | 160 | .PHONY: texinfo 161 | texinfo: 162 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 163 | @echo 164 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 165 | @echo "Run \`make' in that directory to run these through makeinfo" \ 166 | "(use \`make info' here to do that automatically)." 167 | 168 | .PHONY: info 169 | info: 170 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 171 | @echo "Running Texinfo files through makeinfo..." 172 | make -C $(BUILDDIR)/texinfo info 173 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 
174 | 175 | .PHONY: gettext 176 | gettext: 177 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 178 | @echo 179 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 180 | 181 | .PHONY: changes 182 | changes: 183 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 184 | @echo 185 | @echo "The overview file is in $(BUILDDIR)/changes." 186 | 187 | .PHONY: linkcheck 188 | linkcheck: 189 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 190 | @echo 191 | @echo "Link check complete; look for any errors in the above output " \ 192 | "or in $(BUILDDIR)/linkcheck/output.txt." 193 | 194 | .PHONY: doctest 195 | doctest: 196 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 197 | @echo "Testing of doctests in the sources finished, look at the " \ 198 | "results in $(BUILDDIR)/doctest/output.txt." 199 | 200 | .PHONY: coverage 201 | coverage: 202 | $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage 203 | @echo "Testing of coverage in the sources finished, look at the " \ 204 | "results in $(BUILDDIR)/coverage/python.txt." 205 | 206 | .PHONY: xml 207 | xml: 208 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 209 | @echo 210 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 211 | 212 | .PHONY: pseudoxml 213 | pseudoxml: 214 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 215 | @echo 216 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 217 | -------------------------------------------------------------------------------- /docs/codes.rst: -------------------------------------------------------------------------------- 1 | API 2 | === 3 | 4 | API unlisted in tutorials are listed here. 5 | 6 | Shorttext Models Smart Loading 7 | ------------------------------ 8 | 9 | .. automodule:: shorttext.smartload 10 | :members: 11 | 12 | Supervised Classification using Word Embedding 13 | ---------------------------------------------- 14 | 15 | Module `shorttext.generators.seq2seq.s2skeras` 16 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 17 | 18 | .. automodule:: shorttext.generators.seq2seq.s2skeras 19 | :members: 20 | 21 | 22 | Module `shorttext.classifiers.embed.sumvec.VarNNSumEmbedVecClassification` 23 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 24 | 25 | .. automodule:: shorttext.classifiers.embed.sumvec.VarNNSumEmbedVecClassification 26 | :members: 27 | 28 | 29 | Neural Networks 30 | --------------- 31 | 32 | Module `shorttext.classifiers.embed.sumvec.frameworks` 33 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 34 | 35 | .. automodule:: shorttext.classifiers.embed.sumvec.frameworks 36 | :members: 37 | 38 | 39 | Utilities 40 | --------- 41 | 42 | Module `shorttext.utils.kerasmodel_io` 43 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 44 | 45 | .. automodule:: shorttext.utils.kerasmodel_io 46 | :members: 47 | 48 | Module `shorttext.utils.gensim_corpora` 49 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 50 | 51 | .. automodule:: shorttext.utils.gensim_corpora 52 | :members: 53 | 54 | Module `shorttext.utils.compactmodel_io` 55 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 56 | 57 | .. automodule:: shorttext.utils.compactmodel_io 58 | :members: 59 | 60 | 61 | Metrics 62 | ------- 63 | 64 | Module `shorttext.metrics.dynprog` 65 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 66 | 67 | .. automodule:: shorttext.metrics.dynprog.jaccard 68 | :members: 69 | 70 | .. 
automodule:: shorttext.metrics.dynprog.dldist 71 | :members: 72 | 73 | .. automodule:: shorttext.metrics.dynprog.lcp 74 | :members: 75 | 76 | Module `shorttext.metrics.wasserstein` 77 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 78 | 79 | .. automodule:: shorttext.metrics.wasserstein.wordmoverdist 80 | :members: word_mover_distance_linprog 81 | 82 | Spell Correction 83 | ---------------- 84 | 85 | Module `shorttext.spell` 86 | ^^^^^^^^^^^^^^^^^^^^^^^^ 87 | 88 | .. automodule:: shorttext.spell.basespellcorrector 89 | :members: 90 | 91 | 92 | 93 | 94 | 95 | 96 | Home: :doc:`index` -------------------------------------------------------------------------------- /docs/faq.rst: -------------------------------------------------------------------------------- 1 | Frequently Asked Questions (FAQ) 2 | ================================ 3 | 4 | **Q1. Can we use backends other than TensorFlow?** 5 | 6 | Ans: No. 7 | 8 | 9 | **Q2. Can we use word-embedding algorithms other than Word2Vec?** 10 | 11 | Ans: Yes. Besides Word2Vec, you can use FastText and Poincaré embedding. See: :doc:`tutorial_wordembed` . 12 | 13 | 14 | **Q3. Can this package work on Python 2?** 15 | 16 | Ans: No. 17 | 18 | 19 | 20 | **Q4. How should I cite `shorttext` if I use it in my research?** 21 | 22 | Ans: For the time being, you do not have to cite a particular paper for using this package. 23 | However, if you use any particular function or class, check out the docstring. If there is a paper (or papers) 24 | mentioned, cite those papers. For example, if you use `CNNWordEmbed` in `frameworks 25 | `_, 26 | according to the docstring, cite Yoon Kim's paper. Refer to this documentation for the references too. 27 | 28 | 29 | 30 | **Q5. I am having trouble installing `shorttext` on Google Cloud Platform. What should I do?** 31 | 32 | Ans: The header file "Python.h" is missing. Run `sudo apt-get install python3-dev` in the SSH shell of the VM instance. 33 | 34 | **Q8. My model files were created by `shorttext` version < 2.0.0. How do I make them readable for version >= 2.0.0?** 35 | 36 | Ans: Simply rename those files whose names end with `.h5` so that they end with `.weights.h5`. 37 | 38 | 39 | 40 | Home: :doc:`index` 41 | -------------------------------------------------------------------------------- /docs/images/nnlib_clstm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenhky/PyShortTextCategorization/a7caf4edeb86b3b69a56632d24fa7ee56d12621d/docs/images/nnlib_clstm.png -------------------------------------------------------------------------------- /docs/images/nnlib_cnn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenhky/PyShortTextCategorization/a7caf4edeb86b3b69a56632d24fa7ee56d12621d/docs/images/nnlib_cnn.png -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. shorttext documentation master file, created by 2 | sphinx-quickstart on Fri Nov 11 18:11:01 2016. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Homepage of `shorttext` 7 | ======================= 8 | 9 | This repository is a collection of algorithms for multi-class classification of short texts using Python. 10 | Modules are backward compatible unless otherwise specified.
Feel free to give suggestions or report 11 | issues through the Issue_ tab of the Github_ page. This is a PyPI_ project. It is an open-source 12 | project under the `MIT License 13 | `_ . 14 | 15 | Contents: 16 | 17 | .. toctree:: 18 | :maxdepth: 1 19 | 20 | intro 21 | install 22 | tutorial 23 | scripts 24 | codes 25 | faq 26 | refs 27 | links 28 | news 29 | 30 | .. _Github: https://github.com/stephenhky/PyShortTextCategorization 31 | .. _Issue: https://github.com/stephenhky/PyShortTextCategorization/issues 32 | .. _PyPI: https://pypi.org/project/shorttext/ 33 | 34 | Indices and tables 35 | ================== 36 | 37 | * :ref:`genindex` 38 | * :ref:`modindex` 39 | * :ref:`search` 40 | 41 | -------------------------------------------------------------------------------- /docs/install.rst: -------------------------------------------------------------------------------- 1 | Installation 2 | ============ 3 | 4 | PIP 5 | --- 6 | 7 | Package `shorttext` runs in Python 3.9, 3.10, 3.11, and 3.12. The backend 8 | of keras_ is Tensorflow_. 9 | 10 | To install the package in Linux or OS X, enter the following in the console: 11 | 12 | :: 13 | 14 | pip install shorttext 15 | 16 | It is possible that you have to do it as root, i.e., you have to add ``sudo`` in 17 | front of the command. 18 | 19 | On the other hand, to get the development version, you can install it directly from Github_: 20 | 21 | :: 22 | 23 | pip install git+https://github.com/stephenhky/PyShortTextCategorization@master 24 | 25 | 26 | Backend for Keras 27 | ----------------- 28 | 29 | We use TensorFlow for `keras`. 30 | 31 | Possible Solutions for Installation Failures 32 | -------------------------------------------- 33 | 34 | Most developers can install `shorttext` with the instructions above. If the installation fails, 35 | you may try one (or more) of the following: 36 | 37 | 1. Installing `python3-dev` by typing: 38 | 39 | 40 | :: 41 | 42 | apt-get install python3-dev 43 | 44 | 45 | 46 | 2. Installing `libc6` by entering 47 | 48 | :: 49 | 50 | apt-get install libc6 51 | 52 | 53 | 54 | .. _Github: https://github.com/stephenhky/PyShortTextCategorization 55 | 56 | 57 | Home: :doc:`index` 58 | 59 | .. _Numpy: http://www.numpy.org/ 60 | .. _SciPy: https://www.scipy.org/ 61 | .. _Scikit-Learn: http://scikit-learn.org/stable/ 62 | .. _Tensorflow: https://www.tensorflow.org/ 63 | .. _Theano: http://deeplearning.net/software/theano/ 64 | .. _CNTK: https://github.com/Microsoft/CNTK/wiki 65 | .. _keras: https://keras.io/ 66 | .. _gensim: https://radimrehurek.com/gensim/ 67 | .. _Pandas: http://pandas.pydata.org/ 68 | .. _snowballstemmer: https://github.com/snowballstem/snowball 69 | .. _Joblib: https://joblib.readthedocs.io/en/latest/ -------------------------------------------------------------------------------- /docs/intro.rst: -------------------------------------------------------------------------------- 1 | Introduction 2 | ============ 3 | 4 | The package `shorttext` is a Python package that facilitates supervised and unsupervised 5 | learning for short text categorization. Due to the sparseness of words and 6 | the lack of information carried in the short texts themselves, an intermediate 7 | representation of the texts and documents is needed before they are put into 8 | any classification algorithm. This package facilitates various types 9 | of these representations, including topic modeling and word-embedding algorithms.
10 | 11 | The package `shorttext` runs on Python 3.9, 3.10, 3.11, and 3.12. 12 | 13 | Characteristics: 14 | 15 | - example data provided (including subject keywords and NIH RePORT); (see :doc:`tutorial_dataprep`) 16 | - text preprocessing; (see :doc:`tutorial_textpreprocessing`) 17 | - pre-trained word-embedding support; (see :doc:`tutorial_wordembed`) 18 | - `gensim` topic models (LDA, LSI, Random Projections) and autoencoder; (see :doc:`tutorial_topic`) 19 | - topic model representation supported for supervised learning using `scikit-learn`; (see :doc:`tutorial_topic`) 20 | - cosine distance classification; (see :doc:`tutorial_topic`, :doc:`tutorial_sumvec`) 21 | - neural network classification (including ConvNet, and C-LSTM); (see :doc:`tutorial_nnlib`) 22 | - maximum entropy classification; (see :doc:`tutorial_maxent`) 23 | - metrics of phrases differences, including soft Jaccard score (using Damerau-Levenshtein distance), and Word Mover's distance (WMD); (see :doc:`tutorial_metrics`) 24 | - character-level sequence-to-sequence (seq2seq) learning; (see :doc:`tutorial_charbaseseq2seq`) 25 | - spell correction; (see :doc:`tutorial_spell`) 26 | - Sentence encodings and similarities based on BERT (see :doc:`tutorial_wordembed` and :doc:`tutorial_metrics`). 27 | 28 | Author: Kwan Yuet Stephen Ho (LinkedIn_, ResearchGate_, Twitter_) 29 | Other contributors: `Chinmaya Pancholi `_, `Minseo Kim `_ 30 | 31 | Home: :doc:`index` 32 | 33 | .. _LinkedIn: https://www.linkedin.com/in/kwan-yuet-ho-19882530 34 | .. _ResearchGate: https://www.researchgate.net/profile/Kwan-yuet_Ho 35 | .. _Twitter: https://twitter.com/stephenhky 36 | -------------------------------------------------------------------------------- /docs/links.rst: -------------------------------------------------------------------------------- 1 | Links 2 | ===== 3 | 4 | Project Codes and Package 5 | ------------------------- 6 | 7 | - Github_ 8 | - PyPI_ 9 | 10 | .. _Github: https://github.com/stephenhky/PyShortTextCategorization 11 | 12 | .. _PyPI: https://pypi.org/project/shorttext/ 13 | 14 | Issues 15 | ------ 16 | 17 | To report bugs and issues, please go to Issues_. 18 | 19 | .. _Issues: https://github.com/stephenhky/PyShortTextCategorization/issues 20 | 21 | Gensim Incubator 22 | ---------------- 23 | 24 | Chinmaya Pancholi, a student in Indian Institute of Technology, Kharagpur, is supported 25 | by Google Summer of Code (GSoC) project to support the open-source project for `gensim`. 26 | Part of his project is to employ the wrapping ideas in `shorttext` to integrate `keras`, 27 | `scikit-learn` and `gensim`. 28 | 29 | Chinmaya's blog posts: `https://rare-technologies.com/author/chinmaya/ 30 | `_ 31 | 32 | Chinmaya's proposal for GSoC: `https://github.com/numfocus/gsoc/blob/master/2017/proposals/Chinmaya_Pancholi.md 33 | `_ 34 | 35 | 36 | Blog Entries 37 | ------------ 38 | 39 | "R or Python on Text Mining," *Everything About Data Analytics*, WordPress (2015). [`WordPress 40 | `_] 41 | 42 | "Short Text Categorization using Deep Neural Networks and Word-Embedding Models," *Everything About Data Analytics*, WordPress (2015). [`WordPress 43 | `_] 44 | (A code demonstration can be found in an early version of the Github repository for this package: `here 45 | `_) 46 | 47 | "Toying with Word2Vec," *Everything About Data Analytics*, WordPress (2015). [`WordPress 48 | `_] 49 | 50 | "Probabilistic Theory of Word Embeddings: GloVe," *Everything About Data Analytics*, WordPress (2016). 
[`WordPress 51 | `_] 52 | 53 | "Word-Embedding Algorithms," *Everything About Data Analytics*, WordPress (2016). [`WordPress 54 | `_] 55 | 56 | "Python Package for Short Text Mining," *Everything About Data Analytics*, WordPress (2016). [`WordPress 57 | `_] 58 | 59 | "Short Text Mining using Advanced Keras Layers and Maxent: shorttext 0.4.1," *Everything About Data Analytics*, WordPress (2017). [`WordPress 60 | `_] 61 | 62 | "Word Mover’s Distance as a Linear Programming Problem," *Everything About Data Analytics*, WordPress (2017). [`WordPress 63 | `_] 64 | 65 | "Release of shorttext 0.5.4," *Everything About Data Analytics*, WordPress (2017). [`WordPress 66 | `_] 67 | 68 | "Document-Term Matrix: Text Mining in R and Python," *Everything About Data Analytics*, WordPress (2018). [`WordPress 69 | `_] 70 | 71 | "Package shorttext 1.0.0 Released," Medium (2018). [`Medium 72 | `_] 73 | 74 | Home: :doc:`index` -------------------------------------------------------------------------------- /docs/refs.rst: -------------------------------------------------------------------------------- 1 | References 2 | ========== 3 | 4 | Adam L. Berger, Stephen A. Della Pietra, Vincent J. Della Pietra, "A Maximum Entropy Approach to Natural Language Processing," *Computational Linguistics* 22(1): 39-72 (1996). [`ACM 5 | `_] 6 | 7 | Aurelien Geron, *Hands-On Machine Learning with Scikit-Learn and TensorFlow* (Sebastopol, CA: O'Reilly Media, 2017). [`O\'Reilly 8 | `_] 9 | 10 | Chinmaya Pancholi, "Gensim integration with scikit-learn and Keras," *Google Summer of Codes* (GSoC) proposal (2017). [`Github 11 | `_] 12 | 13 | Chinmaya Pancholi, "Chinmaya’s GSoC 2017 Summary: Integration with sklearn & Keras and implementing fastText," *RaRe Incubator* (September 2, 2017). [`RaRe 14 | `_] 15 | 16 | Christopher Manning, Hinrich Schütze, *Foundations of Statistical Natural Language Processing* (Cambridge, MA: MIT Press, 1999). [`MIT Press 17 | `_] 18 | 19 | Christopher D. Manning, Prabhakar Raghavan, Hinrich Schütze, *Introduction to Information Retrieval* (Cambridge, MA: Cambridge University Press, 2008). [`StanfordNLP 20 | `_] 21 | 22 | Chunting Zhou, Chonglin Sun, Zhiyuan Liu, Francis Lau, "A C-LSTM Neural Network for Text Classification," (arXiv:1511.08630). [`arXiv 23 | `_] 24 | 25 | Daniel E. Russ, Kwan-Yuet Ho, Calvin A. Johnson, Melissa C. Friesen, "Computer-Based Coding of Occupation Codes for Epidemiological Analyses," *2014 IEEE 27th International Symposium on Computer-Based Medical Systems* (CBMS), pp. 347-350. (2014) [`IEEE 26 | `_] 27 | 28 | Daniel E. Russ, Kwan-Yuet Ho, Joanne S. Colt, Karla R. Armenti, Dalsu Baris, Wong-Ho Chow, Faith Davis, Alison Johnson, Mark P. Purdue, Margaret R. Karagas, Kendra Schwartz, Molly Schwenn, Debra T. Silverman, Patricia A. Stewart, Calvin A. Johnson, Melissa C. Friesen, “Computer-based coding of free-text job descriptions to efficiently and reliably incorporate occupational risk factors into large-scale epidemiological studies”, *Occup. Environ. Med.* 73, 417-424 (2016). [`BMJ 29 | `_] 30 | 31 | Daniel Russ, Kwan-yuet Ho, Melissa Friesen, "It Takes a Village To Solve A Problem in Data Science," Data Science Maryland, presentation at Applied Physics Laboratory (APL), Johns Hopkins University, on June 19, 2017. (2017) [`Slideshare 32 | `_] 33 | 34 | David H. Wolpert, "Stacked Generalization," *Neural Netw* 5: 241-259 (1992). 35 | 36 | David M. Blei, "Probabilistic Topic Models," *Communications of the ACM* 55(4): 77-84 (2012). 
[`ACM 37 | `_] 38 | 39 | Francois Chollet, "A ten-minute introduction to sequence-to-sequence learning in Keras," *The Keras Blog*. [`Keras 40 | `_] 41 | 42 | Francois Chollet, "Building Autoencoders in Keras," *The Keras Blog*. [`Keras 43 | `_] 44 | 45 | Hsiang-Fu Yu, Chia-Hua Ho, Yu-Chin Juan, Chih-Jen Lin, "LibShortText: A Library for Short-text Classification." [`NTU 46 | `_] 47 | 48 | Ilya Sutskever, James Martens, Geoffrey Hinton, "Generating Text with Recurrent Neural Networks," *ICML* (2011). [`UToronto 49 | `_] 50 | 51 | Ilya Sutskever, Oriol Vinyals, Quoc V. Le, "Sequence to Sequence Learning with Neural Networks," arXiv:1409.3215 (2014). [`arXiv 52 | `_] 53 | 54 | Jayant Jain, "Implementing Poincaré Embeddings," RaRe Technologies (2017). [`RaRe 55 | `_] 56 | 57 | Jeffrey Pennington, Richard Socher, Christopher D. Manning, “GloVe: Global Vectors for Word Representation,” *Empirical Methods in Natural Language Processing (EMNLP)*, pp. 1532-1543 (2014). [`PDF 58 | `_] 59 | 60 | Keisuke Sakaguchi, Kevin Duh, Matt Post, Benjamin Van Durme, "Robsut Wrod Reocginiton via semi-Character Recurrent Neural Networ," arXiv:1608.02214 (2016). [`arXiv 61 | `_] 62 | 63 | "Keras 2.0 Release Notes." (2017) [`Github 64 | `_] 65 | 66 | Matt J. Kusner, Yu Sun, Nicholas I. Kolkin, Kilian Q. Weinberger, "From Word Embeddings to Document Distances," *ICML* (2015). 67 | 68 | Maximilian Nickel, Douwe Kiela, "Poincaré Embeddings for Learning Hierarchical Representations," arXiv:1705.08039 (2017). [`arXiv 69 | `_] 70 | 71 | Michael Czerny, "Modern Methods for Sentiment Analysis," *District Data Labs (2015). [`DistrictDataLabs 72 | `_] 73 | 74 | M. Paz Sesmero, Agapito I. Ledezma, Araceli Sanchis, "Generating ensembles of heterogeneous classifiers using Stacked Generalization," 75 | *WIREs Data Mining and Knowledge Discovery* 5: 21-34 (2015). 76 | 77 | Nal Kalchbrenner, Edward Grefenstette, Phil Blunsom, "A Convolutional Neural Network for Modelling Sentences," *Proceedings of the 52nd Annual Meeting of the Association for Computational Linguistics*, pp. 655-665 (2014). [`arXiv 78 | `_] 79 | 80 | Oriol Vinyals, Quoc Le, "A Neural Conversational Model," arXiv:1506.05869 (2015). [`arXiv 81 | `_] 82 | 83 | Peter Norvig, "How to write a spell corrector." (2016) [`Norvig 84 | `_] 85 | 86 | Piotr Bojanowski, Edouard Grave, Armand Joulin, Tomas Mikolov, "Enriching Word Vectors with Subword Information," arXiv:1607.04606 (2016). [`arXiv 87 | `_] 88 | 89 | Radim Rehurek, Petr Sojka, "Software Framework for Topic Modelling with Large Corpora," In Proceedings of LREC 2010 workshop New Challenges for NLP Frameworks (2010). [`ResearchGate 90 | `_] 91 | 92 | Sebastian Ruder, "An overview of gradient descent optimization algorithms," blog of Sebastian Ruder, arXiv:1609.04747 (2016). [`Ruder 93 | `_ or `arXiv 94 | `_] 95 | 96 | Tal Perry, "Convolutional Methods for Text," *Medium* (2017). [`Medium 97 | `_] 98 | 99 | Thomas W. Jones, "textmineR: Functions for Text Mining and Topic Modeling," CRAN Project. [`CRAN 100 | `_ or `Github 101 | `_] 102 | 103 | Tomas Mikolov, Kai Chen, Greg Corrado, Jeffrey Dean, “Efficient Estimation of Word Representations in Vector Space,” *ICLR* 2013 (2013). [`arXiv 104 | `_] 105 | 106 | Tom Young, Devamanyu Hazarika, Soujanya Poria, Erik Cambria, "Recent Trends in Deep Learning Based Natural Language Processing," arXiv:1708.02709 (2017). 
[`arXiv 107 | `_] 108 | 109 | Xuan Hieu Phan, Cam-Tu Nguyen, Dieu-Thu Le, Minh Le Nguyen, Susumu Horiguchi, Quang-Thuy Ha, 110 | "A Hidden Topic-Based Framework toward Building Applications with Short Web Documents," 111 | *IEEE Trans. Knowl. Data Eng.* 23(7): 961-976 (2011). 112 | 113 | Xuan Hieu Phan, Le-Minh Nguyen, Susumu Horiguchi, "Learning to Classify Short and Sparse Text & Web withHidden Topics from Large-scale Data Collections," 114 | WWW '08 Proceedings of the 17th international conference on World Wide Web. (2008) [`ACL 115 | `_] 116 | 117 | Yoon Kim, "Convolutional Neural Networks for Sentence Classification," *EMNLP* 2014, 1746-1751 (arXiv:1408.5882). [`arXiv 118 | `_] 119 | 120 | Zackary C. Lipton, John Berkowitz, "A Critical Review of Recurrent Neural Networks for Sequence Learning," arXiv:1506.00019 (2015). [`arXiv 121 | `_] 122 | 123 | 124 | Home: :doc:`index` -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==2.2.6 2 | scipy==1.15.3 3 | joblib==1.5.1 4 | scikit-learn==1.7.0 5 | tensorflow==2.19.0 6 | keras==3.10.0 7 | gensim==4.3.3 8 | pandas==2.3.0 9 | snowballstemmer==3.0.1 10 | transformers==4.52.4 11 | torch==2.7.1 12 | numba==0.61.2 13 | -------------------------------------------------------------------------------- /docs/scripts.rst: -------------------------------------------------------------------------------- 1 | Console Scripts 2 | =============== 3 | 4 | This package provides two scripts. 5 | 6 | The development of the scripts is *not stable* yet, and more scripts will be added. 7 | 8 | ShortTextCategorizerConsole 9 | --------------------------- 10 | 11 | :: 12 | 13 | usage: ShortTextCategorizerConsole [-h] [--wv WV] [--vecsize VECSIZE] 14 | [--topn TOPN] [--inputtext INPUTTEXT] 15 | [--type TYPE] 16 | model_filepath 17 | 18 | Perform prediction on short text with a given trained model. 19 | 20 | positional arguments: 21 | model_filepath Path of the trained (compact) model. 22 | 23 | options: 24 | -h, --help show this help message and exit 25 | --wv WV Path of the pre-trained Word2Vec model. (None if not 26 | needed) 27 | --vecsize VECSIZE Vector dimensions. (Default: 300) 28 | --topn TOPN Number of top-scored results displayed. (Default: 10) 29 | --inputtext INPUTTEXT 30 | single input text for classification. Run console if 31 | set to None. (Default: None) 32 | --type TYPE Type of word-embedding model (default: "word2vec"; 33 | other options: "fasttext", "poincare", 34 | "word2vec_nonbinary", "poincare_binary") 35 | 36 | 37 | ShortTextWordEmbedSimilarity 38 | ---------------------------- 39 | 40 | :: 41 | 42 | usage: ShortTextWordEmbedSimilarity [-h] [--type TYPE] modelpath 43 | 44 | Find the similarities between two short sentences using Word2Vec. 45 | 46 | positional arguments: 47 | modelpath Path of the Word2Vec model 48 | 49 | optional arguments: 50 | -h, --help show this help message and exit 51 | --type TYPE Type of word-embedding model (default: "word2vec"; other 52 | options: "fasttext", "poincare") 53 | 54 | 55 | Home: :doc:`index` 56 | -------------------------------------------------------------------------------- /docs/tutorial.rst: -------------------------------------------------------------------------------- 1 | Tutorial 2 | ======== 3 | 4 | After installation, you are ready to start testing the convenience and power 5 | of the package. 
6 | 7 | Before using, type 8 | 9 | >>> import shorttext 10 | 11 | .. toctree:: 12 | :maxdepth: 2 13 | 14 | tutorial_dataprep 15 | tutorial_textpreprocessing 16 | tutorial_dtm 17 | tutorial_charbaseonehot 18 | tutorial_topic 19 | tutorial_wordembed 20 | tutorial_sumvec 21 | tutorial_nnlib 22 | tutorial_maxent 23 | tutorial_charbaseseq2seq 24 | tutorial_stacking 25 | tutorial_metrics 26 | tutorial_spell 27 | 28 | 29 | Home: :doc:`index` 30 | -------------------------------------------------------------------------------- /docs/tutorial_charbaseonehot.rst: -------------------------------------------------------------------------------- 1 | Character to One-Hot Vector 2 | =========================== 3 | 4 | Since version 0.6.1, the package `shorttext` deals with character-based models. A first important 5 | component of a character-based model is to convert every character to a one-hot vector. We provide a class 6 | :class:`shorttext.generators.SentenceToCharVecEncoder` to deal with this. This class incorporates 7 | the `OneHotEncoder` in `scikit-learn` and `Dictionary` in `gensim`. 8 | 9 | To use this, import the packages first: 10 | 11 | >>> import numpy as np 12 | >>> import shorttext 13 | 14 | Then we incorporate a text file as the source of all characters to be coded. In this case, we choose 15 | the file `big.txt` on Peter Norvig's website: 16 | 17 | >>> from urllib.request import urlopen 18 | >>> textfile = urlopen('http://norvig.com/big.txt') 19 | 20 | Then instantiate the class using the function :func:`shorttext.generators.initSentenceToCharVecEncoder`: 21 | 22 | >>> chartovec_encoder = shorttext.generators.initSentenceToCharVecEncoder(textfile) 23 | 24 | Now, the object `chartovec_encoder` is an instance of :class:`shorttext.generators.SentenceToCharVecEncoder` . The 25 | default signal character is `\n`, which is also encoded, and can be checked by looking at the field: 26 | 27 | >>> chartovec_encoder.signalchar 28 | 29 | We can convert a sentence into a bunch of one-hot vectors in terms of a matrix. For example, 30 | 31 | >>> chartovec_encoder.encode_sentence('Maryland blue crab!', 100) 32 | <1x93 sparse matrix of type '' 33 | with 1 stored elements in Compressed Sparse Column format> 34 | 35 | This outputs a sparse matrix. Depending on your needs, you can add the signal character to the beginning 36 | or the end of the sentence in the output matrix by: 37 | 38 | >>> chartovec_encoder.encode_sentence('Maryland blue crab!', 100, startsig=True, endsig=False) 39 | >>> chartovec_encoder.encode_sentence('Maryland blue crab!', 100, startsig=False, endsig=True) 40 | 41 | We can also convert a list of sentences by 42 | 43 | >>> chartovec_encoder.encode_sentences(sentences, 100, startsig=False, endsig=True, sparse=False) 44 | 45 | You can decide whether or not to output a sparse matrix by specifying the parameter `sparse`. 46 | 47 | 48 | .. automodule:: shorttext.generators.charbase.char2vec 49 | :members: 50 | 51 | 52 | Reference 53 | --------- 54 | 55 | Aurelien Geron, *Hands-On Machine Learning with Scikit-Learn and TensorFlow* (Sebastopol, CA: O'Reilly Media, 2017).
[`O\'Reilly 56 | `_] 57 | 58 | Home: :doc:`index` -------------------------------------------------------------------------------- /docs/tutorial_charbaseseq2seq.rst: -------------------------------------------------------------------------------- 1 | Character-Based Sequence-to-Sequence (seq2seq) Models 2 | ===================================================== 3 | 4 | Since release 0.6.0, `shorttext` supports sequence-to-sequence (seq2seq) learning. While there is a general seq2seq class 5 | behind the scenes, the package provides a character-based seq2seq implementation. 6 | 7 | Creating One-hot Vectors 8 | ------------------------ 9 | 10 | To use it, create an instance of the class :class:`shorttext.generators.SentenceToCharVecEncoder`: 11 | 12 | >>> import numpy as np 13 | >>> import shorttext 14 | >>> from urllib.request import urlopen 15 | >>> chartovec_encoder = shorttext.generators.initSentenceToCharVecEncoder(urlopen('http://norvig.com/big.txt')) 16 | 17 | The above code is the same as :doc:`tutorial_charbaseonehot` . 18 | 19 | .. automodule:: shorttext.generators.charbase.char2vec 20 | :members: initSentenceToCharVecEncoder 21 | 22 | 23 | Training 24 | -------- 25 | 26 | Then we can train the model by creating an instance of :class:`shorttext.generators.CharBasedSeq2SeqGenerator`: 27 | 28 | >>> latent_dim = 100 29 | >>> seq2seqer = shorttext.generators.CharBasedSeq2SeqGenerator(chartovec_encoder, latent_dim, 120) 30 | 31 | And then train this neural network model, where `text` is a string holding the training text (e.g., the content of `big.txt`): 32 | 33 | >>> seq2seqer.train(text, epochs=100) 34 | 35 | This model takes several hours to train on a laptop. 36 | 37 | 38 | .. autoclass:: shorttext.generators.seq2seq.charbaseS2S.CharBasedSeq2SeqGenerator 39 | :members: 40 | 41 | Decoding 42 | -------- 43 | 44 | After training, we can use this class as a generative model 45 | to answer questions, like a chatbot: 46 | 47 | >>> seq2seqer.decode('Happy Holiday!') 48 | 49 | It does not give definite answers because there is stochasticity in the prediction. 50 | 51 | Model I/O 52 | --------- 53 | 54 | This model can be saved by entering: 55 | 56 | >>> seq2seqer.save_compact_model('/path/to/norvigtxt_iter5model.bin') 57 | 58 | And it can be loaded by: 59 | 60 | >>> seq2seqer2 = shorttext.generators.seq2seq.charbaseS2S.loadCharBasedSeq2SeqGenerator('/path/to/norvigtxt_iter5model.bin') 61 | 62 | .. automodule:: shorttext.generators.seq2seq.charbaseS2S 63 | :members: loadCharBasedSeq2SeqGenerator 64 | 65 | 66 | Reference 67 | --------- 68 | 69 | Aurelien Geron, *Hands-On Machine Learning with Scikit-Learn and TensorFlow* (Sebastopol, CA: O'Reilly Media, 2017). [`O\'Reilly 70 | `_] 71 | 72 | Ilya Sutskever, James Martens, Geoffrey Hinton, "Generating Text with Recurrent Neural Networks," *ICML* (2011). [`UToronto 73 | `_] 74 | 75 | Ilya Sutskever, Oriol Vinyals, Quoc V. Le, "Sequence to Sequence Learning with Neural Networks," arXiv:1409.3215 (2014). [`arXiv 76 | `_] 77 | 78 | Oriol Vinyals, Quoc Le, "A Neural Conversational Model," arXiv:1506.05869 (2015). [`arXiv 79 | `_] 80 | 81 | Tom Young, Devamanyu Hazarika, Soujanya Poria, Erik Cambria, "Recent Trends in Deep Learning Based Natural Language Processing," arXiv:1708.02709 (2017). [`arXiv 82 | `_] 83 | 84 | Zackary C. Lipton, John Berkowitz, "A Critical Review of Recurrent Neural Networks for Sequence Learning," arXiv:1506.00019 (2015).
[`arXiv 85 | `_] 86 | 87 | -------------------------------------------------------------------------------- /docs/tutorial_dataprep.rst: -------------------------------------------------------------------------------- 1 | Data Preparation 2 | ================ 3 | 4 | This package deals with short text. While the text data for predictions or 5 | classifications are simply `str` or lists of `str`, the training data does 6 | take a specific format: a `dict`, the Python dictionary (or hash 7 | map). The package provides a few example datasets. 8 | 9 | Example Training Data 1: Subject Keywords 10 | ----------------------------------------- 11 | 12 | The first example dataset is about the subject keywords, which can be loaded by: 13 | 14 | >>> trainclassdict = shorttext.data.subjectkeywords() 15 | 16 | This returns a dictionary, with keys being the labels and the values being lists of 17 | the subject keywords, as below: 18 | 19 | :: 20 | 21 | {'mathematics': ['linear algebra', 'topology', 'algebra', 'calculus', 22 | 'variational calculus', 'functional field', 'real analysis', 'complex analysis', 23 | 'differential equation', 'statistics', 'statistical optimization', 'probability', 24 | 'stochastic calculus', 'numerical analysis', 'differential geometry'], 25 | 'physics': ['renormalization', 'classical mechanics', 'quantum mechanics', 26 | 'statistical mechanics', 'functional field', 'path integral', 27 | 'quantum field theory', 'electrodynamics', 'condensed matter', 28 | 'particle physics', 'topological solitons', 'astrophysics', 29 | 'spontaneous symmetry breaking', 'atomic molecular and optical physics', 30 | 'quantum chaos'], 31 | 'theology': ['divine providence', 'soteriology', 'anthropology', 'pneumatology', 'Christology', 32 | 'Holy Trinity', 'eschatology', 'scripture', 'ecclesiology', 'predestination', 33 | 'divine degree', 'creedal confessionalism', 'scholasticism', 'prayer', 'eucharist']} 34 | 35 | 36 | .. automodule:: shorttext.data.data_retrieval 37 | :members: subjectkeywords 38 | 39 | Example Training Data 2: NIH RePORT 40 | ----------------------------------- 41 | 42 | The second example dataset is from NIH RePORT (Research Portfolio Online Reporting Tools). 43 | The data can be downloaded from its `ExPORTER 44 | `_ page. The current data in this package was directly 45 | adapted from Thomas Jones' `textMineR 46 | `_ R package. 47 | 48 | Enter: 49 | 50 | >>> trainclassdict = shorttext.data.nihreports() 51 | 52 | Upon the installation of the package, the NIH RePORT data are still not 53 | installed, but the first time this function is run, the data will be downloaded from the Internet. 54 | 55 | This will output a similar dictionary with FUNDING_IC (Institutes and Centers in NIH) 56 | as the class labels, and PROJECT_TITLE (titles of the funded projects) 57 | as the short texts under the corresponding labels. This dictionary has 512 projects in total, 58 | randomly drawn from the original data. 59 | 60 | However, there are other configurations: 61 | 62 | .. automodule:: shorttext.data.data_retrieval 63 | :members: nihreports 64 | 65 | 66 | Example Training Data 3: Inaugural Addresses 67 | -------------------------------------------- 68 | 69 | This contains all the Inaugural Addresses of all the Presidents of the United States, from 70 | George Washington to Barack Obama. Upon the installation of the package, the Inaugural Addresses 71 | data are still not installed, but the first time they are requested, they will be downloaded from the Internet.
72 | 73 | The addresses are available publicly, and I extracted them from the `nltk 74 | `_ package. 75 | 76 | Enter: 77 | 78 | >>> trainclassdict = shorttext.data.inaugural() 79 | 80 | .. automodule:: shorttext.data.data_retrieval 81 | :members: inaugural 82 | 83 | 84 | User-Provided Training Data 85 | --------------------------- 86 | 87 | Users can provide their own training data. If it is already in JSON format, it can be loaded easily 88 | with standard Python's `json` package, or by calling: 89 | 90 | >>> trainclassdict = shorttext.data.retrieve_jsondata_as_dict('/path/to/file.json') 91 | 92 | However, if it is in CSV format, it has to obey these rules: 93 | 94 | - there is a heading; and 95 | - there are at least two columns: first the labels, and second the short texts under the labels (everything beyond the second column will be neglected). 96 | 97 | An excerpt of this type of data is as follows: 98 | 99 | :: 100 | 101 | subject,content 102 | mathematics,linear algebra 103 | mathematics,topology 104 | mathematics,algebra 105 | ... 106 | physics,spontaneous symmetry breaking 107 | physics,atomic molecular and optical physics 108 | physics,quantum chaos 109 | ... 110 | theology,divine providence 111 | theology,soteriology 112 | theology,anthropology 113 | 114 | To load this data file, just enter: 115 | 116 | >>> trainclassdict = shorttext.data.retrieve_csvdata_as_dict('/path/to/file.csv') 117 | 118 | .. automodule:: shorttext.data.data_retrieval 119 | :members: retrieve_csvdata_as_dict 120 | 121 | 122 | Home: :doc:`index` 123 | -------------------------------------------------------------------------------- /docs/tutorial_dtm.rst: -------------------------------------------------------------------------------- 1 | Document-Term Matrix 2 | ==================== 3 | 4 | Preparing for the Corpus 5 | ------------------------ 6 | 7 | We can create and handle a document-term matrix (DTM) with `shorttext`. Use the dataset of Presidents' 8 | Inaugural Addresses as an example. 9 | 10 | >>> import shorttext 11 | >>> usprez = shorttext.data.inaugural() 12 | 13 | We have to make each president's address one document to achieve our purpose. Enter this: 14 | 15 | >>> docids = sorted(usprez.keys()) 16 | >>> usprez = [' '.join(usprez[docid]) for docid in docids] 17 | 18 | Now the variable `usprez` is a list of 56 Inaugural Addresses from George Washington (1789) to 19 | Barack Obama (2009), with the IDs stored in `docids`. We apply the standard text preprocessor and 20 | produce a list of lists (of tokens) (or a corpus in `gensim`): 21 | 22 | >>> preprocess = shorttext.utils.standard_text_preprocessor_1() 23 | >>> corpus = [preprocess(address).split(' ') for address in usprez] 24 | 25 | Now the variable `corpus` is a list of lists of tokens. For example, 26 | 27 | >>> corpus[0] # shows all the preprocessed tokens of the first Presidential Inaugural Address 28 | 29 | Using Class `DocumentTermMatrix` 30 | -------------------------------- 31 | 32 | With the corpus ready in this form, we can create a `DocumentTermMatrix` instance for the DTM by: 33 | 34 | >>> usprez_dtm = shorttext.utils.DocumentTermMatrix(corpus, docids=docids) 35 | 36 | ..
autoclass:: shorttext.utils.dtm.DocumentTermMatrix 37 | :members: 38 | 39 | One can get the document frequency of any token (the number of documents that the given 40 | token is in) by: 41 | 42 | >>> usprez_dtm.get_doc_frequency('peopl') # gives 54, the document frequency of the token "peopl" 43 | 44 | or the total term frequencies (the total number of occurrences of the given tokens in all documents) by: 45 | 46 | >>> usprez_dtm.get_total_termfreq('justic') # gives 134.0, the total term frequency of the token "justic" 47 | 48 | or the term frequency for a token in a given document by: 49 | 50 | >>> usprez_dtm.get_termfreq('2009-Obama', 'chang') # gives 2.0 51 | 52 | We can also query the number of occurrences of a particular word in all documents, 53 | stored in a dictionary, by: 54 | 55 | >>> usprez_dtm.get_token_occurences('god') 56 | 57 | Of course, we can always reweigh the counts above (except the document frequency) with 58 | tf-idf by setting `tfidf` to `True` when creating the instance of the class: 59 | 60 | >>> usprez_dtm = shorttext.utils.DocumentTermMatrix(corpus, docids=docids, tfidf=True) 61 | 62 | To save the class, enter: 63 | 64 | >>> usprez_dtm.save_compact_model('/path/to/whatever.bin') 65 | 66 | To load this class later, enter: 67 | 68 | >>> usprez_dtm2 = shorttext.utils.load_DocumentTermMatrix('/path/to/whatever.bin') 69 | 70 | .. automodule:: shorttext.utils.dtm 71 | :members: load_DocumentTermMatrix 72 | 73 | Reference 74 | --------- 75 | 76 | Christopher Manning, Hinrich Schuetze, *Foundations of Statistical Natural Language Processing* (Cambridge, MA: MIT Press, 1999). [`MIT Press 77 | `_] 78 | 79 | "Document-Term Matrix: Text Mining in R and Python," *Everything About Data Analytics*, WordPress (2018). [`WordPress 80 | `_] 81 | 82 | Home: :doc:`index` -------------------------------------------------------------------------------- /docs/tutorial_maxent.rst: -------------------------------------------------------------------------------- 1 | Maximum Entropy (MaxEnt) Classifier 2 | =================================== 3 | 4 | Maxent 5 | ------ 6 | 7 | The maximum entropy (maxent) classifier has been a popular text classifier. It parameterizes the model 8 | to achieve maximum categorical entropy, with the constraint that the probabilities the model 9 | assigns to the training data match the empirical distribution. 10 | 11 | The maxent classifier in `shorttext` is implemented with `keras`. The optimization algorithm is 12 | the Adam optimizer by default, although other gradient-based or momentum-based optimizers 13 | can be used. The traditional methods such as generalized iterative scaling (GIS) or 14 | L-BFGS cannot be used here.
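To make the above concrete, here is a minimal sketch of how a maxent classifier can be expressed in `keras` as a single softmax layer, i.e., a multinomial logistic regression trained with the Adam optimizer and the cross-entropy loss. This sketch is for illustration only and is not the actual implementation inside `shorttext`; the sizes `nb_features` and `nb_classes` are placeholders:

::

    import keras
    from keras import layers

    nb_features = 1000   # dimension of the input feature vectors (placeholder)
    nb_classes = 3       # number of class labels (placeholder)

    # a single dense softmax layer is equivalent to a maxent /
    # multinomial logistic regression model
    maxent_model = keras.Sequential([
        keras.Input(shape=(nb_features,)),
        layers.Dense(nb_classes, activation='softmax'),
    ])
    maxent_model.compile(optimizer='adam', loss='categorical_crossentropy')

Minimizing the cross-entropy on the training data maximizes the likelihood of the training labels under this model, which is what the maxent formulation above amounts to.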
15 | 16 | To use the maxent classifier, import the package: 17 | 18 | >>> import shorttext 19 | >>> from shorttext.classifiers import MaxEntClassifier 20 | 21 | Loading NIH reports as an example: 22 | 23 | >>> classdict = shorttext.data.nihreports() 24 | 25 | The classifier can be instantiated by: 26 | 27 | >>> classifier = MaxEntClassifier() 28 | 29 | Train the classifier: 30 | 31 | >>> classifier.train(classdict, nb_epochs=1000) 32 | 33 | After training, it can be used for classification, for example: 34 | 35 | >>> classifier.score('cancer immunology') # NCI tops the score 36 | >>> classifier.score('children health') # NIAID tops the score 37 | >>> classifier.score('Alzheimer disease and aging') # NIAID tops the score 38 | 39 | To save the model, 40 | 41 | >>> classifier.save_compact_model('/path/to/filename.bin') 42 | 43 | To load the model as a classifier, enter: 44 | 45 | >>> classifier2 = shorttext.classifiers.load_maxent_classifier('/path/to/filename.bin') 46 | 47 | 48 | .. automodule:: shorttext.classifiers.bow.maxent.MaxEntClassification 49 | :members: 50 | 51 | 52 | Reference 53 | --------- 54 | 55 | Adam L. Berger, Stephen A. Della Pietra, Vincent J. Della Pietra, "A Maximum Entropy Approach to Natural Language Processing," *Computational Linguistics* 22(1): 39-72 (1996). [`ACM 56 | `_] 57 | 58 | Daniel E. Russ, Kwan-Yuet Ho, Joanne S. Colt, Karla R. Armenti, Dalsu Baris, Wong-Ho Chow, Faith Davis, Alison Johnson, Mark P. Purdue, Margaret R. Karagas, Kendra Schwartz, Molly Schwenn, Debra T. Silverman, Patricia A. Stewart, Calvin A. Johnson, Melissa C. Friesen, “Computer-based coding of free-text job descriptions to efficiently and reliably incorporate occupational risk factors into large-scale epidemiological studies”, *Occup. Environ. Med.* 73, 417-424 (2016). [`BMJ 59 | `_] 60 | 61 | Daniel Russ, Kwan-yuet Ho, Melissa Friesen, "It Takes a Village To Solve A Problem in Data Science," Data Science Maryland, presentation at Applied Physics Laboratory (APL), Johns Hopkins University, on June 19, 2017. (2017) [`Slideshare 62 | `_] 63 | 64 | Home: :doc:`index` -------------------------------------------------------------------------------- /docs/tutorial_metrics.rst: -------------------------------------------------------------------------------- 1 | Metrics 2 | ======= 3 | 4 | The package `shorttext` provides a few metrics that measure distances of some kind. They are all 5 | under :mod:`shorttext.metrics`. The soft Jaccard score is based on spellings, and the Word Mover's 6 | distance (WMD) on embedded word vectors. 7 | 8 | Edit Distance and Soft Jaccard Score 9 | ------------------------------------ 10 | 11 | Edit distance, or Damerau-Levenshtein distance, measures the differences 12 | between two words due to insertion, deletion, transposition, substitution, etc. 13 | Each of these changes causes a distance of 1. The algorithm is implemented in Cython.
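To see how each of these operations contributes a distance of 1, here is a minimal pure-Python sketch of the underlying dynamic-programming recurrence (the optimal-string-alignment variant); it is for illustration only and is not the package's own Cython implementation:

::

    def dl_distance_sketch(word1, word2):
        # d[i][j] holds the distance between word1[:i] and word2[:j]
        d = [[0] * (len(word2) + 1) for _ in range(len(word1) + 1)]
        for i in range(len(word1) + 1):
            d[i][0] = i          # delete all characters of word1[:i]
        for j in range(len(word2) + 1):
            d[0][j] = j          # insert all characters of word2[:j]
        for i in range(1, len(word1) + 1):
            for j in range(1, len(word2) + 1):
                cost = 0 if word1[i-1] == word2[j-1] else 1
                d[i][j] = min(d[i-1][j] + 1,         # deletion
                              d[i][j-1] + 1,         # insertion
                              d[i-1][j-1] + cost)    # substitution
                if i > 1 and j > 1 and word1[i-1] == word2[j-2] and word1[i-2] == word2[j-1]:
                    d[i][j] = min(d[i][j], d[i-2][j-2] + 1)   # transposition
        return d[len(word1)][len(word2)]

For example, `dl_distance_sketch('diver', 'driver')` gives 1, in agreement with the `damerau_levenshtein` function demonstrated below.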
14 | 15 | First import the package: 16 | 17 | >>> from shorttext.metrics.dynprog.dldist import damerau_levenshtein 18 | >>> from shorttext.metrics.dynprog.lcp import longest_common_prefix 19 | >>> from shorttext.metrics.dynprog import similarity, soft_jaccard_score 20 | 21 | The distance can be calculated by: 22 | 23 | >>> damerau_levenshtein('diver', 'driver') # insertion, gives 1 24 | >>> damerau_levenshtein('driver', 'diver') # deletion, gives 1 25 | >>> damerau_levenshtein('topology', 'tooplogy') # transposition, gives 1 26 | >>> damerau_levenshtein('book', 'blok') # substitution, gives 1 27 | 28 | The longest common prefix finds the length of the common prefix of two words: 29 | 30 | >>> longest_common_prefix('topology', 'topological') # gives 7 31 | >>> longest_common_prefix('police', 'policewoman') # gives 6 32 | 33 | The similarity between words is defined as the larger of the following: 34 | 35 | :math:`s = 1 - \frac{\text{DL distance}}{\max( \text{len}(word1), \text{len}(word2) )}` 36 | and 37 | :math:`s = \frac{\text{longest common prefix}}{\max( \text{len}(word1), \text{len}(word2) )}` 38 | 39 | >>> similarity('topology', 'topological') # gives 0.6363636363636364 40 | >>> similarity('book', 'blok') # gives 0.75 41 | 42 | Given the similarity, we say that the intersection, for example, between 'book' and 'blok', has 0.75 elements, or the 43 | union has 1.25 elements. Then the similarity between two sets of tokens can be measured using the Jaccard index, with these 44 | "soft" numbers of intersection. Therefore, 45 | 46 | >>> soft_jaccard_score(['book', 'seller'], ['blok', 'sellers']) # gives 0.6716417910447762 47 | >>> soft_jaccard_score(['police', 'station'], ['policeman']) # gives 0.2857142857142858 48 | 49 | The functions `damerau_levenshtein` and `longest_common_prefix` are implemented using Cython_ . 50 | (Before release 0.7.2, they were interfaced to Python using SWIG_ (Simplified Wrapper and Interface Generator)). 51 | 52 | 53 | .. automodule:: shorttext.metrics.dynprog.jaccard 54 | :members: similarity, soft_jaccard_score 55 | 56 | 57 | Word Mover's Distance 58 | --------------------- 59 | 60 | Unlike the soft Jaccard score, which bases similarity on the words' spellings, Word Mover's distance (WMD) 61 | uses the embedded word vectors. WMD is a special case of Earth Mover's distance (EMD), or Wasserstein 62 | distance. The calculation of WMD in this package is based on linear programming, and the distance between 63 | words is the Euclidean distance by default (not the cosine distance), but the user can set it accordingly. 64 | 65 | Import the modules, and load the word-embedding models: 66 | 67 | >>> from shorttext.metrics.wasserstein import word_mover_distance 68 | >>> from shorttext.utils import load_word2vec_model 69 | >>> wvmodel = load_word2vec_model('/path/to/model_file.bin') 70 | 71 | Examples: 72 | 73 | >>> word_mover_distance(['police', 'station'], ['policeman'], wvmodel) # gives 3.060708999633789 74 | >>> word_mover_distance(['physician', 'assistant'], ['doctor', 'assistants'], wvmodel) # gives 2.276337146759033 75 | 76 | More examples can be found in this `IPython Notebook 77 | `_ . 78 | 79 | In `gensim`, the Word2Vec model allows the calculation of WMD if the user has installed the package PyEMD_. It is based on the 80 | scale invariant feature transform (SIFT), an algorithm for EMD based on L1-distance (Manhattan distance).
81 | For more details, 82 | please refer to their `tutorial 83 | `_ , and cite the two papers by Ofir Pele and Michael Werman 84 | if it is used. 85 | 86 | .. automodule:: shorttext.metrics.wasserstein.wordmoverdist 87 | :members: word_mover_distance 88 | 89 | Jaccard Index Due to Cosine Distances 90 | ------------------------------------- 91 | 92 | In the above section of edit distance, the Jaccard score was calculated by considering soft membership 93 | using spelling. However, we can also compute the soft membership by cosine similarity with 94 | 95 | >>> from shorttext.utils import load_word2vec_model 96 | >>> wvmodel = load_word2vec_model('/path/to/model_file.bin') 97 | >>> from shorttext.metrics.embedfuzzy import jaccardscore_sents 98 | 99 | For example, the number of words between the set containing 'doctor' and that containing 'physician' 100 | is 0.78060223420956831 (according to Google model), and therefore the Jaccard score is 101 | 102 | :math:`0.78060223420956831 / (2-0.78060223420956831) = 0.6401538990056869` 103 | 104 | And it can be seen by running it: 105 | 106 | >>> jaccardscore_sents('doctor', 'physician', wvmodel) # gives 0.6401538990056869 107 | >>> jaccardscore_sents('chief executive', 'computer cluster', wvmodel) # gives 0.0022515450768836143 108 | >>> jaccardscore_sents('topological data', 'data of topology', wvmodel) # gives 0.67588977344632573 109 | 110 | .. automodule:: shorttext.metrics.embedfuzzy.jaccard 111 | :members: 112 | 113 | 114 | BERTScore 115 | --------- 116 | 117 | BERTScore includes a category of metrics that is based on BERT model. 118 | This metrics measures the similarity between sentences. To use it, 119 | 120 | >>> from shorttext.metrics.transformers import BERTScorer 121 | >>> scorer = BERTScorer() # using default BERT model and tokenizer 122 | >>> scorer.recall_bertscore('The weather is cold.', 'It is freezing.') # 0.7223385572433472 123 | >>> scorer.precision_bertscore('The weather is cold.', 'It is freezing.') # 0.7700849175453186 124 | >>> scorer.f1score_bertscore('The weather is cold.', 'It is freezing.') # 0.7454479746418043 125 | 126 | For BERT models, please refer to :doc:`tutorial_wordembed` for more details. 127 | 128 | .. automodule:: shorttext.metrics.transformers.bertscore 129 | :members: 130 | 131 | Reference 132 | --------- 133 | 134 | "Damerau-Levenshtein Distance." [`Wikipedia 135 | `_] 136 | 137 | "Jaccard index." [`Wikipedia 138 | `_] 139 | 140 | Daniel E. Russ, Kwan-Yuet Ho, Calvin A. Johnson, Melissa C. Friesen, "Computer-Based Coding of Occupation Codes for Epidemiological Analyses," *2014 IEEE 27th International Symposium on Computer-Based Medical Systems* (CBMS), pp. 347-350. (2014) [`IEEE 141 | `_] 142 | 143 | Matt J. Kusner, Yu Sun, Nicholas I. Kolkin, Kilian Q. Weinberger, "From Word Embeddings to Document Distances," *ICML* (2015). 144 | 145 | Ofir Pele, Michael Werman, "A linear time histogram metric for improved SIFT matching," *Computer Vision - ECCV 2008*, 495-508 (2008). [`ACM 146 | `_] 147 | 148 | Ofir Pele, Michael Werman, "Fast and robust earth mover's distances," *Proc. 2009 IEEE 12th Int. Conf. on Computer Vision*, 460-467 (2009). [`IEEE 149 | `_] 150 | 151 | Tianyi Zhang, Varsha Kishore, Felix Wu, Kilian Q. Weinberger, Yoav Artzi, 152 | "BERTScore: Evaluating Text Generation with BERT," arXiv:1904.09675 (2019). [`arXiv 153 | `_] 154 | 155 | "Word Mover’s Distance as a Linear Programming Problem," *Everything About Data Analytics*, WordPress (2017). 
[`WordPress 156 | `_] 157 | 158 | 159 | Home: :doc:`index` 160 | 161 | .. _SWIG: http://www.swig.org/ 162 | .. _PyEMD: https://github.com/wmayner/pyemd 163 | .. _Cython: http://cython.org/ -------------------------------------------------------------------------------- /docs/tutorial_nnlib.rst: -------------------------------------------------------------------------------- 1 | Deep Neural Networks with Word-Embedding 2 | ======================================== 3 | 4 | Wrapper for Neural Networks for Word-Embedding Vectors 5 | ------------------------------------------------------ 6 | 7 | In this package, there is a class that serves a wrapper for various neural network algorithms 8 | for supervised short text categorization: 9 | :class:`shorttext.classifiers.VarNNEmbeddedVecClassifier`. 10 | Each class label has a few short sentences, where each token is converted 11 | to an embedded vector, given by a pre-trained word-embedding model (e.g., Google Word2Vec model). 12 | The sentences are represented by a matrix, or rank-2 array. 13 | The type of neural network has to be passed when training, and it has to be of 14 | type :class:`keras.models.Sequential`. The number of outputs of the models has to match 15 | the number of class labels in the training data. 16 | To perform prediction, the input short sentences is converted to a unit vector 17 | in the same way. The score is calculated according to the trained neural network model. 18 | 19 | Some of the neural networks can be found within the module :module:`shorttext.classifiers.embed.nnlib.frameworks` 20 | and they are good for short text or document classification. Of course, users can supply their 21 | own neural networks, written in `keras`. 22 | 23 | A pre-trained Google Word2Vec model can be downloaded `here 24 | `_, 25 | and a pre-trained Facebook FastText model can be downloaded `here 26 | `_. 27 | 28 | 29 | See: :doc:`tutorial_wordembed` . 30 | 31 | Import the package: 32 | 33 | >>> import shorttext 34 | 35 | To load the Word2Vec model, 36 | 37 | >>> wvmodel = shorttext.utils.load_word2vec_model('/path/to/GoogleNews-vectors-negative300.bin.gz') 38 | 39 | Then load the training data 40 | 41 | >>> trainclassdict = shorttext.data.subjectkeywords() 42 | 43 | Then we choose a neural network. We choose ConvNet: 44 | 45 | >>> kmodel = shorttext.classifiers.frameworks.CNNWordEmbed(len(trainclassdict.keys()), vecsize=300) 46 | 47 | Initialize the classifier: 48 | 49 | >>> classifier = shorttext.classifiers.VarNNEmbeddedVecClassifier(wvmodel) 50 | 51 | .. 
autoclass:: shorttext.classifiers.embed.nnlib.VarNNEmbedVecClassification.VarNNEmbeddedVecClassifier 52 | :members: 53 | 54 | 55 | Then train the classifier: 56 | 57 | >>> classifier.train(trainclassdict, kmodel) 58 | Epoch 1/10 59 | 45/45 [==============================] - 0s - loss: 1.0578 60 | Epoch 2/10 61 | 45/45 [==============================] - 0s - loss: 0.5536 62 | Epoch 3/10 63 | 45/45 [==============================] - 0s - loss: 0.3437 64 | Epoch 4/10 65 | 45/45 [==============================] - 0s - loss: 0.2282 66 | Epoch 5/10 67 | 45/45 [==============================] - 0s - loss: 0.1658 68 | Epoch 6/10 69 | 45/45 [==============================] - 0s - loss: 0.1273 70 | Epoch 7/10 71 | 45/45 [==============================] - 0s - loss: 0.1052 72 | Epoch 8/10 73 | 45/45 [==============================] - 0s - loss: 0.0961 74 | Epoch 9/10 75 | 45/45 [==============================] - 0s - loss: 0.0839 76 | Epoch 10/10 77 | 45/45 [==============================] - 0s - loss: 0.0743 78 | 79 | Then the model is ready for classification, like: 80 | 81 | >>> classifier.score('artificial intelligence') 82 | {'mathematics': 0.57749695, 'physics': 0.33749574, 'theology': 0.085007325} 83 | 84 | The trained model can be saved: 85 | 86 | >>> classifier.save_compact_model('/path/to/nnlibvec_convnet_subdata.bin') 87 | 88 | To load it, enter: 89 | 90 | >>> classifier2 = shorttext.classifiers.load_varnnlibvec_classifier(wvmodel, '/path/to/nnlibvec_convnet_subdata.bin') 91 | 92 | .. automodule:: shorttext.classifiers.embed.nnlib.VarNNEmbedVecClassification 93 | :members: load_varnnlibvec_classifier 94 | 95 | 96 | Provided Neural Networks 97 | ------------------------ 98 | 99 | There are three neural networks available in this package for the use in 100 | :class:`shorttext.classifiers.VarNNEmbeddedVecClassifier`, 101 | and they are available in the module `shorttext.classifiers.frameworks`. 102 | 103 | .. automodule:: shorttext.classifiers.embed.nnlib.frameworks 104 | :members: 105 | 106 | 107 | ConvNet (Convolutional Neural Network) 108 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 109 | 110 | This neural network for supervised learning is using convolutional neural network (ConvNet), 111 | as demonstrated in Kim's paper. 112 | 113 | .. image:: images/nnlib_cnn.png 114 | 115 | The function in the frameworks returns a :class:`keras.models.Sequential` or :class:`keras.models.Model`. Its input parameters are: 116 | 117 | The parameter `maxlen` defines the maximum length of the sentences. If the sentence has less than `maxlen` 118 | words, then the empty words will be filled with zero vectors. 119 | 120 | >>> kmodel = fr.CNNWordEmbed(len(trainclassdict.keys()), vecsize=wvmodel.vector_size) 121 | 122 | Double ConvNet 123 | ^^^^^^^^^^^^^^ 124 | 125 | This neural network is nothing more than two ConvNet layers. The function in the frameworks returns a :class:`keras.models.Sequential` or :class:`keras.models.Model`. Its input parameters are: 126 | 127 | The parameter `maxlen` defines the maximum length of the sentences. If the sentence has less than `maxlen` 128 | words, then the empty words will be filled with zero vectors. 129 | 130 | >>> kmodel = fr.DoubleCNNWordEmbed(len(trainclassdict.keys()), vecsize=wvmodel.vector_size) 131 | 132 | C-LSTM (Convolutional Long Short-Term Memory) 133 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 134 | 135 | This neural network for supervised learning is using C-LSTM, according to the paper 136 | written by Zhou *et. 
al.* It is a neural network with ConvNet as the first layer, 137 | and then followed by LSTM (long short-term memory), a type of recurrent neural network (RNN). 138 | 139 | .. image:: images/nnlib_clstm.png 140 | 141 | The function in the frameworks returns a :class:`keras.models.Sequential` or :class:`keras.models.Model`. 142 | 143 | The parameter `maxlen` defines the maximum length of the sentences. If the sentence has less than `maxlen` 144 | words, then the empty words will be filled with zero vectors. 145 | 146 | >>> kmodel = fr.CLSTMWordEmbed(len(trainclassdict.keys()), vecsize=wvmodel.vector_size) 147 | 148 | User-Defined Neural Network 149 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^ 150 | 151 | Users can define their own neural network for use in the classifier wrapped by 152 | :class:`shorttext.classifiers.VarNNEmbeddedVecClassifier` 153 | as long as the following criteria are met: 154 | 155 | - the input matrix is :class:`numpy.ndarray`, and of shape `(maxlen, vecsize)`, where 156 | `maxlen` is the maximum length of the sentence, and `vecsize` is the number of dimensions 157 | of the embedded vectors. The output is a one-dimensional array, of size equal to 158 | the number of classes provided by the training data. The order of the class labels is assumed 159 | to be the same as the order of the given training data (stored as a Python dictionary). 160 | 161 | Putting Word2Vec Model As an Input Keras Layer (Deprecated) 162 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 163 | 164 | This functionality is removed since release 0.5.11, due to the following reasons: 165 | 166 | * `keras` changed its code that produces this bug; 167 | * the layer is consuming memory; 168 | * only Word2Vec is supported; and 169 | * the results are incorrect. 170 | 171 | Reference 172 | --------- 173 | 174 | Chunting Zhou, Chonglin Sun, Zhiyuan Liu, Francis Lau, "A C-LSTM Neural Network for Text Classification," (arXiv:1511.08630). [`arXiv 175 | `_] 176 | 177 | "CS231n Convolutional Neural Networks for Visual Recognition," Stanford Online Course. [`link 178 | `_] 179 | 180 | Nal Kalchbrenner, Edward Grefenstette, Phil Blunsom, "A Convolutional Neural Network for Modelling Sentences," *Proceedings of the 52nd Annual Meeting of the Association for Computational Linguistics*, pp. 655-665 (2014). [`arXiv 181 | `_] 182 | 183 | Tal Perry, "Convolutional Methods for Text," *Medium* (2017). [`Medium 184 | `_] 185 | 186 | Yoon Kim, "Convolutional Neural Networks for Sentence Classification," *EMNLP* 2014, 1746-1751 (arXiv:1408.5882). [`arXiv 187 | `_] 188 | 189 | Zackary C. Lipton, John Berkowitz, "A Critical Review of Recurrent Neural Networks for Sequence Learning," arXiv:1506.00019 (2015). [`arXiv 190 | `_] 191 | 192 | Home: :doc:`index` -------------------------------------------------------------------------------- /docs/tutorial_spell.rst: -------------------------------------------------------------------------------- 1 | Spell Correctors 2 | ================ 3 | 4 | This package supports the use of spell correctors, because typos are very common in relatively short text data. 5 | 6 | There are two types of spell correctors provided: the one described by Peter Norvig (using n-grams Bayesian method), 7 | and another by Keisuke Sakaguchi and his colleagues (using semi-character level recurrent neural network). 8 | 9 | >>> import shorttext 10 | 11 | We use the Norvig's training corpus as an example. 
To load it, 12 | 13 | >>> from urllib.request import urlopen 14 | >>> text = urlopen('https://norvig.com/big.txt').read() 15 | 16 | The developer just has to instantiate the spell corrector, and then train it with a corpus to get a correction model. 17 | Then one can use it for correction. 18 | 19 | Norvig 20 | ------ 21 | 22 | Peter Norvig described a spell corrector based on a Bayesian approach and edit distance. You can refer to his blog for 23 | more information. 24 | 25 | >>> norvig_corrector = shorttext.spell.NorvigSpellCorrector() 26 | >>> norvig_corrector.train(text) 27 | >>> norvig_corrector.correct('oranhe') # gives "orange" 28 | 29 | .. automodule:: shorttext.spell.norvig 30 | :members: 31 | 32 | 33 | 34 | Sakaguchi (SCRNN - semi-character recurrent neural network) 35 | ----------------------------------------------------------- 36 | 37 | Keisuke Sakaguchi and his colleagues developed this spell corrector with the insight that most typos happen 38 | within the interior characters of a word. They developed a recurrent neural network that models the possible changes within the spellings. There are 39 | seven modes: 40 | 41 | - JUMBLE-WHOLE 42 | - JUMBLE-BEG 43 | - JUMBLE-END 44 | - JUMBLE-INT 45 | - NOISE-INSERT 46 | - NOISE-DELETE 47 | - NOISE-REPLACE 48 | 49 | The original intent of their work was not to invent a new spell corrector but to study the "Cmabrigde Uinervtisy" effect, 50 | but it is nice to see how it can be implemented as a spell corrector. 51 | 52 | >>> scrnn_corrector = shorttext.spell.SCRNNSpellCorrector('JUMBLE-WHOLE') 53 | >>> scrnn_corrector.train(text) 54 | >>> scrnn_corrector.correct('oranhe') # gives "orange" 55 | 56 | We can persist the SCRNN corrector for future use: 57 | 58 | >>> scrnn_corrector.save_compact_model('/path/to/spellscrnn.bin') 59 | 60 | To load, 61 | 62 | >>> corrector = shorttext.spell.loadSCRNNSpellCorrector('/path/to/spellscrnn.bin') 63 | 64 | .. automodule:: shorttext.spell.sakaguchi 65 | :members: 66 | 67 | 68 | Reference 69 | --------- 70 | 71 | Keisuke Sakaguchi, Kevin Duh, Matt Post, Benjamin Van Durme, "Robsut Wrod Reocginiton via semi-Character Recurrent Neural Network," arXiv:1608.02214 (2016). [`arXiv 72 | `_] 73 | 74 | Peter Norvig, "How to write a spell corrector." (2016) [`Norvig 75 | `_] 76 | -------------------------------------------------------------------------------- /docs/tutorial_stacking.rst: -------------------------------------------------------------------------------- 1 | Stacked Generalization 2 | ====================== 3 | 4 | "Stacking generates the members of the stacking ensemble using several learning algorithms and subsequently 5 | uses another algorithm to learn how to combine their outputs." In other words, it takes the classification results 6 | of several classifiers and merges them into one final prediction. 7 | 8 | Stacking is most commonly implemented using logistic regression. 9 | Suppose there are *K* classifiers, and *l* output labels. Then the stacked generalization 10 | is this logistic model: 11 | 12 | :math:`P ( y=c | x) = \frac{1}{\exp\left( - \sum_{k=1}^{K} w_{kc} x_{kc} + b_c \right) + 1}` 13 | 14 | Here we demonstrate stacking with two classifiers. 15 | 16 | Import the package, and employ the subject dataset as the training dataset. 17 | 18 | >>> import shorttext 19 | >>> subdict = shorttext.data.subjectkeywords() 20 | 21 | Train a C-LSTM model.
22 | 23 | >>> wvmodel = shorttext.utils.load_word2vec_model('/path/to/GoogleNews-vectors-negative300.bin.gz') 24 | >>> clstm_nnet = shorttext.classifiers.frameworks.CLSTMWordEmbed(len(subdict)) 25 | >>> clstm_classifier = shorttext.classifiers.VarNNEmbeddedVecClassifier(wvmodel) 26 | >>> clstm_classifier.train(subdict, clstm_nnet) 27 | 28 | A test of its classification: 29 | 30 | >>> clstm_classifier.score('linear algebra') 31 | {'mathematics': 1.0, 'physics': 3.3643366e-10, 'theology': 1.0713742e-13} 32 | >>> clstm_classifier.score('topological soliton') 33 | {'mathematics': 2.0036438e-11, 'physics': 1.0, 'theology': 4.4903334e-14} 34 | 35 | And we train an SVM, with topic vectors as the input vectors. The topic model is LDA with 128 topics. 36 | 37 | >>> # train the LDA topic model 38 | >>> lda128 = shorttext.classifiers.LDAModeler() 39 | >>> lda128.train(subdict, 128) 40 | >>> # train the SVM classifier 41 | >>> from sklearn.svm import SVC 42 | >>> lda128_svm_classifier = shorttext.classifiers.TopicVectorSkLearnClassifier(lda128, SVC()) 43 | >>> lda128_svm_classifier.train(subdict) 44 | 45 | A test of its classification: 46 | 47 | >>> lda128_svm_classifier.score('linear algebra') 48 | {'mathematics': 1.0, 'physics': 0.0, 'theology': 0.0} 49 | >>> lda128_svm_classifier.score('topological soliton') 50 | {'mathematics': 0.0, 'physics': 1.0, 'theology': 0.0} 51 | 52 | Then we can implement the stacked generalization using logistic regression by calling: 53 | 54 | >>> stacker = shorttext.stack.LogisticStackedGeneralization(intermediate_classifiers={'clstm': clstm_classifier, 'lda128': lda128_svm_classifier}) 55 | >>> stacker.train(subdict) 56 | 57 | Now the model is ready. As a result, we can do the stacked classification: 58 | 59 | >>> stacker.score('linear algebra') 60 | {'mathematics': 0.55439126, 'physics': 0.036988281, 'theology': 0.039665185} 61 | >>> stacker.score('quantum mechanics') 62 | {'mathematics': 0.059210967, 'physics': 0.55031472, 'theology': 0.04532773} 63 | >>> stacker.score('topological dynamics') 64 | {'mathematics': 0.17244603, 'physics': 0.19720334, 'theology': 0.035309207} 65 | >>> stacker.score('christology') 66 | {'mathematics': 0.094574735, 'physics': 0.053406414, 'theology': 0.3797417} 67 | 68 | The stacked generalization can be saved by calling: 69 | 70 | >>> stacker.save_compact_model('/path/to/logitmodel.bin') 71 | 72 | This only saves the stacked generalization model, but not the intermediate classifiers. 73 | The reason for this is for allowing flexibility for users to supply their own algorithms, 74 | as long as they have the `score` functions which output the same way as the classifiers 75 | offered in this package. To load them, initialize it in the same way: 76 | 77 | >>> stacker2 = shorttext.stack.LogisticStackedGeneralization(intermediate_classifiers={'clstm': clstm_classifier, 'lda128': lda128_svm_classifier}) 78 | >>> stacker2.load_compact_model('/path/to/logitmodel.bin') 79 | 80 | 81 | .. automodule:: shorttext.stack.stacking 82 | :members: 83 | 84 | 85 | Reference 86 | --------- 87 | 88 | "Combining the Best of All Worlds," *Everything About Data Analytics*, WordPress (2016). [`WordPress 89 | `_] 90 | 91 | David H. Wolpert, "Stacked Generalization," *Neural Netw* 5: 241-259 (1992). 92 | 93 | M. Paz Sesmero, Agapito I. Ledezma, Araceli Sanchis, "Generating ensembles of heterogeneous classifiers using Stacked Generalization," 94 | *WIREs Data Mining and Knowledge Discovery* 5: 21-34 (2015). 
95 | 96 | Home: :doc:`index` -------------------------------------------------------------------------------- /docs/tutorial_sumvec.rst: -------------------------------------------------------------------------------- 1 | Word-Embedding Cosine Similarity Classifier 2 | =========================================== 3 | 4 | Sum of Embedded Vectors 5 | ----------------------- 6 | 7 | Given a pre-trained word-embedding models like Word2Vec, a classifier 8 | based on cosine similarities can be built, which is 9 | :class:`shorttext.classifiers.SumEmbeddedVecClassifier`. 10 | In training the data, 11 | the embedded vectors in every word in that class are averaged. The 12 | score for a given text to each class is the cosine similarity between the averaged 13 | vector of the given text and the precalculated vector of that class. 14 | 15 | A pre-trained Google Word2Vec model can be downloaded `here 16 | `_. 17 | 18 | See: :doc:`tutorial_wordembed` . 19 | 20 | Import the package: 21 | 22 | >>> import shorttext 23 | 24 | To load the Word2Vec model, 25 | 26 | >>> from shorttext.utils import load_word2vec_model 27 | >>> wvmodel = load_word2vec_model('/path/to/GoogleNews-vectors-negative300.bin.gz') 28 | 29 | Then we load a set of data: 30 | 31 | >>> nihtraindata = shorttext.data.nihreports(sample_size=None) 32 | 33 | Then initialize the classifier: 34 | 35 | >>> classifier = shorttext.classifiers.SumEmbeddedVecClassifier(wvmodel) # for Google model, the vector size is 300 (default: 100) 36 | >>> classifier.train(nihtraindata) 37 | 38 | This classifier takes relatively little time to train compared with others 39 | in this package. Then we can perform classification: 40 | 41 | >>> classifier.score('bioinformatics') 42 | 43 | Or the result can be sorted and only the five top-scored results are displayed: 44 | 45 | >>> sorted(classifier.score('stem cell research').items(), key=lambda item: item[1], reverse=True)[:5] 46 | [('NIGMS', 0.44962596182682935), 47 | ('NIAID', 0.4494126990050461), 48 | ('NINDS', 0.43435236806719524), 49 | ('NIDCR', 0.43042338197002483), 50 | ('NHGRI', 0.42878346869968731)] 51 | >>> sorted(classifier.score('bioinformatics').items(), key=lambda item: item[1], reverse=True)[:5] 52 | [('NHGRI', 0.54200061864847038), 53 | ('NCATS', 0.49097267547279988), 54 | ('NIGMS', 0.47818129591411118), 55 | ('CIT', 0.46874987052158501), 56 | ('NLM', 0.46869259072562974)] 57 | >>> sorted(classifier.score('cancer immunotherapy').items(), key=lambda item: item[1], reverse=True)[:5] 58 | [('NCI', 0.53734097785976076), 59 | ('NIAID', 0.50616582142027433), 60 | ('NIDCR', 0.48596330887674788), 61 | ('NIDDK', 0.46875755765903215), 62 | ('NCCAM', 0.4642233792198418)] 63 | 64 | The trained model can be saved: 65 | 66 | >>> classifier.save_compact_model('/path/to/sumvec_nihdata_model.bin') 67 | 68 | And with the same pre-trained Word2Vec model, this classifier can be loaded: 69 | 70 | >>> classifier2 = shorttext.classifiers.load_sumword2vec_classifier(wvmodel, '/path/to/sumvec_nihdata_model.bin') 71 | 72 | .. 
autoclass:: shorttext.classifiers.embed.sumvec.SumEmbedVecClassification.SumEmbeddedVecClassifier 73 | :members: 74 | 75 | 76 | Appendix: Model I/O in Previous Versions 77 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 78 | 79 | In previous versions of `shorttext`, :class:`shorttext.classifiers.SumEmbeddedVecClassifier` has a `savemodel` method, 80 | which runs as follow: 81 | 82 | >>> classifier.savemodel('/path/to/nihdata') 83 | 84 | This produces the following file for this model: 85 | 86 | :: 87 | 88 | /path/to/nihdata_embedvecdict.pkl 89 | 90 | It can be loaded by: 91 | 92 | >>> classifier2 = shorttext.classifiers.load_sumword2vec_classifier(wvmodel, '/path/to/nihdata', compact=False) 93 | 94 | Reference 95 | --------- 96 | 97 | Michael Czerny, "Modern Methods for Sentiment Analysis," *District Data Labs (2015). [`DistrictDataLabs 98 | `_] 99 | 100 | Home: :doc:`index` -------------------------------------------------------------------------------- /docs/tutorial_textpreprocessing.rst: -------------------------------------------------------------------------------- 1 | Text Preprocessing 2 | ================== 3 | 4 | Standard Preprocessor 5 | --------------------- 6 | 7 | When the bag-of-words (BOW) model is used to represent the content, it is essential to 8 | specify how the text is preprocessed before it is passed to the trainers or the 9 | classifiers. 10 | 11 | This package provides a standard way of text preprocessing, which goes through the 12 | following steps: 13 | 14 | - removing special characters, 15 | - removing numerals, 16 | - converting all alphabets to lower cases, 17 | - removing stop words, and 18 | - stemming the words (using Snowball Porter stemmer). 19 | 20 | To do this, load the preprocesser generator: 21 | 22 | >>> from shorttext.utils import standard_text_preprocessor_1 23 | 24 | Then define the preprocessor, a function, by just calling: 25 | 26 | >>> preprocessor1 = standard_text_preprocessor_1() 27 | 28 | .. automodule:: shorttext.utils.textpreprocessing 29 | :members: standard_text_preprocessor_1 30 | 31 | It is a function that perform the preprocessing in the steps above: 32 | 33 | >>> preprocessor1('Maryland Blue Crab') # output: 'maryland blue crab' 34 | >>> preprocessor1('filing electronic documents and goes home. eat!!!') # output: 'file electron document goe home eat' 35 | 36 | Customized Text Preprocessor 37 | ---------------------------- 38 | 39 | The standard preprocessor is good for many general natural language processing tasks, 40 | but some users may want to define their own preprocessors for their own purposes. 41 | This preprocessor is used in topic modeling, and is desired to be *a function that takes 42 | a string, and returns a string*. 43 | 44 | If the user wants to develop a preprocessor that contains a few steps, he can make it by providing 45 | the pipeline, which is a list of functions that input a string and return a string. For example, 46 | let's develop a preprocessor that 1) convert it to base form if it is a verb, or keep it original; 47 | 2) convert it to upper case; and 3) tag the number of characters after each token. 48 | 49 | Load the function that generates the preprocessor function: 50 | 51 | >>> from shorttext.utils import text_preprocessor 52 | 53 | Initialize a WordNet lemmatizer using `nltk`: 54 | 55 | >>> from nltk.stem import WordNetLemmatizer 56 | >>> lemmatizer = WordNetLemmatizer() 57 | 58 | Define the pipeline. 
Functions for each of the steps are: 59 | 60 | >>> step1fcn = lambda s: ' '.join([lemmatizer.lemmatize(s1) for s1 in s.split(' ')]) 61 | >>> step2fcn = lambda s: s.upper() 62 | >>> step3fcn = lambda s: ' '.join([s1+'-'+str(len(s1)) for s1 in s.split(' ')]) 63 | 64 | Then the pipeline is: 65 | 66 | >>> pipeline = [step1fcn, step2fcn, step3fcn] 67 | 68 | The preprocessor function can be generated with the defined pipeline: 69 | 70 | >>> preprocessor2 = text_preprocessor(pipeline) 71 | 72 | The function `preprocessor2` is a function that input a string and returns a string. 73 | Some examples are: 74 | 75 | >>> preprocessor2('Maryland blue crab in Annapolis') # output: 'MARYLAND-8 BLUE-4 CRAB-4 IN-2 ANNAPOLIS-9' 76 | >>> preprocessor2('generative adversarial networks') # output: 'GENERATIVE-10 ADVERSARIAL-11 NETWORK-7' 77 | 78 | .. automodule:: shorttext.utils.textpreprocessing 79 | :members: text_preprocessor 80 | 81 | Tokenization 82 | ------------ 83 | 84 | Users are free to choose any tokenizer they wish. In `shorttext`, the tokenizer is 85 | simply the space delimiter, and can be called: 86 | 87 | >>> shorttext.utils.tokenize('Maryland blue crab') # output: ['Maryland', 'blue', 'crab'] 88 | 89 | Reference 90 | --------- 91 | 92 | Christopher Manning, Hinrich Schuetze, *Foundations of Statistical Natural Language Processing* (Cambridge, MA: MIT Press, 1999). [`MIT Press 93 | `_] 94 | 95 | "R or Python on Text Mining," *Everything About Data Analytics*, WordPress (2015). [`WordPress 96 | `_] 97 | 98 | Home: :doc:`index` -------------------------------------------------------------------------------- /docs/tutorial_wordembed.rst: -------------------------------------------------------------------------------- 1 | Word Embedding Models 2 | ===================== 3 | 4 | Word2Vec 5 | -------- 6 | 7 | The most commonly used word-embedding model is Word2Vec. Its model can be downloaded from 8 | their page. To load the model, call: 9 | 10 | >>> import shorttext 11 | >>> wvmodel = shorttext.utils.load_word2vec_model('/path/to/GoogleNews-vectors-negative300.bin.gz') 12 | 13 | It is a binary file, and the default is set to be `binary=True`. 14 | 15 | .. automodule:: shorttext.utils.wordembed 16 | :members: load_word2vec_model 17 | 18 | It is equivalent to calling, 19 | 20 | >>> import gensim 21 | >>> wvmodel = gensim.models.KeyedVectors.load_word2vec_format('/path/to/GoogleNews-vectors-negative300.bin.gz', binary=True) 22 | 23 | 24 | Word2Vec is a neural network model that embeds words into semantic vectors that carry semantic meaning. 25 | It is easy to extract the vector of a word, like for the word 'coffee': 26 | 27 | >>> wvmodel['coffee'] # an ndarray for the word will be output 28 | 29 | One can find the most similar words to 'coffee' according to this model: 30 | 31 | >>> wvmodel.most_similar('coffee') 32 | 33 | which outputs: 34 | 35 | :: 36 | 37 | [(u'coffees', 0.721267819404602), 38 | (u'gourmet_coffee', 0.7057087421417236), 39 | (u'Coffee', 0.6900454759597778), 40 | (u'o_joe', 0.6891065835952759), 41 | (u'Starbucks_coffee', 0.6874972581863403), 42 | (u'coffee_beans', 0.6749703884124756), 43 | (u'latt\xe9', 0.664122462272644), 44 | (u'cappuccino', 0.662549614906311), 45 | (u'brewed_coffee', 0.6621608138084412), 46 | (u'espresso', 0.6616827249526978)] 47 | 48 | Or if you want to find the cosine similarity between 'coffee' and 'tea', enter: 49 | 50 | >>> wvmodel.similarity('coffee', 'tea') # outputs: 0.56352921707810621 51 | 52 | Semantic meaning can be reflected by their differences. 
For example, we can vaguely 53 | say `France` - `Paris` = `Taiwan` - `Taipei`, or `man` - `actor` = `woman` - `actress`. 54 | First define the cosine similarity for readability: 55 | 56 | >>> from scipy.spatial.distance import cosine 57 | >>> similarity = lambda u, v: 1-cosine(u, v) 58 | 59 | Then 60 | 61 | >>> similarity(wvmodel['France'] + wvmodel['Taipei'] - wvmodel['Taiwan'], wvmodel['Paris']) # outputs: 0.70574580801216202 62 | >>> similarity(wvmodel['woman'] + wvmodel['actor'] - wvmodel['man'], wvmodel['actress']) # outputs: 0.876354245612604 63 | 64 | GloVe 65 | ----- 66 | 67 | The Stanford NLP Group developed a similar word-embedding algorithm, with a good theory explaining how 68 | it works. Its usage is very similar to that of Word2Vec. 69 | 70 | One can convert a text-format GloVe model into a text-format Word2Vec model. More information can be found 71 | in the documentation of `gensim`: `Converting GloVe to Word2Vec 72 | `_ 73 | 74 | FastText 75 | -------- 76 | 77 | FastText is a similar word-embedding model from Facebook. You can download pre-trained models here: 78 | 79 | `Pre-trained word vectors 80 | `_ 81 | 82 | To load a pre-trained FastText model, run: 83 | 84 | >>> import shorttext 85 | >>> ftmodel = shorttext.utils.load_fasttext_model('/path/to/model.bin') 86 | 87 | And it is used exactly the same way as Word2Vec. 88 | 89 | .. automodule:: shorttext.utils.wordembed 90 | :members: load_fasttext_model 91 | 92 | Poincaré Embeddings 93 | ------------------- 94 | 95 | The Poincaré embedding is an embedding that learns both semantic similarity and hierarchical structures. To load a 96 | pre-trained model, run: 97 | 98 | >>> import shorttext 99 | >>> pemodel = shorttext.utils.load_poincare_model('/path/to/model.txt') 100 | 101 | For preloaded word-embedding models, please refer to :doc:`tutorial_wordembed`. 102 | 103 | .. automodule:: shorttext.utils.wordembed 104 | :members: load_poincare_model 105 | 106 | BERT 107 | ---- 108 | 109 | BERT_ (Bidirectional Encoder Representations from Transformers) 110 | is a transformer-based language model. This package supports token 111 | and sentence embeddings using pre-trained language models, provided 112 | by the package written by HuggingFace_. In `shorttext`, to run: 113 | 114 | >>> from shorttext.utils import WrappedBERTEncoder 115 | >>> encoder = WrappedBERTEncoder() # the default model and tokenizer are loaded 116 | >>> sentences_embedding, tokens_embedding, tokens = encoder.encode_sentences(['The car should turn right.', 'The answer is right.']) 117 | 118 | The third line returns the embeddings of all sentences, the embeddings of all tokens in each sentence, 119 | and the tokens themselves (with `CLS` and `SEP` included). Unlike previous embeddings, 120 | token embeddings depend on the context; in the above example, the embeddings of the 121 | two "right"'s are different as they have different meanings. 122 | 123 | The default BERT model and tokenizer are `bert-base-uncased`. 124 | If you want to use others, refer to `HuggingFace's model list 125 | `_ . 126 | 127 | .. autoclass:: shorttext.utils.transformers.BERTObject 128 | :members: 129 | 130 | .. autoclass:: shorttext.utils.transformers.WrappedBERTEncoder 131 | :members: 132 | 133 | 134 | Other Functions 135 | --------------- 136 | 137 | .. 
automodule:: shorttext.utils.wordembed 138 | :members: shorttext_to_avgvec 139 | 140 | 141 | Links 142 | ----- 143 | 144 | - Word2Vec_ 145 | - GloVe_ 146 | - FastText_ 147 | - BERT_ 148 | - HuggingFace_ 149 | 150 | Reference 151 | --------- 152 | 153 | Jacob Devlin, Ming-Wei Chang, Kenton Lee, Kristina Toutanova, "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding," arXiv:1810.04805 (2018). [`arXiv 154 | `_] 155 | 156 | Jayant Jain, "Implementing Poincaré Embeddings," RaRe Technologies (2017). [`RaRe 157 | `_] 158 | 159 | Jeffrey Pennington, Richard Socher, Christopher D. Manning, “GloVe: Global Vectors for Word Representation,” *Empirical Methods in Natural Language Processing (EMNLP)*, pp. 1532-1543 (2014). [`PDF 160 | `_] 161 | 162 | Maximilian Nickel, Douwe Kiela, "Poincaré Embeddings for Learning Hierarchical Representations," arXiv:1705.08039 (2017). [`arXiv 163 | `_] 164 | 165 | Piotr Bojanowski, Edouard Grave, Armand Joulin, Tomas Mikolov, "Enriching Word Vectors with Subword Information," arXiv:1607.04606 (2016). [`arXiv 166 | `_] 167 | 168 | Tomas Mikolov, Kai Chen, Greg Corrado, Jeffrey Dean, “Efficient Estimation of Word Representations in Vector Space,” *ICLR* 2013 (2013). [`arXiv 169 | `_] 170 | 171 | Radim Řehůřek, "Making sense of word2vec," RaRe Technologies (2014). [`RaRe 172 | `_] 173 | 174 | "Probabilistic Theory of Word Embeddings: GloVe," *Everything About Data Analytics*, WordPress (2016). [`WordPress 175 | `_] 176 | 177 | "Toying with Word2Vec," *Everything About Data Analytics*, WordPress (2015). [`WordPress 178 | `_] 179 | 180 | "Word-Embedding Algorithms," *Everything About Data Analytics*, WordPress (2016). [`WordPress 181 | `_] 182 | 183 | Home: :doc:`index` 184 | 185 | .. _Word2Vec: https://code.google.com/archive/p/word2vec/ 186 | .. _GloVe: http://nlp.stanford.edu/projects/glove/ 187 | .. _FastText: https://github.com/facebookresearch/fastText 188 | .. _BERT: https://arxiv.org/abs/1810.04805 189 | .. 
_HuggingFace: https://huggingface.co/ -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "shorttext" 7 | version = "2.2.1" 8 | authors = [ 9 | {name = "Kwan Yuet Stephen Ho", email = "stephenhky@yahoo.com.hk"} 10 | ] 11 | description = "Short Text Mining" 12 | readme = {file = "README.md", content-type = "text/markdown"} 13 | license = {text = "MIT"} 14 | keywords = ["shorttext", "natural language processing", "text mining"] 15 | requires-python = ">=3.9" 16 | classifiers = [ 17 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 18 | "Topic :: Scientific/Engineering :: Mathematics", 19 | "Topic :: Text Processing :: Linguistic", 20 | "Topic :: Software Development :: Libraries :: Python Modules", 21 | "Programming Language :: Python :: 3.9", 22 | "Programming Language :: Python :: 3.10", 23 | "Programming Language :: Python :: 3.11", 24 | "Programming Language :: Python :: 3.12", 25 | "Natural Language :: English", 26 | "License :: OSI Approved :: MIT License", 27 | "Intended Audience :: Developers", 28 | "Intended Audience :: Education", 29 | "Intended Audience :: Information Technology", 30 | "Intended Audience :: Science/Research" 31 | ] 32 | dependencies = [ 33 | "numpy>=1.23.3", 34 | "scipy>=1.12.0", 35 | "joblib>=1.3.0", 36 | "scikit-learn>=1.2.0", 37 | "tensorflow>=2.13.0", 38 | "keras>=2.13.0", 39 | "gensim>=4.0.0", 40 | "pandas>=1.2.0", 41 | "snowballstemmer>=3.0.0", 42 | "transformers>=4.39.0", 43 | "torch>=2.0.0", 44 | "numba>=0.57.0", 45 | "deprecation>=2.0.0" 46 | ] 47 | 48 | [project.urls] 49 | Repository = "https://github.com/stephenhky/PyShortTextCategorization" 50 | Issues = "https://github.com/stephenhky/PyShortTextCategorization/issues" 51 | Documentation = "https://shorttext.readthedocs.io" 52 | 53 | [tool.setuptools] 54 | packages = [ 55 | "shorttext", 56 | "shorttext.cli", 57 | "shorttext.utils", 58 | "shorttext.classifiers", 59 | "shorttext.classifiers.embed", 60 | "shorttext.classifiers.embed.nnlib", 61 | "shorttext.classifiers.embed.sumvec", 62 | "shorttext.classifiers.bow", 63 | "shorttext.classifiers.bow.topic", 64 | "shorttext.classifiers.bow.maxent", 65 | "shorttext.data", 66 | "shorttext.stack", 67 | "shorttext.generators", 68 | "shorttext.generators.bow", 69 | "shorttext.generators.charbase", 70 | "shorttext.generators.seq2seq", 71 | "shorttext.metrics", 72 | "shorttext.metrics.dynprog", 73 | "shorttext.metrics.wasserstein", 74 | "shorttext.metrics.transformers", 75 | "shorttext.metrics.embedfuzzy", 76 | "shorttext.spell" 77 | ] 78 | zip-safe = false 79 | 80 | [project.scripts] 81 | ShortTextCategorizerConsole = "shorttext.cli.categorization:main" 82 | ShortTextWordEmbedSimilarity = "shorttext.cli.wordembedsim:main" 83 | 84 | [project.optional-dependencies] 85 | test = ["unittest2", "pytest", "simplerepresentations>=0.0.4"] 86 | -------------------------------------------------------------------------------- /shorttext/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from . import metrics 3 | from . import classifiers 4 | from . import data 5 | from . import generators 6 | from . import spell 7 | from . import stack 8 | from . 
import utils 9 | -------------------------------------------------------------------------------- /shorttext/classifiers/__init__.py: -------------------------------------------------------------------------------- 1 | from .embed import * 2 | from .embed import SumEmbeddedVecClassifier, load_sumword2vec_classifier 3 | from .embed import VarNNEmbeddedVecClassifier, load_varnnlibvec_classifier 4 | from .embed import frameworks 5 | from .embed.sumvec import frameworks as sumvecframeworks 6 | 7 | from .bow.topic.TopicVectorDistanceClassification import TopicVecCosineDistanceClassifier as TopicVectorCosineDistanceClassifier 8 | from .bow.topic.TopicVectorDistanceClassification import train_autoencoder_cosineClassifier, train_gensimtopicvec_cosineClassifier 9 | from .bow.topic.TopicVectorDistanceClassification import load_autoencoder_cosineClassifier, load_gensimtopicvec_cosineClassifier 10 | 11 | from .bow.topic.SkLearnClassification import TopicVectorSkLearnClassifier 12 | from .bow.topic.SkLearnClassification import train_gensim_topicvec_sklearnclassifier, train_autoencoder_topic_sklearnclassifier 13 | from .bow.topic.SkLearnClassification import load_gensim_topicvec_sklearnclassifier, load_autoencoder_topic_sklearnclassifier 14 | 15 | from .bow.maxent.MaxEntClassification import MaxEntClassifier, load_maxent_classifier -------------------------------------------------------------------------------- /shorttext/classifiers/bow/__init__.py: -------------------------------------------------------------------------------- 1 | from . import topic 2 | from . import maxent -------------------------------------------------------------------------------- /shorttext/classifiers/bow/maxent/__init__.py: -------------------------------------------------------------------------------- 1 | from . import MaxEntClassification -------------------------------------------------------------------------------- /shorttext/classifiers/bow/topic/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from . import TopicVectorDistanceClassification 3 | from . import SkLearnClassification -------------------------------------------------------------------------------- /shorttext/classifiers/embed/__init__.py: -------------------------------------------------------------------------------- 1 | from . import nnlib 2 | from . import sumvec 3 | 4 | from .nnlib import frameworks 5 | from .nnlib.VarNNEmbedVecClassification import VarNNEmbeddedVecClassifier 6 | from .nnlib.VarNNEmbedVecClassification import load_varnnlibvec_classifier 7 | from .nnlib.frameworks import CNNWordEmbed, DoubleCNNWordEmbed, CLSTMWordEmbed 8 | from .sumvec.frameworks import DenseWordEmbed 9 | from .sumvec.SumEmbedVecClassification import SumEmbeddedVecClassifier 10 | from .sumvec.SumEmbedVecClassification import load_sumword2vec_classifier 11 | from .sumvec.VarNNSumEmbedVecClassification import VarNNSumEmbeddedVecClassifier 12 | -------------------------------------------------------------------------------- /shorttext/classifiers/embed/nnlib/__init__.py: -------------------------------------------------------------------------------- 1 | from . import VarNNEmbedVecClassification 2 | from . 
import frameworks -------------------------------------------------------------------------------- /shorttext/classifiers/embed/sumvec/SumEmbedVecClassification.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | from collections import defaultdict 3 | 4 | import numpy as np 5 | from scipy.spatial.distance import cosine 6 | 7 | from ....utils.classification_exceptions import ModelNotTrainedException 8 | from ....utils import shorttext_to_avgvec 9 | from ....utils.compactmodel_io import CompactIOMachine 10 | 11 | 12 | class SumEmbeddedVecClassifier(CompactIOMachine): 13 | """ 14 | This is a supervised classification algorithm for short text categorization. 15 | Each class label has a few short sentences, where each token is converted 16 | to an embedded vector, given by a pre-trained word-embedding model (e.g., Google Word2Vec model). 17 | They are then summed up and normalized to a unit vector for that particular class labels. 18 | To perform prediction, the input short sentences is converted to a unit vector 19 | in the same way. The similarity score is calculated by the cosine similarity. 20 | 21 | A pre-trained Google Word2Vec model can be downloaded `here 22 | `_. 23 | """ 24 | 25 | def __init__(self, wvmodel, vecsize=None, simfcn=lambda u, v: 1-cosine(u, v)): 26 | """ Initialize the classifier. 27 | 28 | :param wvmodel: Word2Vec model 29 | :param vecsize: length of the embedded vectors in the model (Default: None, directly extracted from word-embedding model) 30 | :param simfcn: similarity function (Default: cosine similarity) 31 | :type wvmodel: gensim.models.keyedvectors.KeyedVectors 32 | :type vecsize: int 33 | :type simfcn: function 34 | """ 35 | CompactIOMachine.__init__(self, {'classifier': 'sumvec'}, 'sumvec', ['_embedvecdict.pkl']) 36 | self.wvmodel = wvmodel 37 | self.vecsize = self.wvmodel.vector_size if vecsize == None else vecsize 38 | self.simfcn = simfcn 39 | self.trained = False 40 | 41 | def train(self, classdict): 42 | """ Train the classifier. 43 | 44 | If this has not been run, or a model was not loaded by :func:`~loadmodel`, 45 | a `ModelNotTrainedException` will be raised while performing prediction or saving 46 | the model. 47 | 48 | :param classdict: training data 49 | :return: None 50 | :type classdict: dict 51 | """ 52 | self.addvec = defaultdict(lambda : np.zeros(self.vecsize)) 53 | for classtype in classdict: 54 | self.addvec[classtype] = np.sum([self.shorttext_to_embedvec(shorttext) 55 | for shorttext in classdict[classtype]], 56 | axis=0) 57 | self.addvec[classtype] /= np.linalg.norm(self.addvec[classtype]) 58 | self.addvec = dict(self.addvec) 59 | self.trained = True 60 | 61 | def savemodel(self, nameprefix): 62 | """ Save the trained model into files. 63 | 64 | Given the prefix of the file paths, save the model into files, with name given by the prefix, 65 | and add "_embedvecdict.pickle" at the end. If there is no trained model, a `ModelNotTrainedException` 66 | will be thrown. 67 | 68 | :param nameprefix: prefix of the file path 69 | :return: None 70 | :type nameprefix: str 71 | :raise: ModelNotTrainedException 72 | """ 73 | if not self.trained: 74 | raise ModelNotTrainedException() 75 | pickle.dump(self.addvec, open(nameprefix+'_embedvecdict.pkl', 'wb')) 76 | 77 | def loadmodel(self, nameprefix): 78 | """ Load a trained model from files. 79 | 80 | Given the prefix of the file paths, load the model from files with name given by the prefix 81 | followed by "_embedvecdict.pickle". 
82 | 83 | If this has not been run, or a model was not trained by :func:`~train`, 84 | a `ModelNotTrainedException` will be raised while performing prediction and saving the model. 85 | 86 | :param nameprefix: prefix of the file path 87 | :return: None 88 | :type nameprefix: str 89 | """ 90 | self.addvec = pickle.load(open(nameprefix+'_embedvecdict.pkl', 'rb')) 91 | self.trained = True 92 | 93 | def shorttext_to_embedvec(self, shorttext): 94 | """ Convert the short text into an averaged embedded vector representation. 95 | 96 | Given a short sentence, it converts all the tokens into embedded vectors according to 97 | the given word-embedding model, sums 98 | them up, and normalize the resulting vector. It returns the resulting vector 99 | that represents this short sentence. 100 | 101 | :param shorttext: a short sentence 102 | :return: an embedded vector that represents the short sentence 103 | :type shorttext: str 104 | :rtype: numpy.ndarray 105 | """ 106 | return shorttext_to_avgvec(shorttext, self.wvmodel) 107 | 108 | def score(self, shorttext): 109 | """ Calculate the scores for all the class labels for the given short sentence. 110 | 111 | Given a short sentence, calculate the classification scores for all class labels, 112 | returned as a dictionary with key being the class labels, and values being the scores. 113 | If the short sentence is empty, or if other numerical errors occur, the score will be `numpy.nan`. 114 | 115 | If neither :func:`~train` nor :func:`~loadmodel` was run, it will raise `ModelNotTrainedException`. 116 | 117 | :param shorttext: a short sentence 118 | :return: a dictionary with keys being the class labels, and values being the corresponding classification scores 119 | :type shorttext: str 120 | :rtype: dict 121 | :raise: ModelNotTrainedException 122 | """ 123 | if not self.trained: 124 | raise ModelNotTrainedException() 125 | vec = self.shorttext_to_embedvec(shorttext) 126 | scoredict = {} 127 | for classtype in self.addvec: 128 | try: 129 | scoredict[classtype] = self.simfcn(vec, self.addvec[classtype]) 130 | except ValueError: 131 | scoredict[classtype] = np.nan 132 | return scoredict 133 | 134 | 135 | def load_sumword2vec_classifier(wvmodel, name, compact=True, vecsize=None): 136 | """ Load a :class:`shorttext.classifiers.SumEmbeddedVecClassifier` instance from file, given the pre-trained Word2Vec model. 137 | 138 | :param wvmodel: Word2Vec model 139 | :param name: name (if compact=True) or prefix (if compact=False) of the file path 140 | :param compact whether model file is compact (Default: True) 141 | :param vecsize: length of embedded vectors in the model (Default: None, directly extracted from word-embedding model) 142 | :return: the classifier 143 | :type wvmodel: gensim.models.keyedvectors.KeyedVectors 144 | :type name: str 145 | :type compact: bool 146 | :type vecsize: int 147 | :rtype: SumEmbeddedVecClassifier 148 | """ 149 | classifier = SumEmbeddedVecClassifier(wvmodel, vecsize=vecsize) 150 | if compact: 151 | classifier.load_compact_model(name) 152 | else: 153 | classifier.loadmodel(name) 154 | return classifier -------------------------------------------------------------------------------- /shorttext/classifiers/embed/sumvec/__init__.py: -------------------------------------------------------------------------------- 1 | from . import SumEmbedVecClassification 2 | from . import VarNNSumEmbedVecClassification 3 | from . 
import frameworks -------------------------------------------------------------------------------- /shorttext/classifiers/embed/sumvec/frameworks.py: -------------------------------------------------------------------------------- 1 | 2 | from tensorflow.keras.layers import Dense, Activation 3 | from tensorflow.keras.models import Sequential 4 | from tensorflow.keras.regularizers import l2 5 | 6 | from ....utils.classification_exceptions import UnequalArrayLengthsException 7 | 8 | 9 | def DenseWordEmbed(nb_labels, 10 | dense_nb_nodes=[], 11 | dense_actfcn=[], 12 | vecsize=300, 13 | reg_coef=0.1, 14 | final_activiation='softmax', 15 | optimizer='adam'): 16 | """ Return layers of dense neural network. 17 | 18 | Return layers of dense neural network. This assumes the input to be a rank-1 vector. 19 | 20 | :param nb_labels: number of class labels 21 | :param dense_nb_nodes: number of nodes in each later (Default: []) 22 | :param dense_actfcn: activation functions for each layer (Default: []) 23 | :param vecsize: length of the embedded vectors in the model (Default: 300) 24 | :param reg_coef: regularization coefficient (Default: 0.1) 25 | :param final_activiation: activation function of the final layer (Default: softmax) 26 | :param optimizer: optimizer for gradient descent. Options: sgd, rmsprop, adagrad, adadelta, adam, adamax, nadam. (Default: adam) 27 | :return: keras sequential model for dense neural network 28 | :type nb_labels: int 29 | :type dense_nb_nodes: list 30 | :type dense_actfcn: list 31 | :type vecsize: int 32 | :type reg_coef: float 33 | :type final_activiation: str 34 | :type optimizer: str 35 | :rtype: keras.models.Model 36 | """ 37 | if len(dense_nb_nodes)!=len(dense_actfcn): 38 | raise UnequalArrayLengthsException(dense_nb_nodes, dense_actfcn) 39 | nb_layers = len(dense_nb_nodes) 40 | 41 | model = Sequential() 42 | if nb_layers==0: 43 | model.add(Dense(nb_labels, input_shape=(vecsize,), kernel_regularizer=l2(reg_coef))) 44 | else: 45 | model.add(Dense(dense_nb_nodes[0], 46 | input_shape=(vecsize,), 47 | activation=dense_actfcn[0], 48 | kernel_regularizer=l2(reg_coef)) 49 | ) 50 | for nb_nodes, activation in zip(dense_nb_nodes[1:], dense_actfcn[1:]): 51 | model.add(Dense(nb_nodes, activation=activation, kernel_regularizer=l2(reg_coef))) 52 | model.add(Dense(nb_labels, kernel_regularizer=l2(reg_coef))) 53 | 54 | # final activation layer 55 | model.add(Activation(final_activiation)) 56 | model.compile(loss='categorical_crossentropy', optimizer=optimizer) 57 | 58 | return model -------------------------------------------------------------------------------- /shorttext/cli/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenhky/PyShortTextCategorization/a7caf4edeb86b3b69a56632d24fa7ee56d12621d/shorttext/cli/__init__.py -------------------------------------------------------------------------------- /shorttext/cli/categorization.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | from functools import partial 4 | import argparse 5 | import logging 6 | 7 | from ..utils.compactmodel_io import get_model_classifier_name 8 | from ..utils.classification_exceptions import AlgorithmNotExistException, WordEmbeddingModelNotExistException 9 | from ..utils import load_word2vec_model, load_fasttext_model, load_poincare_model 10 | from ..smartload import smartload_compact_model 11 | from ..classifiers import TopicVectorCosineDistanceClassifier 12 | 13 | 
logging.basicConfig(level=logging.INFO) 14 | logger = logging.getLogger(__name__) 15 | 16 | allowed_classifiers = [ 17 | 'ldatopic', 'lsitopic', 'rptopic', 'kerasautoencoder', 18 | 'topic_sklearn', 'nnlibvec', 'sumvec', 'maxent' 19 | ] 20 | needembedded_classifiers = ['nnlibvec', 'sumvec'] 21 | topicmodels = ['ldatopic', 'lsitopic', 'rptopic', 'kerasautoencoder'] 22 | 23 | load_word2vec_nonbinary_model = partial(load_word2vec_model, binary=False) 24 | load_poincare_binary_model = partial(load_poincare_model, binary=True) 25 | 26 | typedict = { 27 | 'word2vec': load_word2vec_model, 28 | 'word2vec_nonbinary': load_word2vec_nonbinary_model, 29 | 'fasttext': load_fasttext_model, 30 | 'poincare': load_poincare_model, 31 | 'poincare_binary': load_poincare_binary_model 32 | } 33 | 34 | 35 | def get_argparser(): 36 | parser = argparse.ArgumentParser( 37 | description='Perform prediction on short text with a given trained model.' 38 | ) 39 | parser.add_argument('model_filepath', help='Path of the trained (compact) model.') 40 | parser.add_argument('--wv', default='', help='Path of the pre-trained Word2Vec model.') 41 | parser.add_argument('--vecsize', default=300, type=int, help='Vector dimensions. (Default: 300)') 42 | parser.add_argument('--topn', type=int, default=10, help='Number of top results to show.') 43 | parser.add_argument('--inputtext', default=None, help='Single input text for classification. If omitted, will enter console mode.') 44 | parser.add_argument('--type', default='word2vec', choices=typedict.keys(), 45 | help='Type of word-embedding model (default: word2vec)') 46 | return parser 47 | 48 | # main block 49 | def main(): 50 | # argument parsing 51 | args = get_argparser().parse_args() 52 | 53 | # check if the model file is given 54 | if not os.path.exists(args.model_filepath): 55 | raise IOError(f'Model file "{args.model_filepath}" not found!') 56 | 57 | # get the name of the classifier 58 | logger.info('Retrieving classifier name...') 59 | classifier_name = get_model_classifier_name(args.model_filepath) 60 | 61 | if classifier_name not in allowed_classifiers: 62 | raise AlgorithmNotExistException(classifier_name) 63 | 64 | # load the Word2Vec model if necessary 65 | wvmodel = None 66 | if classifier_name in needembedded_classifiers: 67 | # check if the word embedding model is available 68 | if not os.path.exists(args.wv): 69 | raise WordEmbeddingModelNotExistException(args.wv) 70 | # if there, load it 71 | logger.info(f'Loading word-embedding model from {args.wv}...') 72 | wvmodel = typedict[args.type](args.wv) 73 | 74 | # load the classifier 75 | logger.info('Initializing the classifier...') 76 | if classifier_name in topicmodels: 77 | topicmodel = smartload_compact_model(args.model_filepath, wvmodel, vecsize=args.vecsize) 78 | classifier = TopicVectorCosineDistanceClassifier(topicmodel) 79 | else: 80 | classifier = smartload_compact_model(args.model_filepath, wvmodel, vecsize=args.vecsize) 81 | 82 | # predict single input or run in console mode 83 | if args.inputtext is not None: 84 | if len(args.inputtext.strip()) == 0: 85 | print('No input text provided.') 86 | return 87 | scoredict = classifier.score(args.inputtext) 88 | for label, score in sorted(scoredict.items(), key=lambda x: x[1], reverse=True)[:args.topn]: 89 | print(f'{label} : {score:.4f}') 90 | else: 91 | # Console 92 | print('Enter text to classify (empty input to quit):') 93 | while True: 94 | shorttext = input('text> ').strip() 95 | if not shorttext: 96 | break 97 | scoredict = classifier.score(shorttext) 98 | 
for label, score in sorted(scoredict.items(), key=lambda x: x[1], reverse=True)[:args.topn]: 99 | print(f'{label} : {score:.4f}') 100 | print('Done.') 101 | 102 | if __name__ == "__main__": 103 | main() 104 | -------------------------------------------------------------------------------- /shorttext/cli/wordembedsim.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse 3 | import time 4 | 5 | from scipy.spatial.distance import cosine 6 | 7 | from ..metrics.embedfuzzy import jaccardscore_sents 8 | from ..utils import tokenize, load_word2vec_model, load_fasttext_model, load_poincare_model 9 | from ..utils import shorttext_to_avgvec 10 | from ..metrics.wasserstein import word_mover_distance 11 | from ..metrics.dynprog.jaccard import soft_jaccard_score 12 | 13 | 14 | typedict = { 15 | 'word2vec': load_word2vec_model, 16 | 'fasttext': load_fasttext_model, 17 | 'poincare': load_poincare_model 18 | } 19 | 20 | 21 | def getargparser(): 22 | parser = argparse.ArgumentParser(description='Find the similarities between two short sentences using Word2Vec.') 23 | parser.add_argument('modelpath', help='Path of the Word2Vec model') 24 | parser.add_argument('--type', default='word2vec', 25 | help='Type of word-embedding model (default: "word2vec"; other options: "fasttext", "poincare")') 26 | return parser 27 | 28 | 29 | def main(): 30 | # argument parsing 31 | args = getargparser().parse_args() 32 | 33 | # preload tokenizer 34 | tokenize('Mogu is cute.') 35 | 36 | time0 = time.time() 37 | print("Loading "+args.type+" model: "+args.modelpath) 38 | wvmodel = typedict[args.type](args.modelpath) 39 | time1 = time.time() 40 | end = False 41 | print("... loading time: "+str(time1 - time0)+" seconds") 42 | 43 | while not end: 44 | sent1 = input('sent1> ') 45 | if len(sent1)==0: 46 | end = True 47 | else: 48 | sent2 = input('sent2> ') 49 | 50 | # output results 51 | print("Cosine Similarity = %.4f" % (1 - cosine(shorttext_to_avgvec(sent1, wvmodel), shorttext_to_avgvec(sent2, wvmodel)))) 52 | print("Word-embedding Jaccard Score Similarity = %.4f" % jaccardscore_sents(sent1, sent2, wvmodel)) 53 | print("Word Mover's Distance = %.4f" % word_mover_distance(tokenize(sent1), tokenize(sent2), wvmodel)) 54 | print("Soft Jaccard Score (edit distance) = %.4f" % soft_jaccard_score(tokenize(sent1), tokenize(sent2))) 55 | 56 | -------------------------------------------------------------------------------- /shorttext/data/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .data_retrieval import subjectkeywords, nihreports, inaugural, retrieve_jsondata_as_dict, retrieve_csvdata_as_dict, yield_crossvalidation_classdicts 3 | -------------------------------------------------------------------------------- /shorttext/data/data_retrieval.py: -------------------------------------------------------------------------------- 1 | 2 | import random 3 | from collections import defaultdict 4 | import json 5 | import os 6 | import zipfile 7 | import sys 8 | import csv 9 | from urllib.request import urlretrieve 10 | 11 | import pandas as pd 12 | import numpy as np 13 | 14 | 15 | def retrieve_csvdata_as_dict(filepath): 16 | """ Retrieve the training data in a CSV file. 17 | 18 | Retrieve the training data in a CSV file, with the first column being the 19 | class labels, and second column the text data. It returns a dictionary with 20 | the class labels as keys, and a list of short texts as the value for each key. 
21 | 22 | :param filepath: path of the training data (CSV) 23 | :return: a dictionary with class labels as keys, and lists of short texts 24 | :type filepath: str 25 | :rtype: dict 26 | """ 27 | datafile = open(filepath, 'r') 28 | reader = csv.reader(datafile) 29 | headerread = False 30 | shorttextdict = defaultdict(lambda: []) 31 | for label, content in reader: 32 | if headerread: 33 | if isinstance(content, str): 34 | shorttextdict[label] += [content] 35 | else: 36 | headerread = True 37 | return dict(shorttextdict) 38 | 39 | 40 | def retrieve_jsondata_as_dict(filepath): 41 | """ Retrieve the training data in a JSON file. 42 | 43 | Retrieve the training data in a JSON file, with 44 | the class labels as keys, and a list of short texts as the value for each key. 45 | It returns the corresponding dictionary. 46 | 47 | :param filepath: path of the training data (JSON) 48 | :return: a dictionary with class labels as keys, and lists of short texts 49 | :type filepath: str 50 | :rtype: dict 51 | """ 52 | return json.load(open(filepath, 'r')) 53 | 54 | 55 | def subjectkeywords(): 56 | """ Return an example data set of subjects. 57 | 58 | Return an example data set, with three subjects and corresponding keywords. 59 | This is in the format of the training input. 60 | 61 | :return: example data set 62 | :rtype: dict 63 | """ 64 | this_dir, _ = os.path.split(__file__) 65 | return retrieve_csvdata_as_dict(os.path.join(this_dir, 'shorttext_exampledata.csv')) 66 | 67 | 68 | def inaugural(): 69 | """ Return an example dataset, which is the Inaugural Addresses of all Presidents of 70 | the United States from George Washington to Barack Obama. 71 | 72 | Each key is the year, a dash, and the last name of the president. The content is 73 | the list of all the sentences 74 | 75 | :return: example data set 76 | :rtype: dict 77 | """ 78 | zfile = zipfile.ZipFile(get_or_download_data("USInaugural.zip", 79 | "https://shorttext-data-northernvirginia.s3.amazonaws.com/trainingdata/USInaugural.zip", 80 | asbytes=True), 81 | ) 82 | address_jsonstr = zfile.open("addresses.json").read() 83 | zfile.close() 84 | return json.loads(address_jsonstr.decode('utf-8')) 85 | 86 | 87 | def nihreports(txt_col='PROJECT_TITLE', label_col='FUNDING_ICs', sample_size=512): 88 | """ Return an example data set, sampled from NIH RePORT (Research Portfolio 89 | Online Reporting Tools). 90 | 91 | Return an example data set from NIH (National Institutes of Health), 92 | data publicly available from their RePORT 93 | website. (`link 94 | `_). 95 | The data is with `txt_col` being either project titles ('PROJECT_TITLE') 96 | or proposal abstracts ('ABSTRACT_TEXT'), and label_col being the names of the ICs (Institutes or Centers), 97 | with 'IC_NAME' the whole form, and 'FUNDING_ICs' the abbreviated form). 98 | 99 | Dataset directly adapted from the NIH data from `R` package `textmineR 100 | `_. 101 | 102 | :param txt_col: column for the text (Default: 'PROJECT_TITLE') 103 | :param label_col: column for the labels (Default: 'FUNDING_ICs') 104 | :param sample_size: size of the sample. Set to None if all rows. (Default: 512) 105 | :return: example data set 106 | :type txt_col: str 107 | :type label_col: str 108 | :type sample_size: int 109 | :rtype: dict 110 | """ 111 | # validation 112 | # txt_col = 'PROJECT_TITLE' or 'ABSTRACT_TEXT' 113 | # label_col = 'FUNDING_ICs' or 'IC_NAME' 114 | if not (txt_col in ['PROJECT_TITLE', 'ABSTRACT_TEXT']): 115 | raise KeyError('Undefined text column: '+txt_col+'. 
Must be PROJECT_TITLE or ABSTRACT_TEXT.') 116 | if not (label_col in ['FUNDING_ICs', 'IC_NAME']): 117 | raise KeyError('Undefined label column: '+label_col+'. Must be FUNDING_ICs or IC_NAME.') 118 | 119 | zfile = zipfile.ZipFile(get_or_download_data('nih_full.csv.zip', 120 | 'https://shorttext-data-northernvirginia.s3.amazonaws.com/trainingdata/nih_full.csv.zip', 121 | asbytes=True), 122 | 'r', 123 | zipfile.ZIP_DEFLATED) 124 | nih = pd.read_csv(zfile.open('nih_full.csv'), na_filter=False, usecols=[label_col, txt_col], encoding='cp437') 125 | zfile.close() 126 | nb_data = len(nih) 127 | sample_size = nb_data if sample_size==None else min(nb_data, sample_size) 128 | 129 | classdict = defaultdict(lambda : []) 130 | 131 | for rowidx in np.random.randint(nb_data, size=min(nb_data, sample_size)): 132 | label = nih.iloc[rowidx, nih.columns.get_loc(label_col)] 133 | if label_col=='FUNDING_ICs': 134 | if label=='': 135 | label = 'OTHER' 136 | else: 137 | endpos = label.index(':') 138 | label = label[:endpos] 139 | classdict[label] += [nih.iloc[rowidx, nih.columns.get_loc(txt_col)]] 140 | 141 | return dict(classdict) 142 | 143 | 144 | def mergedict(dicts): 145 | """ Merge data dictionary. 146 | 147 | Merge dictionaries of the data in the training data format. 148 | 149 | :param dicts: dicts to merge 150 | :return: merged dict 151 | :type dicts: list 152 | :rtype: dict 153 | """ 154 | mdict = defaultdict(lambda : []) 155 | for thisdict in dicts: 156 | for label in thisdict: 157 | mdict[label] += thisdict[label] 158 | return dict(mdict) 159 | 160 | 161 | def yield_crossvalidation_classdicts(classdict, nb_partitions, shuffle=False): 162 | """ Yielding test data and training data for cross validation by partitioning it. 163 | 164 | Given a training data, partition the data into portions, each will be used as test 165 | data set, while the other training data set. It returns a generator. 
166 | 167 | :param classdict: training data 168 | :param nb_partitions: number of partitions 169 | :param shuffle: whether to shuffle the data before partitioning 170 | :return: generator, producing a test data set and a training data set each time 171 | :type classdict: dict 172 | :type nb_partitions: int 173 | :type shuffle: bool 174 | :rtype: generator 175 | """ 176 | crossvaldicts = [] 177 | for _ in range(nb_partitions): 178 | crossvaldicts.append(defaultdict(lambda: [])) 179 | 180 | for label in classdict: 181 | nb_data = len(classdict[label]) 182 | partsize = nb_data // nb_partitions 183 | sentences = classdict[label] if not shuffle else random.sample(classdict[label], nb_data) 184 | for i in range(nb_partitions): 185 | crossvaldicts[i][label] += sentences[i * partsize:min(nb_data, (i + 1) * partsize)] 186 | crossvaldicts = [dict(crossvaldict) for crossvaldict in crossvaldicts] 187 | 188 | for i in range(nb_partitions): 189 | testdict = crossvaldicts[i] 190 | traindict = mergedict([crossvaldicts[j] for j in range(nb_partitions) if j != i]) 191 | yield testdict, traindict 192 | 193 | 194 | def get_or_download_data(filename, origin, asbytes=False): 195 | # determine path 196 | homedir = os.path.expanduser('~') 197 | datadir = os.path.join(homedir, '.shorttext') 198 | if not os.path.exists(datadir): 199 | os.makedirs(datadir) 200 | 201 | targetfilepath = os.path.join(datadir, filename) 202 | # download if not exist 203 | if not os.path.exists(os.path.join(datadir, filename)): 204 | print('Downloading...') 205 | print('Source: ', origin) 206 | print('Target: ', targetfilepath) 207 | try: 208 | urlretrieve(origin, targetfilepath) 209 | except Exception: 210 | print('Failure to download file!') 211 | print(sys.exc_info()) 212 | os.remove(targetfilepath) 213 | 214 | # return 215 | return open(targetfilepath, 'rb' if asbytes else 'r') 216 | -------------------------------------------------------------------------------- /shorttext/data/shorttext_exampledata.csv: -------------------------------------------------------------------------------- 1 | subject,content 2 | mathematics,linear algebra 3 | mathematics,topology 4 | mathematics,algebra 5 | mathematics,calculus 6 | mathematics,variational calculus 7 | mathematics,functional field 8 | mathematics,real analysis 9 | mathematics,complex analysis 10 | mathematics,differential equation 11 | mathematics,statistics 12 | mathematics,statistical optimization 13 | mathematics,probability 14 | mathematics,stochastic calculus 15 | mathematics,numerical analysis 16 | mathematics,differential geometry 17 | physics,renormalization 18 | physics,classical mechanics 19 | physics,quantum mechanics 20 | physics,statistical mechanics 21 | physics,functional field 22 | physics,path integral 23 | physics,quantum field theory 24 | physics,electrodynamics 25 | physics,condensed matter 26 | physics,particle physics 27 | physics,topological solitons 28 | physics,astrophysics 29 | physics,spontaneous symmetry breaking 30 | physics,atomic molecular and optical physics 31 | physics,quantum chaos 32 | theology,divine providence 33 | theology,soteriology 34 | theology,anthropology 35 | theology,pneumatology 36 | theology,Christology 37 | theology,Holy Trinity 38 | theology,eschatology 39 | theology,scripture 40 | theology,ecclesiology 41 | theology,predestination 42 | theology,divine degree 43 | theology,creedal confessionalism 44 | theology,scholasticism 45 | theology,prayer 46 | theology,eucharist --------------------------------------------------------------------------------
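The CSV above is the example training data that `subjectkeywords()` loads. A minimal usage sketch of the retrieval and cross-validation helpers defined in `data_retrieval.py` (illustrative only, not part of the package source; it assumes the package is installed as `shorttext`, and the counts in the comments follow from the 45-row example file above):

from shorttext.data import subjectkeywords, yield_crossvalidation_classdicts

# class label -> list of short texts, read from shorttext_exampledata.csv
classdict = subjectkeywords()
print(sorted(classdict.keys()))      # ['mathematics', 'physics', 'theology']
print(len(classdict['physics']))     # 15 short texts per class in the example file

# 3-fold cross validation: each iteration yields a held-out test dict and a
# training dict merged from the remaining partitions.
for testdict, traindict in yield_crossvalidation_classdicts(classdict, 3, shuffle=True):
    print({label: len(texts) for label, texts in testdict.items()})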
/shorttext/generators/__init__.py: -------------------------------------------------------------------------------- 1 | from .bow.GensimTopicModeling import load_gensimtopicmodel 2 | from .bow.AutoEncodingTopicModeling import load_autoencoder_topicmodel 3 | 4 | from .bow.GensimTopicModeling import LatentTopicModeler, GensimTopicModeler, LDAModeler, LSIModeler, RPModeler 5 | from .bow.AutoEncodingTopicModeling import AutoencodingTopicModeler 6 | 7 | from .charbase.char2vec import SentenceToCharVecEncoder, initSentenceToCharVecEncoder 8 | from .seq2seq.s2skeras import Seq2SeqWithKeras, loadSeq2SeqWithKeras 9 | from .seq2seq.charbaseS2S import CharBasedSeq2SeqGenerator, loadCharBasedSeq2SeqGenerator 10 | -------------------------------------------------------------------------------- /shorttext/generators/bow/LatentTopicModeling.py: -------------------------------------------------------------------------------- 1 | 2 | from abc import ABC, abstractmethod 3 | 4 | import numpy as np 5 | 6 | from ...utils import textpreprocessing as textpreprocess, gensim_corpora as gc, classification_exceptions as e 7 | from ...utils.textpreprocessing import tokenize 8 | 9 | # abstract class 10 | class LatentTopicModeler(ABC): 11 | """ 12 | Abstract class for various topic modeler. 13 | """ 14 | def __init__(self, 15 | preprocessor=textpreprocess.standard_text_preprocessor_1(), 16 | normalize=True): 17 | """ Initialize the modeler. 18 | 19 | :param preprocessor: function that preprocesses the text. (Default: `shorttext.utils.textpreprocess.standard_text_preprocessor_1`) 20 | :param normalize: whether the retrieved topic vectors are normalized. (Default: True) 21 | :type preprocessor: function 22 | :type normalize: bool 23 | """ 24 | self.preprocessor = preprocessor 25 | self.normalize = normalize 26 | self.trained = False 27 | 28 | def generate_corpus(self, classdict): 29 | """ Calculate the gensim dictionary and corpus, and extract the class labels 30 | from the training data. Called by :func:`~train`. 31 | 32 | :param classdict: training data 33 | :return: None 34 | :type classdict: dict 35 | """ 36 | self.dictionary, self.corpus, self.classlabels = gc.generate_gensim_corpora(classdict, 37 | preprocess_and_tokenize=lambda sent: tokenize(self.preprocessor(sent))) 38 | @abstractmethod 39 | def train(self, classdict, nb_topics, *args, **kwargs): 40 | """ Train the modeler. 41 | 42 | This is an abstract method of this abstract class, which raise the `NotImplementedException`. 43 | 44 | :param classdict: training data 45 | :param nb_topics: number of latent topics 46 | :param args: arguments to be passed into the wrapped training functions 47 | :param kwargs: arguments to be passed into the wrapped training functions 48 | :return: None 49 | :raise: NotImplementedException 50 | :type classdict: dict 51 | :type nb_topics: int 52 | """ 53 | self.nb_topics = nb_topics 54 | raise e.NotImplementedException() 55 | 56 | def retrieve_bow(self, shorttext): 57 | """ Calculate the gensim bag-of-words representation of the given short text. 58 | 59 | :param shorttext: text to be represented 60 | :return: corpus representation of the text 61 | :type shorttext: str 62 | :rtype: list 63 | """ 64 | return self.dictionary.doc2bow(tokenize(self.preprocessor(shorttext))) 65 | 66 | def retrieve_bow_vector(self, shorttext, normalize=True): 67 | """ Calculate the vector representation of the bag-of-words in terms of numpy.ndarray. 
68 | 69 | :param shorttext: short text 70 | :param normalize: whether the retrieved topic vectors are normalized. (Default: True) 71 | :return: vector represtation of the text 72 | :type shorttext: str 73 | :type normalize: bool 74 | :rtype: numpy.ndarray 75 | """ 76 | bow = self.retrieve_bow(shorttext) 77 | vec = np.zeros(len(self.dictionary)) 78 | for id, val in bow: 79 | vec[id] = val 80 | if normalize: 81 | vec /= np.linalg.norm(vec) 82 | return vec 83 | 84 | @abstractmethod 85 | def retrieve_topicvec(self, shorttext): 86 | """ Calculate the topic vector representation of the short text. 87 | 88 | This is an abstract method of this abstract class, which raise the `NotImplementedException`. 89 | 90 | :param shorttext: short text 91 | :return: topic vector 92 | :raise: NotImplementedException 93 | :type shorttext: str 94 | :rtype: numpy.ndarray 95 | """ 96 | raise e.NotImplementedException() 97 | 98 | @abstractmethod 99 | def get_batch_cos_similarities(self, shorttext): 100 | """ Calculate the cosine similarities of the given short text and all the class labels. 101 | 102 | This is an abstract method of this abstract class, which raise the `NotImplementedException`. 103 | 104 | :param shorttext: short text 105 | :return: topic vector 106 | :raise: NotImplementedException 107 | :type shorttext: str 108 | :rtype: numpy.ndarray 109 | """ 110 | raise e.NotImplementedException() 111 | 112 | def __getitem__(self, shorttext): 113 | return self.retrieve_topicvec(shorttext) 114 | 115 | def __contains__(self, shorttext): 116 | if not self.trained: 117 | raise e.ModelNotTrainedException() 118 | return True 119 | 120 | @abstractmethod 121 | def loadmodel(self, nameprefix): 122 | """ Load the model from files. 123 | 124 | This is an abstract method of this abstract class, which raise the `NotImplementedException`. 125 | 126 | :param nameprefix: prefix of the paths of the model files 127 | :return: None 128 | :raise: NotImplementedException 129 | :type nameprefix: str 130 | """ 131 | raise e.NotImplementedException() 132 | 133 | @abstractmethod 134 | def savemodel(self, nameprefix): 135 | """ Save the model to files. 136 | 137 | This is an abstract method of this abstract class, which raise the `NotImplementedException`. 138 | 139 | :param nameprefix: prefix of the paths of the model files 140 | :return: None 141 | :raise: NotImplementedException 142 | :type nameprefix: str 143 | """ 144 | raise e.NotImplementedException() -------------------------------------------------------------------------------- /shorttext/generators/bow/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from . import AutoEncodingTopicModeling 3 | from . import GensimTopicModeling 4 | from . import LatentTopicModeling 5 | -------------------------------------------------------------------------------- /shorttext/generators/charbase/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from . 
import char2vec 3 | 4 | -------------------------------------------------------------------------------- /shorttext/generators/charbase/char2vec.py: -------------------------------------------------------------------------------- 1 | 2 | from functools import partial 3 | 4 | import numpy as np 5 | from scipy.sparse import csc_matrix 6 | from gensim.corpora import Dictionary 7 | from sklearn.preprocessing import OneHotEncoder 8 | 9 | from ...utils.misc import textfile_generator 10 | 11 | 12 | class SentenceToCharVecEncoder: 13 | """ A class that facilitates one-hot encoding from characters to vectors. 14 | 15 | """ 16 | def __init__(self, dictionary, signalchar='\n'): 17 | """ Initialize the one-hot encoding class. 18 | 19 | :param dictionary: a gensim dictionary 20 | :param signalchar: signal character, useful for seq2seq models (Default: '\n') 21 | :type dictionary: gensim.corpora.Dictionary 22 | :type signalchar: str 23 | """ 24 | self.dictionary = dictionary 25 | self.signalchar = signalchar 26 | numchars = len(self.dictionary) 27 | self.onehot_encoder = OneHotEncoder() 28 | self.onehot_encoder.fit(np.arange(numchars).reshape((numchars, 1))) 29 | 30 | def calculate_prelim_vec(self, sent): 31 | """ Convert the sentence to a one-hot vector. 32 | 33 | :param sent: sentence 34 | :return: a one-hot vector, with each element the code of that character 35 | :type sent: str 36 | :rtype: numpy.array 37 | """ 38 | return self.onehot_encoder.transform( 39 | np.array([self.dictionary.token2id[c] for c in sent]).reshape((len(sent), 1)) 40 | ) 41 | 42 | def encode_sentence(self, sent, maxlen, startsig=False, endsig=False): 43 | """ Encode one sentence to a sparse matrix, with each row the expanded vector of each character. 44 | 45 | :param sent: sentence 46 | :param maxlen: maximum length of the sentence 47 | :param startsig: signal character at the beginning of the sentence (Default: False) 48 | :param endsig: signal character at the end of the sentence (Default: False) 49 | :return: matrix representing the sentence 50 | :type sent: str 51 | :type maxlen: int 52 | :type startsig: bool 53 | :type endsig: bool 54 | :rtype: scipy.sparse.csc_matrix 55 | """ 56 | cor_sent = (self.signalchar if startsig else '') + sent[:min(maxlen, len(sent))] + (self.signalchar if endsig else '') 57 | sent_vec = self.calculate_prelim_vec(cor_sent).tocsc() 58 | if sent_vec.shape[0] == maxlen + startsig + endsig: 59 | return sent_vec 60 | else: 61 | return csc_matrix((sent_vec.data, sent_vec.indices, sent_vec.indptr), 62 | shape=(maxlen + startsig + endsig, sent_vec.shape[1]), 63 | dtype=np.float64) 64 | 65 | def encode_sentences(self, sentences, maxlen, sparse=True, startsig=False, endsig=False): 66 | """ Encode many sentences into a rank-3 tensor. 
67 | 68 | :param sentences: sentences 69 | :param maxlen: maximum length of one sentence 70 | :param sparse: whether to return a sparse matrix (Default: True) 71 | :param startsig: signal character at the beginning of the sentence (Default: False) 72 | :param endsig: signal character at the end of the sentence (Default: False) 73 | :return: rank-3 tensor of the sentences 74 | :type sentences: list 75 | :type maxlen: int 76 | :type sparse: bool 77 | :type startsig: bool 78 | :type endsig: bool 79 | :rtype: scipy.sparse.csc_matrix or numpy.array 80 | """ 81 | encode_sent_func = partial(self.encode_sentence, startsig=startsig, endsig=endsig, maxlen=maxlen) 82 | list_encoded_sentences_map = map(encode_sent_func, sentences) 83 | if sparse: 84 | return list(list_encoded_sentences_map) 85 | else: 86 | return np.array([sparsevec.toarray() for sparsevec in list_encoded_sentences_map]) 87 | 88 | def __len__(self): 89 | return len(self.dictionary) 90 | 91 | 92 | def initSentenceToCharVecEncoder(textfile, encoding=None): 93 | """ Instantiate a class of SentenceToCharVecEncoder from a text file. 94 | 95 | :param textfile: text file 96 | :param encoding: encoding of the text file (Default: None) 97 | :return: an instance of SentenceToCharVecEncoder 98 | :type textfile: file 99 | :type encoding: str 100 | :rtype: SentenceToCharVecEncoder 101 | """ 102 | dictionary = Dictionary(map(lambda line: [c for c in line], textfile_generator(textfile, encoding=encoding))) 103 | return SentenceToCharVecEncoder(dictionary) 104 | -------------------------------------------------------------------------------- /shorttext/generators/seq2seq/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from . import s2skeras 3 | from . import charbaseS2S 4 | -------------------------------------------------------------------------------- /shorttext/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from . import dynprog 3 | from . import embedfuzzy 4 | from . import transformers 5 | from . import wasserstein 6 | -------------------------------------------------------------------------------- /shorttext/metrics/dynprog/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from . import dldist 3 | from . import jaccard 4 | from . import lcp 5 | -------------------------------------------------------------------------------- /shorttext/metrics/dynprog/dldist.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import numba as nb 4 | 5 | 6 | @nb.njit 7 | def damerau_levenshtein(word1: str, word2: str) -> int: 8 | """ Calculate the Demarau-Levenshtein (DL) distance between two words. 
9 | 10 | :param word1: first word 11 | :param word2: second word 12 | :return: Damerau-Levenshtein (DL) distance 13 | :type word1: str 14 | :type word2: str 15 | :rtype: int 16 | """ 17 | len1 = len(word1) 18 | len2 = len(word2) 19 | matrix = np.zeros((len1+1, len2+1), dtype=np.int64) 20 | 21 | for i in range(len1+1): 22 | matrix[i, 0] = i 23 | for j in range(len2+1): 24 | matrix[0, j] = j 25 | 26 | for i in range(1, len1+1): 27 | for j in range(1, len2+1): 28 | cost = 0 29 | if word1[i-1] != word2[j-1]: 30 | cost = 1 31 | delcost = matrix[i-1, j] + 1 32 | inscost = matrix[i, j-1] + 1 33 | subcost = matrix[i-1, j-1] + cost 34 | score = min(min(delcost, inscost), subcost) 35 | if ((i > 1) & (j > 1) & (word1[i - 1] == word2[j - 2]) & (word1[i - 2] == word2[j - 1])): 36 | score = min(score, matrix[i-2, j-2] + cost) 37 | matrix[i, j] = score 38 | 39 | 40 | 41 | return matrix[len1, len2] 42 | -------------------------------------------------------------------------------- /shorttext/metrics/dynprog/jaccard.py: -------------------------------------------------------------------------------- 1 | 2 | from itertools import product 3 | 4 | from .dldist import damerau_levenshtein 5 | from .lcp import longest_common_prefix 6 | 7 | 8 | def similarity(word1, word2): 9 | """ Return the similarity between the two words. 10 | 11 | Return the similarity between the two words, between 0 and 1 inclusively. 12 | The similarity is the maximum of the two values: 13 | - 1 - Damerau-Levenshtein distance between two words / maximum length of the two words 14 | - longest common prefix of the two words / maximum length of the two words 15 | 16 | Reference: Daniel E. Russ, Kwan-Yuet Ho, Calvin A. Johnson, Melissa C. Friesen, "Computer-Based Coding of Occupation Codes for Epidemiological Analyses," *2014 IEEE 27th International Symposium on Computer-Based Medical Systems* (CBMS), pp. 347-350. (2014) [`IEEE 17 | `_] 18 | 19 | :param word1: a word 20 | :param word2: a word 21 | :return: similarity, between 0 and 1 inclusively 22 | :type word1: str 23 | :type word2: str 24 | :rtype: float 25 | """ 26 | maxlen = max(len(word1), len(word2)) 27 | editdistance = damerau_levenshtein(word1, word2) 28 | lcp = longest_common_prefix(word1, word2) 29 | return max(1. - float(editdistance)/maxlen, float(lcp)/maxlen) 30 | 31 | 32 | def soft_intersection_list(tokens1, tokens2): 33 | """ Return the soft number of intersections between two lists of tokens. 34 | 35 | :param tokens1: list of tokens. 36 | :param tokens2: list of tokens. 37 | :return: set of matched token pairs with their similarities. 38 | :type tokens1: list 39 | :type tokens2: list 40 | :rtype: set 41 | """ 42 | intersected_list = [((token1, token2), similarity(token1, token2)) for token1, token2 in product(tokens1, tokens2)] 43 | intersected_list = sorted(intersected_list, key=lambda item: item[1], reverse=True) 44 | 45 | included_list = set() 46 | used_tokens1 = set() 47 | used_tokens2 = set() 48 | for (token1, token2), sim in intersected_list: 49 | if (not (token1 in used_tokens1)) and (not (token2 in used_tokens2)): 50 | included_list.add(((token1, token2), sim)) 51 | used_tokens1.add(token1) 52 | used_tokens2.add(token2) 53 | 54 | return included_list 55 | 56 | 57 | def soft_jaccard_score(tokens1, tokens2): 58 | """ Return the soft Jaccard score of the two lists of tokens, between 0 and 1 inclusively. 59 | 60 | Reference: Daniel E. Russ, Kwan-Yuet Ho, Calvin A. 
Friesen, "Computer-Based Coding of Occupation Codes for Epidemiological Analyses," *2014 IEEE 27th International Symposium on Computer-Based Medical Systems* (CBMS), pp. 347-350. (2014) [`IEEE 61 | `_] 62 | 63 | :param tokens1: list of tokens. 64 | :param tokens2: list of tokens. 65 | :return: soft Jaccard score, between 0 and 1 inclusively. 66 | :type tokens1: list 67 | :type tokens2: list 68 | :rtype: float 69 | """ 70 | intersection_list = soft_intersection_list(tokens1, tokens2) 71 | num_intersections = sum([item[1] for item in intersection_list]) 72 | num_unions = len(tokens1) + len(tokens2) - num_intersections 73 | return float(num_intersections)/float(num_unions) 74 | -------------------------------------------------------------------------------- /shorttext/metrics/dynprog/lcp.py: -------------------------------------------------------------------------------- 1 | 2 | import numba as nb 3 | 4 | 5 | @nb.njit 6 | def longest_common_prefix(word1: str, word2: str) -> int: 7 | """ Calculate the longest common prefix (LCP) between two words. 8 | 9 | :param word1: first word 10 | :param word2: seccond word 11 | :return: longest common prefix (LCP) 12 | :type word1: str 13 | :type word2: str 14 | :rtype: int 15 | """ 16 | lcp = 0 17 | for i in range(min(len(word1), len(word2))): 18 | if word1[i] == word2[i]: 19 | lcp += 1 20 | else: 21 | break 22 | return lcp 23 | -------------------------------------------------------------------------------- /shorttext/metrics/embedfuzzy/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .jaccard import jaccardscore_sents -------------------------------------------------------------------------------- /shorttext/metrics/embedfuzzy/jaccard.py: -------------------------------------------------------------------------------- 1 | 2 | from itertools import product 3 | 4 | import numpy as np 5 | from scipy.spatial.distance import cosine 6 | 7 | from ...utils import tokenize 8 | 9 | 10 | def jaccardscore_sents(sent1, sent2, wvmodel, sim_words=lambda vec1, vec2: 1-cosine(vec1, vec2)): 11 | """ Compute the Jaccard score between sentences based on their word similarities. 12 | 13 | :param sent1: first sentence 14 | :param sent2: second sentence 15 | :param wvmodel: word-embeding model 16 | :param sim_words: function for calculating the similarities between a pair of word vectors (default: cosine) 17 | :return: soft Jaccard score 18 | :type sent1: str 19 | :type sent2: str 20 | :type wvmodel: gensim.models.keyedvectors.KeyedVectors 21 | :type sim_words: function 22 | :rtype: float 23 | """ 24 | tokens1 = tokenize(sent1) 25 | tokens2 = tokenize(sent2) 26 | tokens1 = list(filter(lambda w: w in wvmodel, tokens1)) 27 | tokens2 = list(filter(lambda w: w in wvmodel, tokens2)) 28 | allowable1 = [True] * len(tokens1) 29 | allowable2 = [True] * len(tokens2) 30 | 31 | simdict = {(i, j): sim_words(wvmodel[tokens1[i]], wvmodel[tokens2[j]]) 32 | for i, j in product(range(len(tokens1)), range(len(tokens2)))} 33 | 34 | intersection = 0.0 35 | simdictitems = sorted(simdict.items(), key=lambda s: s[1], reverse=True) 36 | for idxtuple, sim in simdictitems: 37 | i, j = idxtuple 38 | if allowable1[i] and allowable2[j]: 39 | intersection += sim 40 | allowable1[i] = False 41 | allowable2[j] = False 42 | 43 | union = len(tokens1) + len(tokens2) - intersection 44 | 45 | if union > 0: 46 | return intersection / union 47 | elif intersection == 0: 48 | return 1. 
49 | else: 50 | return np.inf 51 | -------------------------------------------------------------------------------- /shorttext/metrics/transformers/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .bertscore import BERTScorer 3 | -------------------------------------------------------------------------------- /shorttext/metrics/transformers/bertscore.py: -------------------------------------------------------------------------------- 1 | 2 | from itertools import product 3 | 4 | import numpy as np 5 | import torch 6 | from ...utils.transformers import WrappedBERTEncoder 7 | 8 | 9 | class BERTScorer: 10 | """ This is the class that compute the BERTScores between sentences. BERTScores 11 | include recall BERTScores, precision BERTScores, and F1 BERTSscores. 12 | For more information, please refer to this paper: 13 | 14 | Tianyi Zhang, Varsha Kishore, Felix Wu, Kilian Q. Weinberger, Yoav Artzi, 15 | "BERTScore: Evaluating Text Generation with BERT," arXiv:1904.09675 (2019). [`arXiv 16 | `_] 17 | 18 | """ 19 | def __init__( 20 | self, 21 | model=None, 22 | tokenizer=None, 23 | max_length=48, 24 | nbencodinglayers=4, 25 | device='cpu' 26 | ): 27 | """ It is the class that compute the BERTScores between sentences. 28 | 29 | :param model: BERT model (default: None, with model `bert-base-uncase` to be used) 30 | :param tokenizer: BERT tokenizer (default: None, with model `bert-base-uncase` to be used) 31 | :param max_length: maximum number of tokens of each sentence (default: 48) 32 | :param nbencodinglayers: number of encoding layers (taking the last layers to encode the sentences, default: 4) 33 | :param device: device the language model is stored (default: `cpu`) 34 | :type model: str 35 | :type tokenizer: str 36 | :type max_length: int 37 | :type device: str 38 | """ 39 | self.encoder = WrappedBERTEncoder( 40 | model=model, 41 | tokenizer=tokenizer, 42 | max_length=max_length, 43 | nbencodinglayers=nbencodinglayers, 44 | device=device) 45 | self.device = self.encoder.device 46 | self.cosine_fcn = torch.nn.CosineSimilarity(dim=0).to(self.device) 47 | 48 | def compute_matrix(self, sentence_a, sentence_b): 49 | """ Compute the table of similarities between all pairs of tokens. This is used 50 | for calculating the BERTScores. 51 | 52 | :param sentence_a: first sentence 53 | :param sentence_b: second sentence 54 | :return: similarity matrix of between tokens in two sentences 55 | :type sentence_a: str 56 | :type sentence_b: str 57 | :rtype: numpy.ndarray 58 | """ 59 | cos = self.cosine_fcn 60 | _, sentence_a_tokens_embeddings, sentence_a_tokens = self.encoder.encode_sentences([sentence_a]) 61 | _, sentence_b_tokens_embeddings, sentence_b_tokens = self.encoder.encode_sentences([sentence_b]) 62 | 63 | similarity_matrix = torch.zeros((len(sentence_a_tokens[0])-2, len(sentence_b_tokens[0])-2), 64 | device=self.device) 65 | 66 | for i, j in product(range(len(sentence_a_tokens[0])-2), range(len(sentence_b_tokens[0])-2)): 67 | similarity_matrix[i, j] = cos(sentence_a_tokens_embeddings[0][i+1], 68 | sentence_b_tokens_embeddings[0][j+1]) 69 | 70 | return similarity_matrix 71 | 72 | def recall_bertscore(self, reference_sentence, test_sentence): 73 | """ Compute the recall BERTScore between two sentences. 
74 | 75 | :param reference_sentence: reference sentence 76 | :param test_sentence: test sentence 77 | :return: recall BERTScore between the two sentences 78 | :type reference_sentence: str 79 | :type test_sentence: str 80 | :rtype: float 81 | """ 82 | similarity_matrix = self.compute_matrix(reference_sentence, test_sentence) 83 | recall = torch.mean(torch.max(similarity_matrix, dim=1).values) 84 | return float(recall.detach().cpu().numpy()) 85 | 86 | def precision_bertscore(self, reference_sentence, test_sentence): 87 | """ Compute the precision BERTScore between two sentences. 88 | 89 | :param reference_sentence: reference sentence 90 | :param test_sentence: test sentence 91 | :return: precision BERTScore between the two sentences 92 | :type reference_sentence: str 93 | :type test_sentence: str 94 | :rtype: float 95 | """ 96 | similarity_matrix = self.compute_matrix(reference_sentence, test_sentence) 97 | precision = torch.mean(torch.max(similarity_matrix, dim=0).values) 98 | return float(precision.detach().cpu().numpy()) 99 | 100 | def f1score_bertscore(self, reference_sentence, test_sentence): 101 | """ Compute the F1 BERTScore between two sentences. 102 | 103 | :param reference_sentence: reference sentence 104 | :param test_sentence: test sentence 105 | :return: F1 BERTScore between the two sentences 106 | :type reference_sentence: str 107 | :type test_sentence: str 108 | :rtype: float 109 | """ 110 | recall = self.recall_bertscore(reference_sentence, test_sentence) 111 | precision = self.precision_bertscore(reference_sentence, test_sentence) 112 | return 2*recall*precision/(recall+precision) 113 | -------------------------------------------------------------------------------- /shorttext/metrics/wasserstein/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .wordmoverdist import word_mover_distance_linprog, word_mover_distance -------------------------------------------------------------------------------- /shorttext/metrics/wasserstein/wordmoverdist.py: -------------------------------------------------------------------------------- 1 | 2 | from itertools import product 3 | import warnings 4 | 5 | import numpy as np 6 | from scipy.spatial.distance import euclidean 7 | from scipy.sparse import csr_matrix 8 | from scipy.optimize import linprog 9 | 10 | from ...utils.gensim_corpora import tokens_to_fracdict 11 | 12 | 13 | def word_mover_distance_linprog(first_sent_tokens, second_sent_tokens, wvmodel, distancefunc=euclidean): 14 | """ Compute the Word Mover's distance (WMD) between the two given lists of tokens, and return the LP problem class. 15 | 16 | Using methods of linear programming, supported by `scipy.optimize.linprog`, calculate the WMD between two lists of words. A word-embedding 17 | model has to be provided. The whole `scipy.optimize.OptimizeResult` object is returned. 18 | 19 | Reference: Matt J. Kusner, Yu Sun, Nicholas I. Kolkin, Kilian Q. Weinberger, "From Word Embeddings to Document Distances," *ICML* (2015). 20 | 21 | :param first_sent_tokens: first list of tokens. 22 | :param second_sent_tokens: second list of tokens. 23 | :param wvmodel: word-embedding models. 24 | :param distancefunc: distance function that takes two numpy ndarray. 
25 | :return: the whole result of the linear programming problem 26 | :type first_sent_tokens: list 27 | :type second_sent_tokens: list 28 | :type wvmodel: gensim.models.keyedvectors.KeyedVectors 29 | :type distancefunc: function 30 | :rtype: scipy.optimize.OptimizeResult 31 | """ 32 | nb_tokens_first_sent = len(first_sent_tokens) 33 | nb_tokens_second_sent = len(second_sent_tokens) 34 | 35 | all_tokens = list(set(first_sent_tokens+second_sent_tokens)) 36 | wordvecs = {token: wvmodel[token] for token in all_tokens} 37 | 38 | first_sent_buckets = tokens_to_fracdict(first_sent_tokens) 39 | second_sent_buckets = tokens_to_fracdict(second_sent_tokens) 40 | 41 | collapsed_idx_func = lambda i, j: i*nb_tokens_second_sent + j 42 | 43 | # assigning T 44 | T = np.zeros(nb_tokens_first_sent*nb_tokens_second_sent) 45 | for i, j in product(range(nb_tokens_first_sent), range(nb_tokens_second_sent)): 46 | T[collapsed_idx_func(i, j)] = distancefunc(wordvecs[first_sent_tokens[i]], 47 | wordvecs[second_sent_tokens[j]]) 48 | 49 | # assigning Aeq and beq 50 | Aeq = csr_matrix( 51 | (nb_tokens_first_sent+nb_tokens_second_sent, 52 | nb_tokens_first_sent*nb_tokens_second_sent) 53 | ) 54 | beq = np.zeros(nb_tokens_first_sent+nb_tokens_second_sent) 55 | for i in range(nb_tokens_first_sent): 56 | for j in range(nb_tokens_second_sent): 57 | Aeq[i, collapsed_idx_func(i, j)] = 1. 58 | beq[i] = first_sent_buckets[first_sent_tokens[i]] 59 | for j in range(nb_tokens_second_sent): 60 | for i in range(nb_tokens_first_sent): 61 | Aeq[j+nb_tokens_first_sent, collapsed_idx_func(i, j)] = 1. 62 | beq[j+nb_tokens_first_sent] = second_sent_buckets[second_sent_tokens[j]] 63 | 64 | return linprog(T, A_eq=Aeq, b_eq=beq) 65 | 66 | 67 | def word_mover_distance(first_sent_tokens, second_sent_tokens, wvmodel, distancefunc=euclidean, lpFile=None): 68 | """ Compute the Word Mover's distance (WMD) between the two given lists of tokens. 69 | 70 | Using methods of linear programming, calculate the WMD between two lists of words. A word-embedding 71 | model has to be provided. WMD is returned. 72 | 73 | Reference: Matt J. Kusner, Yu Sun, Nicholas I. Kolkin, Kilian Q. Weinberger, "From Word Embeddings to Document Distances," *ICML* (2015). 74 | 75 | :param first_sent_tokens: first list of tokens. 76 | :param second_sent_tokens: second list of tokens. 77 | :param wvmodel: word-embedding models. 78 | :param distancefunc: distance function that takes two numpy ndarray. 79 | :param lpFile: deprecated, kept for backward incompatibility. (default: None) 80 | :return: Word Mover's distance (WMD) 81 | :type first_sent_tokens: list 82 | :type second_sent_tokens: list 83 | :type wvmodel: gensim.models.keyedvectors.KeyedVectors 84 | :type distancefunc: function 85 | :type lpFile: str 86 | :rtype: float 87 | """ 88 | linprog_result = word_mover_distance_linprog(first_sent_tokens, second_sent_tokens, wvmodel, 89 | distancefunc=distancefunc) 90 | if lpFile is not None: 91 | warnings.warn('The parameter `lpFile` (value: {}) is not used; parameter is deprecated as ' + \ 92 | 'the package `pulp` is no longer used. 
Check your code if there is a dependency on ' + \ 93 | 'this parameter.') 94 | return linprog_result['fun'] 95 | -------------------------------------------------------------------------------- /shorttext/smartload.py: -------------------------------------------------------------------------------- 1 | 2 | from .utils import standard_text_preprocessor_1 3 | from .utils import compactmodel_io as cio 4 | from .utils import classification_exceptions as e 5 | from .utils import load_DocumentTermMatrix 6 | from .classifiers import load_varnnlibvec_classifier, load_sumword2vec_classifier 7 | from .generators import load_autoencoder_topicmodel, load_gensimtopicmodel 8 | from .generators import loadSeq2SeqWithKeras, loadCharBasedSeq2SeqGenerator 9 | from .classifiers import load_autoencoder_topic_sklearnclassifier, load_gensim_topicvec_sklearnclassifier 10 | from .classifiers import load_maxent_classifier 11 | from .spell import loadSCRNNSpellCorrector 12 | 13 | 14 | def smartload_compact_model(filename, wvmodel, preprocessor=standard_text_preprocessor_1(), vecsize=None): 15 | """ Load appropriate classifier or model from the binary model. 16 | 17 | The second parameter, `wvmodel`, can be set to `None` if no Word2Vec model is needed. 18 | 19 | :param filename: path of the compact model file 20 | :param wvmodel: Word2Vec model 21 | :param preprocessor: text preprocessor (Default: `shorttext.utils.textpreprocess.standard_text_preprocessor_1`) 22 | :param vecsize: length of embedded vectors in the model (Default: None, extracted directly from the word-embedding model) 23 | :return: appropriate classifier or model 24 | :raise: AlgorithmNotExistException 25 | :type filename: str 26 | :type wvmodel: gensim.models.keyedvectors.KeyedVectors 27 | :type preprocessor: function 28 | :type vecsize: int 29 | """ 30 | classifier_name = cio.get_model_classifier_name(filename) 31 | if classifier_name in ['ldatopic', 'lsitopic', 'rptopic']: 32 | return load_gensimtopicmodel(filename, preprocessor=preprocessor, compact=True) 33 | elif classifier_name in ['kerasautoencoder']: 34 | return load_autoencoder_topicmodel(filename, preprocessor=preprocessor, compact=True) 35 | elif classifier_name in ['topic_sklearn']: 36 | topicmodel = cio.get_model_config_field(filename, 'topicmodel') 37 | if topicmodel in ['ldatopic', 'lsitopic', 'rptopic']: 38 | return load_gensim_topicvec_sklearnclassifier(filename, preprocessor=preprocessor, compact=True) 39 | elif topicmodel in ['kerasautoencoder']: 40 | return load_autoencoder_topic_sklearnclassifier(filename, preprocessor=preprocessor, compact=True) 41 | else: 42 | raise e.AlgorithmNotExistException(topicmodel) 43 | elif classifier_name in ['nnlibvec']: 44 | return load_varnnlibvec_classifier(wvmodel, filename, compact=True, vecsize=vecsize) 45 | elif classifier_name in ['sumvec']: 46 | return load_sumword2vec_classifier(wvmodel, filename, compact=True, vecsize=vecsize) 47 | elif classifier_name in ['maxent']: 48 | return load_maxent_classifier(filename, compact=True) 49 | elif classifier_name in ['dtm']: 50 | return load_DocumentTermMatrix(filename, compact=True) 51 | elif classifier_name in ['kerasseq2seq']: 52 | return loadSeq2SeqWithKeras(filename, compact=True) 53 | elif classifier_name in ['charbases2s']: 54 | return loadCharBasedSeq2SeqGenerator(filename, compact=True) 55 | elif classifier_name in ['scrnn_spell']: 56 | return loadSCRNNSpellCorrector(filename, compact=True) 57 | else: 58 | raise e.AlgorithmNotExistException(classifier_name) 
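# Example usage sketch (comment only; illustrative and not part of the original module,
# with hypothetical placeholder file paths):
#
#     from shorttext.smartload import smartload_compact_model
#     from shorttext.utils import load_word2vec_model
#
#     # classifiers that need no word embeddings (e.g. 'maxent' or the topic models)
#     maxent_classifier = smartload_compact_model('my_maxent_classifier.bin', None)
#
#     # embedding-based classifiers ('nnlibvec', 'sumvec') need a word-embedding model
#     wvmodel = load_word2vec_model('/path/to/word2vec_model.bin')
#     sumvec_classifier = smartload_compact_model('my_sumvec_classifier.bin', wvmodel)
#     print(sumvec_classifier.score('quantum field theory'))   # dict of label -> score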
-------------------------------------------------------------------------------- /shorttext/spell/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .basespellcorrector import SpellCorrector 3 | 4 | from .norvig import NorvigSpellCorrector 5 | from .sakaguchi import SCRNNSpellCorrector, loadSCRNNSpellCorrector 6 | 7 | -------------------------------------------------------------------------------- /shorttext/spell/basespellcorrector.py: -------------------------------------------------------------------------------- 1 | 2 | from abc import ABC, abstractmethod 3 | 4 | from ..utils.classification_exceptions import NotImplementedException 5 | 6 | 7 | class SpellCorrector(ABC): 8 | """ Base class for all spell corrector. 9 | 10 | This class is not implemented; this is an "abstract class." 11 | 12 | """ 13 | @abstractmethod 14 | def train(self, text): 15 | """ Train the spell corrector with the given corpus. 16 | 17 | :param text: training corpus 18 | :type text: str 19 | """ 20 | raise NotImplementedException() 21 | 22 | @abstractmethod 23 | def correct(self, word): 24 | """ Recommend a spell correction to given the word. 25 | 26 | :param word: word to be checked 27 | :return: recommended correction 28 | :type word: str 29 | :rtype: str 30 | """ 31 | return word 32 | -------------------------------------------------------------------------------- /shorttext/spell/editor.py: -------------------------------------------------------------------------------- 1 | 2 | import numba as nb 3 | 4 | 5 | @nb.njit 6 | def compute_set_edits1(word): 7 | letters = 'abcdefghijklmnopqrstuvwxyz' 8 | 9 | splits = [(word[:i], word[i:]) for i in range(len(word) + 1)] 10 | deletes = [L + R[1:] for L, R in splits if R] 11 | transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1] 12 | replaces = [L + c + R[1:] for L, R in splits if R for c in letters] 13 | inserts = [L + c + R for L, R in splits for c in letters] 14 | 15 | returned_set = set(deletes + transposes + replaces + inserts) 16 | 17 | return returned_set 18 | 19 | 20 | @nb.njit 21 | def compute_set_edits2(word): 22 | return (e2 for e1 in compute_set_edits1(word) for e2 in compute_set_edits1(e1)) 23 | -------------------------------------------------------------------------------- /shorttext/spell/norvig.py: -------------------------------------------------------------------------------- 1 | 2 | # reference: https://norvig.com/spell-correct.html 3 | 4 | import re 5 | from collections import Counter 6 | 7 | from . import SpellCorrector 8 | from .editor import compute_set_edits1, compute_set_edits2 9 | 10 | 11 | class NorvigSpellCorrector(SpellCorrector): 12 | """ Spell corrector described by Peter Norvig in his blog. (https://norvig.com/spell-correct.html) 13 | 14 | """ 15 | def __init__(self): 16 | """ Instantiate the class 17 | 18 | """ 19 | self.train('') 20 | 21 | def train(self, text): 22 | """ Given the text, train the spell corrector. 23 | 24 | :param text: training corpus 25 | :type text: str 26 | """ 27 | self.words = re.findall('\\w+', text.lower()) 28 | self.WORDS = Counter(self.words) 29 | self.N = sum(self.WORDS.values()) 30 | 31 | def P(self, word): 32 | """ Compute the probability of the words randomly sampled from the training corpus. 
33 | 34 | :param word: a word 35 | :return: probability of the word sampled randomly in the corpus 36 | :type word: str 37 | :rtype: float 38 | """ 39 | return self.WORDS[word] / float(self.N) 40 | 41 | def correct(self, word): 42 | """ Recommend a spelling correction to the given word 43 | 44 | :param word: a word 45 | :return: recommended correction 46 | :type word: str 47 | :rtype: str 48 | """ 49 | return max(self.candidates(word), key=self.P) 50 | 51 | def known(self, words): 52 | """ Filter away the words that are not found in the training corpus. 53 | 54 | :param words: list of words 55 | :return: list of words that can be found in the training corpus 56 | :type words: list 57 | :rtype: list 58 | """ 59 | return set(w for w in words if w in self.WORDS) 60 | 61 | def candidates(self, word): 62 | """ List potential candidates for corrected spelling to the given words. 63 | 64 | :param word: a word 65 | :return: list of recommended corrections 66 | :type word: str 67 | :rtype: list 68 | """ 69 | return (self.known([word]) or self.known(compute_set_edits1(word)) or self.known(compute_set_edits2(word)) or [word]) 70 | 71 | -------------------------------------------------------------------------------- /shorttext/stack/__init__.py: -------------------------------------------------------------------------------- 1 | from .stacking import StackedGeneralization, LogisticStackedGeneralization -------------------------------------------------------------------------------- /shorttext/utils/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from . import misc 3 | from . import kerasmodel_io 4 | from . import classification_exceptions 5 | from . import gensim_corpora 6 | from . import textpreprocessing 7 | from . import compactmodel_io 8 | from . import dtm 9 | 10 | from .textpreprocessing import tokenize, stemword 11 | from .textpreprocessing import text_preprocessor, standard_text_preprocessor_1, standard_text_preprocessor_2 12 | 13 | from .wordembed import load_word2vec_model, load_fasttext_model, load_poincare_model, shorttext_to_avgvec 14 | from .wordembed import RESTfulKeyedVectors 15 | from .dtm import load_DocumentTermMatrix 16 | 17 | from .dtm import DocumentTermMatrix, load_DocumentTermMatrix 18 | 19 | from .transformers import WrappedBERTEncoder 20 | 21 | -------------------------------------------------------------------------------- /shorttext/utils/classification_exceptions.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | class ModelNotTrainedException(Exception): 6 | def __init__(self): 7 | self.message = 'Model not trained.' 8 | 9 | 10 | class AlgorithmNotExistException(Exception): 11 | def __init__(self, algoname): 12 | self.message = 'Algorithm '+algoname+' not exist.' 13 | 14 | 15 | class WordEmbeddingModelNotExistException(Exception): 16 | def __init__(self, path): 17 | self.message = 'Given path of the word-embedding model not exist: '+path 18 | 19 | 20 | class UnequalArrayLengthsException(Exception): 21 | def __init__(self, arr1, arr2): 22 | self.message = 'Unequal lengths: '+str(len(arr1))+" and "+str(len(arr2)) 23 | 24 | 25 | class NotImplementedException(Exception): 26 | def __init__(self): 27 | self.message = 'Method not implemented.' 
28 | 29 | 30 | class IncorrectClassificationModelFileException(Exception): 31 | def __init__(self, expectedname, actualname): 32 | self.message = 'Incorrect model (expected: '+expectedname+' ; actual: '+actualname+')' 33 | 34 | 35 | class OperationNotDefinedException(Exception): 36 | def __init__(self, opname): 37 | self.message = 'Operation '+opname+' not defined' 38 | -------------------------------------------------------------------------------- /shorttext/utils/dtm.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | from gensim.corpora import Dictionary 4 | from gensim.models import TfidfModel 5 | from scipy.sparse import dok_matrix 6 | 7 | import pickle 8 | 9 | from .compactmodel_io import CompactIOMachine 10 | from .classification_exceptions import NotImplementedException 11 | 12 | 13 | dtm_suffices = ['_docids.pkl', '_dictionary.dict', '_dtm.pkl'] 14 | 15 | class DocumentTermMatrix(CompactIOMachine): 16 | """ Document-term matrix for corpus. 17 | 18 | This is a class that handles the document-term matrix (DTM). With a given corpus, users can 19 | retrieve term frequency, document frequency, and total term frequency. Weighing using tf-idf 20 | can be applied. 21 | """ 22 | def __init__(self, corpus, docids=None, tfidf=False): 23 | """ Initialize the document-term matrix (DTM) class with a given corpus. 24 | 25 | If document IDs (docids) are given, they will be stored and output as appropriate. 26 | If not, the documents are indexed by numbers. 27 | 28 | Users can choose to weigh by tf-idf. The default is not to weigh. 29 | 30 | The corpus has to be a list of lists, with each of the inside lists containing all the tokens 31 | in each document. 32 | 33 | :param corpus: corpus. 34 | :param docids: list of designated document IDs. (Default: None) 35 | :param tfidf: whether to weigh using tf-idf. (Default: False) 36 | :type corpus: list 37 | :type docids: list 38 | :type tfidf: bool 39 | """ 40 | CompactIOMachine.__init__(self, {'classifier': 'dtm'}, 'dtm', dtm_suffices) 41 | if docids is None: 42 | self.docid_dict = {i: i for i in range(len(corpus))} 43 | self.docids = list(range(len(corpus))) 44 | else: 45 | if len(docids) == len(corpus): 46 | self.docid_dict = {docid: i for i, docid in enumerate(docids)} 47 | self.docids = docids 48 | elif len(docids) > len(corpus): 49 | self.docid_dict = {docid: i for i, docid in zip(range(len(corpus)), docids[:len(corpus)])} 50 | self.docids = docids[:len(corpus)] 51 | else: 52 | self.docid_dict = {docid: i for i, docid in enumerate(docids)} 53 | self.docid_dict.update({i: i for i in range(len(docids), len(corpus))}) 54 | self.docids = list(docids) + list(range(len(docids), len(corpus))) 55 | # generate DTM 56 | self.generate_dtm(corpus, tfidf=tfidf) 57 | 58 | def generate_dtm(self, corpus, tfidf=False): 59 | """ Generate the inside document-term matrix and other peripheral information 60 | objects. This is run when the class is instantiated. 61 | 62 | :param corpus: corpus. 63 | :param tfidf: whether to weigh using tf-idf. 
(Default: False) 64 | :return: None 65 | :type corpus: list 66 | :type tfidf: bool 67 | """ 68 | self.dictionary = Dictionary(corpus) 69 | self.dtm = dok_matrix((len(corpus), len(self.dictionary)), dtype=np.float_) 70 | bow_corpus = [self.dictionary.doc2bow(doctokens) for doctokens in corpus] 71 | if tfidf: 72 | weighted_model = TfidfModel(bow_corpus) 73 | bow_corpus = weighted_model[bow_corpus] 74 | for docid in self.docids: 75 | for tokenid, count in bow_corpus[self.docid_dict[docid]]: 76 | self.dtm[self.docid_dict[docid], tokenid] = count 77 | 78 | def get_termfreq(self, docid, token): 79 | """ Retrieve the term frequency of a given token in a particular document. 80 | 81 | Given a token and a particular document ID, compute the term frequency for this 82 | token. If `tfidf` is set to `True` while instantiating the class, it returns the weighted 83 | term frequency. 84 | 85 | :param docid: document ID 86 | :param token: term or token 87 | :return: term frequency or weighted term frequency of the given token in this document (designated by docid) 88 | :type docid: any 89 | :type token: str 90 | :rtype: numpy.float 91 | """ 92 | return self.dtm[self.docid_dict[docid], self.dictionary.token2id[token]] 93 | 94 | def get_total_termfreq(self, token): 95 | """ Retrieve the total occurrences of the given token. 96 | 97 | Compute the total occurrences of the term in all documents. If `tfidf` is set to `True` 98 | while instantiating the class, it returns the sum of weighted term frequency. 99 | 100 | :param token: term or token 101 | :return: total occurrences of the given token 102 | :type token: str 103 | :rtype: numpy.float 104 | """ 105 | return sum(self.dtm[:, self.dictionary.token2id[token]].values()) 106 | 107 | def get_doc_frequency(self, token): 108 | """ Retrieve the document frequency of the given token. 109 | 110 | Compute the document frequency of the given token, i.e., the number of documents 111 | that this token can be found. 112 | 113 | :param token: term or token 114 | :return: document frequency of the given token 115 | :type token: str 116 | :rtype: int 117 | """ 118 | return len(self.dtm[:, self.dictionary.token2id[token]].values()) 119 | 120 | def get_token_occurences(self, token): 121 | """ Retrieve the term frequencies of a given token in all documents. 122 | 123 | Compute the term frequencies of the given token for all the documents. If `tfidf` is 124 | set to be `True` while instantiating the class, it returns the weighted term frequencies. 125 | 126 | This method returns a dictionary of term frequencies with the corresponding document IDs 127 | as the keys. 128 | 129 | :param token: term or token 130 | :return: a dictionary of term frequencies with the corresponding document IDs as the keys 131 | :type token: str 132 | :rtype: dict 133 | """ 134 | return {self.docids[docidx]: count for (docidx, _), count in self.dtm[:, self.dictionary.token2id[token]].items()} 135 | 136 | def get_doc_tokens(self, docid): 137 | """ Retrieve the term frequencies of all tokens in the given document. 138 | 139 | Compute the term frequencies of all tokens for the given document. If `tfidf` is 140 | set to be `True` while instantiating the class, it returns the weighted term frequencies. 141 | 142 | This method returns a dictionary of term frequencies with the tokens as the keys. 
143 | 144 | :param docid: document ID 145 | :return: a dictionary of term frequencies with the tokens as the keys 146 | :type docid: any 147 | :rtype: dict 148 | """ 149 | return {self.dictionary[tokenid]: count for (_, tokenid), count in self.dtm[self.docid_dict[docid], :].items()} 150 | 151 | def generate_dtm_dataframe(self): 152 | """ Generate the data frame of the document-term matrix. (shorttext <= 1.0.3) 153 | 154 | Now it raises exception. 155 | 156 | :return: data frame of the document-term matrix 157 | :rtype: pandas.DataFrame 158 | :raise: NotImplementedException 159 | """ 160 | raise NotImplementedException() 161 | 162 | def savemodel(self, prefix): 163 | """ Save the model. 164 | 165 | :param prefix: prefix of the files 166 | :return: None 167 | :type prefix: str 168 | """ 169 | pickle.dump(self.docids, open(prefix+'_docids.pkl', 'wb')) 170 | self.dictionary.save(prefix+'_dictionary.dict') 171 | pickle.dump(self.dtm, open(prefix+'_dtm.pkl', 'wb')) 172 | 173 | def loadmodel(self, prefix): 174 | """ Load the model. 175 | 176 | :param prefix: prefix of the files 177 | :return: None 178 | :type prefix: str 179 | """ 180 | self.docids = pickle.load(open(prefix+'_docids.pkl', 'rb')) 181 | self.docid_dict = {docid: i for i, docid in enumerate(self.docids)} 182 | self.dictionary = Dictionary.load(prefix+'_dictionary.dict') 183 | self.dtm = pickle.load(open(prefix+'_dtm.pkl', 'rb')) 184 | 185 | 186 | def load_DocumentTermMatrix(filename, compact=True): 187 | """ Load presaved Document-Term Matrix (DTM). 188 | 189 | Given the file name (if `compact` is `True`) or the prefix (if `compact` is `False`), 190 | return the document-term matrix. 191 | 192 | :param filename: file name or prefix 193 | :param compact: whether it is a compact model. (Default: `True`) 194 | :return: document-term matrix 195 | :type filename: str 196 | :type compact: bool 197 | :rtype: DocumentTermMatrix 198 | """ 199 | dtm = DocumentTermMatrix([[]]) 200 | if compact: 201 | dtm.load_compact_model(filename) 202 | else: 203 | dtm.loadmodel(filename) 204 | return dtm -------------------------------------------------------------------------------- /shorttext/utils/gensim_corpora.py: -------------------------------------------------------------------------------- 1 | 2 | from collections import defaultdict 3 | 4 | import gensim 5 | 6 | from .textpreprocessing import tokenize 7 | 8 | 9 | def generate_gensim_corpora(classdict, preprocess_and_tokenize=tokenize): 10 | """ Generate gensim bag-of-words dictionary and corpus. 11 | 12 | Given a text data, a dict with keys being the class labels, and the values 13 | being the list of short texts, in the same format output by `shorttext.data.data_retrieval`, 14 | return a gensim dictionary and corpus. 
15 | 16 | :param classdict: text data, a dict with keys being the class labels, and each value is a list of short texts 17 | :param proprocess_and_tokenize: preprocessor function, that takes a short sentence, and return a list of tokens (Default: `shorttext.utils.tokenize`) 18 | :return: a tuple, consisting of a gensim dictionary, a corpus, and a list of class labels 19 | :type classdict: dict 20 | :type proprocess_and_tokenize: function 21 | :rtype: (gensim.corpora.Dictionary, list, list) 22 | """ 23 | classlabels = sorted(classdict.keys()) 24 | doc = [preprocess_and_tokenize(' '.join(classdict[classlabel])) for classlabel in classlabels] 25 | dictionary = gensim.corpora.Dictionary(doc) 26 | corpus = [dictionary.doc2bow(doctokens) for doctokens in doc] 27 | return dictionary, corpus, classlabels 28 | 29 | 30 | def save_corpus(dictionary, corpus, prefix): 31 | """ Save gensim corpus and dictionary. 32 | 33 | :param dictionary: dictionary to save 34 | :param corpus: corpus to save 35 | :param prefix: prefix of the files to save 36 | :return: None 37 | :type dictionary: gensim.corpora.Dictionary 38 | :type corpus: list 39 | :type prefix: str 40 | """ 41 | dictionary.save(prefix+'_dictionary.dict') 42 | gensim.corpora.MmCorpus.serialize(prefix+'_corpus.mm', corpus) 43 | 44 | 45 | def load_corpus(prefix): 46 | """ Load gensim corpus and dictionary. 47 | 48 | :param prefix: prefix of the file to load 49 | :return: corpus and dictionary 50 | :type prefix: str 51 | :rtype: tuple 52 | """ 53 | corpus = gensim.corpora.MmCorpus(prefix+'_corpus.mm') 54 | dictionary = gensim.corpora.Dictionary.load(prefix+'_dictionary.dict') 55 | return corpus, dictionary 56 | 57 | 58 | def update_corpus_labels(dictionary, corpus, newclassdict, preprocess_and_tokenize=tokenize): 59 | """ Update corpus with additional training data. 60 | 61 | With the additional training data, the dictionary and corpus are updated. 62 | 63 | :param dictionary: original dictionary 64 | :param corpus: original corpus 65 | :param newclassdict: additional training data 66 | :param preprocess_and_tokenize: preprocessor function, that takes a short sentence, and return a list of tokens (Default: `shorttext.utils.tokenize`) 67 | :return: a tuple, an updated corpus, and the new corpus (for updating model) 68 | :type dictionary: gensim.corpora.Dictionary 69 | :type corpus: list 70 | :type newclassdict: dict 71 | :type preprocess_and_tokenize: function 72 | :rtype: tuple 73 | """ 74 | 75 | newdoc = [preprocess_and_tokenize(' '.join(newclassdict[classlabel])) for classlabel in sorted(newclassdict.keys())] 76 | newcorpus = [dictionary.doc2bow(doctokens) for doctokens in newdoc] 77 | corpus += newcorpus 78 | 79 | return corpus, newcorpus 80 | 81 | 82 | def tokens_to_fracdict(tokens): 83 | """ Return normalized bag-of-words (BOW) vectors. 84 | 85 | :param tokens: list of tokens. 86 | :type tokens: list 87 | :return: normalized vectors of counts of tokens as a `dict` 88 | :rtype: dict 89 | """ 90 | cntdict = defaultdict(lambda : 0) 91 | for token in tokens: 92 | cntdict[token] += 1 93 | totalcnt = sum(cntdict.values()) 94 | return {token: float(cnt)/totalcnt for token, cnt in cntdict.items()} -------------------------------------------------------------------------------- /shorttext/utils/kerasmodel_io.py: -------------------------------------------------------------------------------- 1 | 2 | from tensorflow.keras.models import model_from_json 3 | 4 | 5 | def save_model(nameprefix, model): 6 | """ Save a keras sequential model into files. 
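# A minimal round-trip sketch for save_model/load_model; the tiny keras model and the
# file prefix '/tmp/tiny_model' are invented for illustration.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense

tiny_model = Sequential([Input(shape=(8,)), Dense(4, activation='relu'), Dense(2, activation='softmax')])
save_model('/tmp/tiny_model', tiny_model)        # writes /tmp/tiny_model.json and /tmp/tiny_model.weights.h5
reloaded_model = load_model('/tmp/tiny_model')   # rebuilds the architecture and reloads the weights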
7 | 8 | Given a keras sequential model, save the model with the given file path prefix. 9 | It saves the model into a JSON file, and an HDF5 file (.h5). 10 | 11 | :param nameprefix: Prefix of the paths of the model files 12 | :param model: keras sequential model to be saved 13 | :return: None 14 | :type nameprefix: str 15 | :type model: keras.models.Model 16 | """ 17 | model_json = model.to_json() 18 | open(nameprefix+'.json', 'w').write(model_json) 19 | model.save_weights(nameprefix+'.weights.h5') 20 | 21 | 22 | def load_model(nameprefix): 23 | """ Load a keras sequential model from files. 24 | 25 | Given the prefix of the file paths, load a keras sequential model from 26 | a JSON file and an HDF5 file. 27 | 28 | :param nameprefix: Prefix of the paths of the model files 29 | :return: keras sequential model 30 | :type nameprefix: str 31 | :rtype: keras.models.Model 32 | """ 33 | model = model_from_json(open(nameprefix+'.json', 'r').read()) 34 | model.load_weights(nameprefix+'.weights.h5') 35 | return model -------------------------------------------------------------------------------- /shorttext/utils/misc.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | def textfile_generator(textfile, linebreak=True, encoding=None): 4 | """ Return a generator that reads lines in a text file. 5 | 6 | :param textfile: file object of a text file 7 | :param linebreak: whether to return a line break at the end of each line (Default: True) 8 | :param encoding: encoding of the text file (Default: None) 9 | :return: a generator that reads lines in a text file 10 | :type textfile: file 11 | :type linebreak: bool 12 | :type encoding: str 13 | :rtype: generator 14 | """ 15 | for t in textfile: 16 | if len(t) > 0: 17 | if encoding is None: 18 | yield t.strip() + ('\n' if linebreak else '') 19 | else: 20 | yield t.decode(encoding).strip() + ('\n' if linebreak else '') 21 | 22 | 23 | class SinglePoolExecutor: 24 | """ It is a wrapper for Python `map` functions. 25 | 26 | """ 27 | def map(self, func, *iterables): 28 | """ Refer to Python `map` documentation. 29 | 30 | :param func: function 31 | :param iterables: iterables to loop 32 | :return: generator for the map 33 | :type func: function 34 | :type iterables: iterables 35 | :rtype: map 36 | """ 37 | return map(func, *iterables) 38 | -------------------------------------------------------------------------------- /shorttext/utils/textpreprocessing.py: -------------------------------------------------------------------------------- 1 | 2 | import re 3 | import os 4 | import codecs 5 | 6 | import snowballstemmer 7 | 8 | # tokenizer 9 | def tokenize(s: str) -> list[str]: 10 | return s.split(' ') 11 | 12 | 13 | # stemmer 14 | class StemmerSingleton: 15 | def __new__(cls): 16 | if not hasattr(cls, 'instance'): 17 | cls.instance = super(StemmerSingleton, cls).__new__(cls) 18 | cls.stemmer = snowballstemmer.stemmer('english') 19 | return cls.instance 20 | 21 | def __call__(cls, s: str) -> str: 22 | return cls.stemmer.stemWord(s) 23 | 24 | def stemword(s: str) -> str: 25 | return StemmerSingleton()(s) 26 | 27 | 28 | def preprocess_text(text, pipeline): 29 | """ Preprocess the text according to the given pipeline. 30 | 31 | Given the pipeline, which is a list of functions that process an 32 | input text to another text (e.g., stemming, lemmatizing, removing punctuations etc.), 33 | preprocess the text. 
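# A minimal pipeline sketch: each stage is a str -> str function, applied left to right.
# The two lambdas below are invented for illustration.
toy_pipeline = [lambda s: s.lower(), lambda s: s.replace('-', ' ')]
preprocess_text('Short-Text Mining', toy_pipeline)    # -> 'short text mining'
toy_preprocessor = text_preprocessor(toy_pipeline)    # text_preprocessor (defined below) wraps the same pipeline
toy_preprocessor('Short-Text Mining')                 # -> 'short text mining'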
34 | 35 | :param text: text to be preprocessed 36 | :param pipeline: a list of functions that convert a text to another text 37 | :return: preprocessed text 38 | :type text: str 39 | :type pipeline: list 40 | :rtype: str 41 | """ 42 | return text if len(pipeline)==0 else preprocess_text(pipeline[0](text), pipeline[1:]) 43 | 44 | 45 | def text_preprocessor(pipeline): 46 | """ Return the function that preprocesses text according to the pipeline. 47 | 48 | Given the pipeline, which is a list of functions that process an 49 | input text to another text (e.g., stemming, lemmatizing, removing punctuations etc.), 50 | return a function that preprocesses an input text outlined by the pipeline, essentially 51 | a function that runs :func:`~preprocess_text` with the specified pipeline. 52 | 53 | :param pipeline: a list of functions that convert a text to another text 54 | :return: a function that preprocesses text according to the pipeline 55 | :type pipeline: list 56 | :rtype: function 57 | """ 58 | return lambda text: preprocess_text(text, pipeline) 59 | 60 | 61 | def oldschool_standard_text_preprocessor(stopwordsfile): 62 | """ Return a commonly used text preprocessor. 63 | 64 | Return a text preprocessor that is commonly used, with the following steps: 65 | 66 | - removing special characters, 67 | - removing numerals, 68 | - converting all alphabets to lower cases, 69 | - removing stop words, and 70 | - stemming the words (using Porter stemmer). 71 | 72 | This function calls :func:`~text_preprocessor`. 73 | 74 | :param stopwordsfile: file object of the list of stop words 75 | :type stopwordsfile: file 76 | :return: a function that preprocesses text according to the pipeline 77 | :rtype: function 78 | """ 79 | # load stop words file 80 | stopwordset = set([stopword.strip() for stopword in stopwordsfile]) 81 | stopwordsfile.close() 82 | 83 | # the pipeline 84 | pipeline = [lambda s: re.sub('[^\w\s]', '', s), 85 | lambda s: re.sub('[\d]', '', s), 86 | lambda s: s.lower(), 87 | lambda s: ' '.join(filter(lambda s: not (s in stopwordset), tokenize(s))), 88 | lambda s: ' '.join([stemword(stemmed_token) for stemmed_token in tokenize(s)]) 89 | ] 90 | return text_preprocessor(pipeline) 91 | 92 | 93 | def standard_text_preprocessor_1(): 94 | """ Return a commonly used text preprocessor. 95 | 96 | Return a text preprocessor that is commonly used, with the following steps: 97 | 98 | - removing special characters, 99 | - removing numerals, 100 | - converting all alphabets to lower cases, 101 | - removing stop words (NLTK list), and 102 | - stemming the words (using Porter stemmer). 103 | 104 | This function calls :func:`~oldschool_standard_text_preprocessor`. 105 | 106 | :return: a function that preprocesses text according to the pipeline 107 | :rtype: function 108 | """ 109 | # load stop words 110 | this_dir, _ = os.path.split(__file__) 111 | stopwordsfile = codecs.open(os.path.join(this_dir, 'stopwords.txt'), 'r', 'utf-8') 112 | 113 | return oldschool_standard_text_preprocessor(stopwordsfile) 114 | 115 | 116 | def standard_text_preprocessor_2(): 117 | """ Return a commonly used text preprocessor. 118 | 119 | Return a text preprocessor that is commonly used, with the following steps: 120 | 121 | - removing special characters, 122 | - removing numerals, 123 | - converting all alphabets to lower cases, 124 | - removing stop words (NLTK list minus negation terms), and 125 | - stemming the words (using Porter stemmer). 126 | 127 | This function calls :func:`~oldschool_standard_text_preprocessor`. 
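# Behaviour of the two standard preprocessors, as exercised in test/test_textpreprocessing.py:
import shorttext
p1 = shorttext.utils.standard_text_preprocessor_1()
p2 = shorttext.utils.standard_text_preprocessor_2()
p1('I do not think.')   # -> 'think'      (negation words removed along with the other stop words)
p2('I do not think.')   # -> 'not think'  (negation words kept)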
128 | 129 | :return: a function that preprocesses text according to the pipeline 130 | :rtype: function 131 | """ 132 | # load stop words 133 | this_dir, _ = os.path.split(__file__) 134 | stopwordsfile = codecs.open(os.path.join(this_dir, 'nonneg_stopwords.txt'), 'r', 'utf-8') 135 | 136 | return oldschool_standard_text_preprocessor(stopwordsfile) 137 | -------------------------------------------------------------------------------- /shorttext/utils/transformers.py: -------------------------------------------------------------------------------- 1 | 2 | # reference: https://towardsdatascience.com/word-embeddings-in-2020-review-with-code-examples-11eb39a1ee6d 3 | 4 | import warnings 5 | 6 | import numpy as np 7 | import torch 8 | from transformers import BertTokenizer, BertModel 9 | 10 | 11 | class BERTObject: 12 | """ The base class for BERT model that contains the embedding model and the tokenizer. 13 | 14 | For more information, please refer to the following paper: 15 | 16 | Jacob Devlin, Ming-Wei Chang, Kenton Lee, Kristina Toutanova, "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding," arXiv:1810.04805 (2018). [`arXiv 17 | `_] 18 | 19 | """ 20 | def __init__(self, model=None, tokenizer=None, trainable=False, device='cpu'): 21 | """ The base class for BERT model that contains the embedding model and the tokenizer. 22 | 23 | :param model: BERT model (default: None, with model `bert-base-uncase` to be used) 24 | :param tokenizer: BERT tokenizer (default: None, with model `bert-base-uncase` to be used) 25 | :param device: device the language model is stored (default: `cpu`) 26 | :type model: str 27 | :type tokenizer: str 28 | :type device: str 29 | """ 30 | if device == 'cuda': 31 | if torch.cuda.is_available(): 32 | self.device = torch.device('cuda') 33 | else: 34 | warnings.warn("CUDA is not available. Device set to 'cpu'.") 35 | self.device = torch.device('cpu') 36 | else: 37 | self.device = torch.device(device) 38 | 39 | self.trainable = trainable 40 | 41 | if model is None: 42 | self.model = BertModel.from_pretrained('bert-base-uncased', 43 | output_hidden_states=True)\ 44 | .to(self.device) 45 | else: 46 | self.model = model.to(self.device) 47 | 48 | if self.trainable: 49 | self.model.train() 50 | 51 | if tokenizer is None: 52 | self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True) 53 | else: 54 | self.tokenizer = tokenizer 55 | 56 | self.number_hidden_layers = self.model.config.num_hidden_layers 57 | 58 | 59 | class WrappedBERTEncoder(BERTObject): 60 | """ This is the class that encodes sentences with BERT models. 61 | 62 | For more information, please refer to the following paper: 63 | 64 | Jacob Devlin, Ming-Wei Chang, Kenton Lee, Kristina Toutanova, "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding," arXiv:1810.04805 (2018). [`arXiv 65 | `_] 66 | 67 | """ 68 | def __init__( 69 | self, 70 | model=None, 71 | tokenizer=None, 72 | max_length=48, 73 | nbencodinglayers=4, 74 | trainable=False, 75 | device='cpu' 76 | ): 77 | """ This is the constructor of the class that encodes sentences with BERT models. 
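# A minimal encoding sketch using the defaults documented below (bert-base-uncased,
# max_length=48, CPU); the example sentence is invented, and the first call downloads
# the pretrained weights.
encoder = WrappedBERTEncoder()
sent_vecs, token_vecs, tokens = encoder.encode_sentences(
    ['Short text mining is fun.'], numpy=True)
# For bert-base-uncased, sent_vecs should have shape (1, 768) and token_vecs
# (1, 48, 3072), i.e. the last four hidden layers concatenated per token;
# tokens holds the corresponding WordPiece token lists.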
78 | 79 | :param model: BERT model (default: None, with model `bert-base-uncase` to be used) 80 | :param tokenizer: BERT tokenizer (default: None, with model `bert-base-uncase` to be used) 81 | :param max_length: maximum number of tokens of each sentence (default: 48) 82 | :param nbencodinglayers: number of encoding layers (taking the last layers to encode the sentences, default: 4) 83 | :param device: device the language model is stored (default: `cpu`) 84 | :type model: str 85 | :type tokenizer: str 86 | :type max_length: int 87 | :type device: str 88 | """ 89 | super(WrappedBERTEncoder, self).__init__( 90 | model=model, 91 | tokenizer=tokenizer, 92 | trainable=trainable, 93 | device=device 94 | ) 95 | self.max_length = max_length 96 | self.nbencodinglayers = nbencodinglayers 97 | 98 | def encode_sentences(self, sentences, numpy=False): 99 | """ Encode the sentences into numerical vectors, given by a list of strings. 100 | 101 | It can output either torch tensors or numpy arrays. 102 | 103 | :param sentences: list of strings to encode 104 | :param numpy: output a numpy array if `True`; otherwise, output a torch tensor. (Default: `False`) 105 | :return: encoded vectors for the sentences 106 | :type sentences: list 107 | :type numpy: bool 108 | :rtype: numpy.array or torch.Tensor 109 | """ 110 | input_ids = [] 111 | tokenized_texts = [] 112 | 113 | for sentence in sentences: 114 | marked_text = '[CLS]' + sentence + '[SEP]' 115 | 116 | encoded_dict = self.tokenizer.encode_plus( 117 | sentence, 118 | add_special_tokens=True, 119 | truncation=True, 120 | max_length=self.max_length, 121 | padding='max_length', 122 | return_tensors='pt' 123 | ) 124 | 125 | tokenized_texts.append(self.tokenizer.tokenize(marked_text)) 126 | input_ids.append(encoded_dict['input_ids']) 127 | 128 | input_ids = torch.cat(input_ids, dim=0) 129 | segments_id = torch.LongTensor(np.array(input_ids > 0)) 130 | input_ids = input_ids.to(self.device) 131 | segments_id = segments_id.to(self.device) 132 | 133 | if self.trainable: 134 | output = self.model(input_ids, segments_id) 135 | sentences_embeddings = output[1] 136 | hidden_state = output[2] 137 | else: 138 | with torch.no_grad(): 139 | output = self.model(input_ids, segments_id) 140 | sentences_embeddings = output[1] 141 | hidden_state = output[2] 142 | 143 | alllayers_token_embeddings = torch.stack(hidden_state, dim=0) 144 | alllayers_token_embeddings = alllayers_token_embeddings.permute(1, 2, 0, 3) # swap dimensions to [sentence, tokens, hidden layers, features] 145 | processed_embeddings = alllayers_token_embeddings[:, :, (self.number_hidden_layers+1-self.nbencodinglayers):, :] 146 | 147 | token_embeddings = torch.reshape(processed_embeddings, (len(sentences), self.max_length, -1)) 148 | 149 | if numpy: 150 | sentences_embeddings = sentences_embeddings.detach().numpy() 151 | token_embeddings = token_embeddings.detach().numpy() 152 | 153 | return sentences_embeddings, token_embeddings, tokenized_texts 154 | -------------------------------------------------------------------------------- /shorttext/utils/wordembed.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import gensim 4 | from gensim.models import KeyedVectors 5 | from gensim.models.keyedvectors import KeyedVectors 6 | from gensim.models.poincare import PoincareModel, PoincareKeyedVectors 7 | import requests 8 | 9 | from .textpreprocessing import tokenize 10 | 11 | 12 | def load_word2vec_model(path, binary=True): 13 | """ Load a pre-trained 
Word2Vec model. 14 | 15 | :param path: path of the file of the pre-trained Word2Vec model 16 | :param binary: whether the file is in binary format (Default: True) 17 | :return: a pre-trained Word2Vec model 18 | :type path: str 19 | :type binary: bool 20 | :rtype: gensim.models.keyedvectors.KeyedVectors 21 | """ 22 | return KeyedVectors.load_word2vec_format(path, binary=binary) 23 | 24 | 25 | def load_fasttext_model(path, encoding='utf-8'): 26 | """ Load a pre-trained FastText model. 27 | 28 | :param path: path of the file of the pre-trained FastText model 29 | :return: a pre-trained FastText model 30 | :type path: str 31 | :rtype: gensim.models.keyedvectors.FastTextKeyedVectors 32 | """ 33 | return gensim.models.fasttext.load_facebook_vectors(path, encoding=encoding) 34 | 35 | 36 | def load_poincare_model(path, word2vec_format=True, binary=False): 37 | """ Load a Poincare embedding model. 38 | 39 | :param path: path of the file of the pre-trained Poincare embedding model 40 | :param word2vec_format: whether to load from word2vec format (default: True) 41 | :param binary: binary format (default: False) 42 | :return: a pre-trained Poincare embedding model 43 | :type path: str 44 | :type word2vec_format: bool 45 | :type binary: bool 46 | :rtype: gensim.models.poincare.PoincareKeyedVectors 47 | """ 48 | if word2vec_format: 49 | return PoincareKeyedVectors.load_word2vec_format(path, binary=binary) 50 | else: 51 | return PoincareModel.load(path).kv 52 | 53 | 54 | def shorttext_to_avgvec(shorttext, wvmodel): 55 | """ Convert the short text into an averaged embedded vector representation. 56 | 57 | Given a short sentence, it converts all the tokens into embedded vectors according to 58 | the given word-embedding model, sums 59 | them up, and normalize the resulting vector. It returns the resulting vector 60 | that represents this short sentence. 61 | 62 | :param shorttext: a short sentence 63 | :param wvmodel: word-embedding model 64 | :return: an embedded vector that represents the short sentence 65 | :type shorttext: str 66 | :type wvmodel: gensim.models.keyedvectors.KeyedVectors 67 | :rtype: numpy.ndarray 68 | """ 69 | vec = np.sum( 70 | [ 71 | wvmodel[token] 72 | if token in wvmodel 73 | else np.array([1.]*wvmodel.vector_size) / np.sqrt(wvmodel.vector_size) 74 | for token in tokenize(shorttext) 75 | ], 76 | axis=0 77 | ) 78 | 79 | # normalize 80 | norm = np.linalg.norm(vec) 81 | if norm != 0: 82 | vec /= norm 83 | 84 | return vec 85 | 86 | 87 | class RESTfulKeyedVectors(KeyedVectors): 88 | """ RESTfulKeyedVectors, for connecting to the API of the preloaded word-embedding vectors loaded 89 | by `WordEmbedAPI`. 90 | 91 | This class inherits from :class:`gensim.models.keyedvectors.KeyedVectors`. 92 | 93 | """ 94 | def __init__(self, url, port='5000'): 95 | """ Initialize the class. 
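# A minimal client sketch, assuming a WordEmbedAPI service is already running at
# http://localhost:5000 with a word-embedding model preloaded; the queried words
# are invented for illustration.
wv_client = RESTfulKeyedVectors('http://localhost', port='5000')
wv_client.similarity('apple', 'orange')   # POSTs to the /similarity endpoint
wv_client.get_vector('apple')             # numpy array, or KeyError if the token is unknown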
96 | 97 | :param url: URL of the API, usually `http://localhost` 98 | :param port: Port number 99 | :type url: str 100 | :type port: str 101 | """ 102 | self.url = url 103 | self.port = port 104 | 105 | def closer_than(self, entity1, entity2): 106 | """ 107 | 108 | :param entity1: word 1 109 | :param entity2: word 2 110 | :type entity1: str 111 | :type entity2: str 112 | :return: list of words 113 | :rtype: list 114 | """ 115 | r = requests.post(self.url + ':' + self.port + '/closerthan', 116 | json={'entity1': entity1, 'entity2': entity2}) 117 | return r.json() 118 | 119 | def distance(self, entity1, entity2): 120 | """ 121 | 122 | :param entity1: word 1 123 | :param entity2: word 2 124 | :type entity1: str 125 | :type entity2: str 126 | :return: distance between two words 127 | :rtype: float 128 | """ 129 | r = requests.post(self.url + ':' + self.port + '/distance', 130 | json={'entity1': entity1, 'entity2': entity2}) 131 | return r.json()['distance'] 132 | 133 | def distances(self, entity1, other_entities=()): 134 | """ 135 | 136 | :param entity1: word 137 | :param other_entities: list of words 138 | :type entity1: str 139 | :type other_entities: list 140 | :return: list of distances between `entity1` and each word in `other_entities` 141 | :rtype: list 142 | """ 143 | r = requests.post(self.url + ':' + self.port + '/distances', 144 | json={'entity1': entity1, 'other_entities': other_entities}) 145 | return np.array(r.json()['distances'], dtype=np.float32) 146 | 147 | def get_vector(self, entity): 148 | """ 149 | 150 | :param entity: word 151 | :type: str 152 | :return: word vectors of the given word 153 | :rtype: numpy.ndarray 154 | """ 155 | r = requests.post(self.url + ':' + self.port + '/get_vector', json={'token': entity}) 156 | returned_dict = r.json() 157 | if 'vector' in returned_dict: 158 | return np.array(returned_dict['vector']) 159 | else: 160 | raise KeyError('The token {} does not exist in the model.'.format(entity)) 161 | 162 | def most_similar(self, **kwargs): 163 | """ 164 | 165 | :param kwargs: 166 | :return: 167 | """ 168 | r = requests.post(self.url + ':' + self.port + '/most_similar', json=kwargs) 169 | return [tuple(pair) for pair in r.json()] 170 | 171 | def most_similar_to_given(self, entity1, entities_list): 172 | """ 173 | 174 | :param entity1: word 175 | :param entities_list: list of words 176 | :type entity1: str 177 | :type entities_list: list 178 | :return: list of similarities between the given word and each word in `entities_list` 179 | :rtype: list 180 | """ 181 | r = requests.post(self.url + ':' + self.port + '/most_similar_to_given', 182 | json={'entity1': entity1, 'entities_list': entities_list}) 183 | return r.json()['token'] 184 | 185 | def rank(self, entity1, entity2): 186 | """ 187 | 188 | :param entity1: word 1 189 | :param entity2: word 2 190 | :type entity1: str 191 | :type entity2: str 192 | :return: rank 193 | :rtype: int 194 | """ 195 | r = requests.post(self.url + ':' + self.port + '/rank', 196 | json={'entity1': entity1, 'entity2': entity2}) 197 | return r.json()['rank'] 198 | 199 | def save(self, fname_or_handle, **kwargs): 200 | """ 201 | 202 | :param fname_or_handle: 203 | :param kwargs: 204 | :return: 205 | """ 206 | raise IOError('The class RESTfulKeyedVectors do not persist models to a file.') 207 | 208 | def similarity(self, entity1, entity2): 209 | """ 210 | 211 | :param entity1: word 1 212 | :param entity2: word 2 213 | :return: similarity between two words 214 | :type entity1: str 215 | :type entity2: str 216 | :rtype: float 217 
| """ 218 | r = requests.post(self.url + ':' + self.port + '/similarity', 219 | json={'entity1': entity1, 'entity2': entity2}) 220 | return r.json()['similarity'] 221 | 222 | # reference: https://radimrehurek.com/gensim/models/keyedvectors.html 223 | -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | This package has automated unit-tests for shorttext. 3 | """ 4 | -------------------------------------------------------------------------------- /test/test_charonehot.py: -------------------------------------------------------------------------------- 1 | 2 | import unittest 3 | from urllib.request import urlopen 4 | 5 | import shorttext 6 | 7 | 8 | class TestCharOneHot(unittest.TestCase): 9 | def test_BigTxt(self): 10 | chartovec_encoder = shorttext.generators.initSentenceToCharVecEncoder( 11 | urlopen('http://norvig.com/big.txt'), 12 | encoding='utf-8' 13 | ) 14 | self.assertEqual(93, len(chartovec_encoder.dictionary)) 15 | self.assertEqual('\n', chartovec_encoder.signalchar) 16 | 17 | 18 | if __name__ == '__main__': 19 | unittest.main() 20 | -------------------------------------------------------------------------------- /test/test_dtm.py: -------------------------------------------------------------------------------- 1 | 2 | import unittest 3 | import re 4 | 5 | import pandas as pd 6 | import shorttext 7 | from shorttext.utils import stemword, tokenize 8 | 9 | 10 | class TestDTM(unittest.TestCase): 11 | def test_inaugural(self): 12 | # preparing data 13 | usprez = shorttext.data.inaugural() 14 | docids = sorted(usprez.keys()) 15 | usprez = [' '.join(usprez[docid]) for docid in docids] 16 | usprezdf = pd.DataFrame({'yrprez': docids, 'speech': usprez}) 17 | usprezdf = usprezdf[['yrprez', 'speech']] 18 | 19 | # preprocesser defined 20 | pipeline = [lambda s: re.sub('[^\w\s]', '', s), 21 | lambda s: re.sub('[\d]', '', s), 22 | lambda s: s.lower(), 23 | lambda s: ' '.join([stemword(token) for token in tokenize(s)]) 24 | ] 25 | txtpreprocessor = shorttext.utils.text_preprocessor(pipeline) 26 | 27 | # corpus making 28 | docids = list(usprezdf['yrprez']) 29 | corpus = [txtpreprocessor(speech).split(' ') for speech in usprezdf['speech']] 30 | 31 | # making DTM 32 | dtm = shorttext.utils.DocumentTermMatrix(corpus, docids=docids, tfidf=True) 33 | 34 | # check results 35 | self.assertEqual(len(dtm.dictionary), 5256) 36 | self.assertAlmostEqual(dtm.get_token_occurences(stemword('change'))['2009-Obama'], 0.0138, 37 | places=3) 38 | numdocs, numtokens = dtm.dtm.shape 39 | self.assertEqual(numdocs, 56) 40 | self.assertEqual(numtokens, 5256) 41 | self.assertAlmostEqual(dtm.get_total_termfreq('government'), 0.27865372986738407, 42 | places=3) 43 | 44 | 45 | if __name__ == '__main__': 46 | unittest.main() 47 | -------------------------------------------------------------------------------- /test/test_fuzzylogic.py: -------------------------------------------------------------------------------- 1 | 2 | import unittest 3 | 4 | import shorttext 5 | 6 | 7 | class TestFuzzyLogic(unittest.TestCase): 8 | def test_similarity(self): 9 | self.assertEqual(shorttext.metrics.dynprog.dldist.damerau_levenshtein('debug', 'deubg'), 1) 10 | self.assertEqual(shorttext.metrics.dynprog.dldist.damerau_levenshtein('intrdependence', 'interdpeendencae'), 3) 11 | self.assertEqual(shorttext.metrics.dynprog.lcp.longest_common_prefix('debug', 'debuag'), 4) 12 | 13 | def 
test_transposition(self): 14 | self.assertEqual(shorttext.metrics.dynprog.dldist.damerau_levenshtein('independent', 'indeepndent'), 1) 15 | self.assertEqual(shorttext.metrics.dynprog.dldist.damerau_levenshtein('providence', 'porvidecne'), 2) 16 | 17 | def test_insertion(self): 18 | self.assertEqual(shorttext.metrics.dynprog.dldist.damerau_levenshtein('algorithm', 'algorithms'), 1) 19 | self.assertEqual(shorttext.metrics.dynprog.dldist.damerau_levenshtein('algorithm', 'algoarithmm'), 2) 20 | 21 | def test_deletion(self): 22 | self.assertEqual(shorttext.metrics.dynprog.dldist.damerau_levenshtein('algorithm', 'algoithm'), 1) 23 | self.assertEqual(shorttext.metrics.dynprog.dldist.damerau_levenshtein('algorithm', 'algorith'), 1) 24 | self.assertEqual(shorttext.metrics.dynprog.dldist.damerau_levenshtein('algorithm', 'algrihm'), 2) 25 | 26 | def test_correct(self): 27 | self.assertEqual(shorttext.metrics.dynprog.dldist.damerau_levenshtein('python', 'python'), 0) 28 | self.assertEqual(shorttext.metrics.dynprog.dldist.damerau_levenshtein('sosad', 'sosad'), 0) 29 | 30 | def test_jaccard(self): 31 | self.assertAlmostEqual(shorttext.metrics.dynprog.jaccard.similarity('diver', 'driver'), 5./6.) 32 | 33 | if __name__ == '__main__': 34 | unittest.main() -------------------------------------------------------------------------------- /test/test_norvigspell.py: -------------------------------------------------------------------------------- 1 | 2 | import unittest 3 | from urllib.request import urlopen 4 | 5 | import shorttext 6 | 7 | 8 | class TestSpellCheck(unittest.TestCase): 9 | def setUp(self): 10 | self.text = urlopen('http://norvig.com/big.txt').read() 11 | self.text = self.text.decode('utf-8') 12 | 13 | def test_norvig(self): 14 | speller = shorttext.spell.NorvigSpellCorrector() 15 | speller.train(self.text) 16 | self.assertEqual(speller.correct('apple'), 'apple') 17 | self.assertEqual(speller.correct('appl'), 'apply') 18 | 19 | 20 | if __name__ == '__main__': 21 | unittest.main() 22 | -------------------------------------------------------------------------------- /test/test_sakaguchispell.py: -------------------------------------------------------------------------------- 1 | 2 | import unittest 3 | import os 4 | 5 | from shorttext.spell.sakaguchi import SCRNNSpellCorrector 6 | from shorttext.smartload import smartload_compact_model 7 | 8 | 9 | class TestSCRNN(unittest.TestCase): 10 | def generalproc(self, operation, typo='langudge', recommendation='language'): 11 | corrector = SCRNNSpellCorrector(operation) 12 | corrector.train('I am a nerd . 
Natural language processing is sosad .') 13 | corrector.save_compact_model('./sosad_'+operation+'_sakaguchi.bin') 14 | 15 | corrector2 = smartload_compact_model('./sosad_'+operation+'_sakaguchi.bin', None) 16 | self.assertEqual(corrector.correct(typo), corrector2.correct(typo)) 17 | 18 | print('typo: '+typo+' recommendation: '+corrector.correct(typo)+' ('+recommendation+')') 19 | 20 | os.remove('./sosad_'+operation+'_sakaguchi.bin') 21 | 22 | def test_NOISE_INSERT(self): 23 | self.generalproc('NOISE-INSERT') 24 | 25 | def test_NOISE_DELETE(self): 26 | self.generalproc('NOISE-DELETE') 27 | 28 | def test_NOISE_REPLACE(self): 29 | self.generalproc('NOISE-REPLACE', typo='procsesing', recommendation='processing') 30 | 31 | def test_JUMBLE_WHOLE(self): 32 | self.generalproc('JUMBLE-WHOLE') 33 | 34 | def test_JUMBLE_BEG(self): 35 | self.generalproc('JUMBLE-BEG') 36 | 37 | def test_JUMBLE_END(self): 38 | self.generalproc('JUMBLE-END') 39 | 40 | def test_JUMBLE_INT(self): 41 | self.generalproc('JUMBLE-INT') 42 | 43 | 44 | if __name__ == '__main__': 45 | unittest.main() 46 | 47 | -------------------------------------------------------------------------------- /test/test_stacking.py: -------------------------------------------------------------------------------- 1 | 2 | import unittest 3 | import os 4 | 5 | import shorttext 6 | from shorttext.stack import LogisticStackedGeneralization 7 | from shorttext.smartload import smartload_compact_model 8 | from sklearn.svm import SVC 9 | 10 | 11 | class TestStacking(unittest.TestCase): 12 | def setUp(self): 13 | self.nihdict = shorttext.data.nihreports(sample_size=None) 14 | 15 | def tearDown(self): 16 | for filepath in os.listdir('.'): 17 | if filepath.endswith('.bin'): 18 | os.remove(os.path.join('.', filepath)) 19 | 20 | def training_stacking(self): 21 | # loading NIH Reports 22 | nihdict = {'NCCAM': self.nihdict['NCCAM'], 'NCATS': self.nihdict['NCATS']} 23 | 24 | # maxent 25 | maxent_classifier = shorttext.classifiers.MaxEntClassifier() 26 | maxent_classifier.train(nihdict, nb_epochs=100) 27 | maxent_classifier.save_compact_model('./bio_maxent.bin') 28 | 29 | # SVM + LDA 30 | topicmodeler = shorttext.generators.LDAModeler() 31 | topicmodeler.train(nihdict, 8) 32 | topicdisclassifier = shorttext.classifiers.TopicVectorCosineDistanceClassifier(topicmodeler) 33 | topicmodeler.save_compact_model('./bio_lda.bin') 34 | svm_classifier = shorttext.classifiers.TopicVectorSkLearnClassifier(topicmodeler, SVC()) 35 | svm_classifier.train(nihdict) 36 | svm_classifier.save_compact_model('./bio_svm.bin') 37 | 38 | # logistic 39 | stacked_classifier = LogisticStackedGeneralization({'maxent': maxent_classifier, 40 | 'svm': svm_classifier, 41 | 'topiccosine': topicdisclassifier}) 42 | stacked_classifier.train(nihdict) 43 | stacked_classifier.save_compact_model('./bio_logistics.bin') 44 | 45 | return maxent_classifier, topicmodeler, svm_classifier, stacked_classifier 46 | 47 | def comparedict(self, dict1, dict2): 48 | self.assertTrue(len(dict1)==len(dict2)) 49 | print(dict1, dict2) 50 | for classlabel in dict1: 51 | self.assertTrue(classlabel in dict2) 52 | self.assertAlmostEqual(dict1[classlabel], dict2[classlabel], places=4) 53 | 54 | def testStudies(self): 55 | # train 56 | maxent_classifier, topicmodeler, svm_classifier, stacked_classifier = self.training_stacking() 57 | topicdisclassifier = shorttext.classifiers.TopicVectorCosineDistanceClassifier(topicmodeler) 58 | 59 | # smartload 60 | maxent_classifier2 = smartload_compact_model('./bio_maxent.bin', None) 61 | 
topicmodeler2 = smartload_compact_model('./bio_lda.bin', None) 62 | topicdisclassifier2 = shorttext.classifiers.TopicVectorCosineDistanceClassifier(topicmodeler2) 63 | svm_classifier2 = smartload_compact_model('./bio_svm.bin', None) 64 | stacked_classifier2 = LogisticStackedGeneralization({'maxent': maxent_classifier2, 65 | 'svm': svm_classifier2, 66 | 'topiccosine': topicdisclassifier2}) 67 | stacked_classifier2.load_compact_model('./bio_logistics.bin') 68 | 69 | # compare 70 | terms = ['stem cell', 'grant', 'system biology'] 71 | for term in terms: 72 | print(term) 73 | print('maximum entropy') 74 | self.comparedict(maxent_classifier.score(term), maxent_classifier2.score(term)) 75 | print('LDA') 76 | self.comparedict(topicdisclassifier.score(term), topicdisclassifier2.score(term)) 77 | print('SVM') 78 | self.comparedict(svm_classifier.score(term), svm_classifier2.score(term)) 79 | print('combined') 80 | self.comparedict(stacked_classifier.score(term), stacked_classifier2.score(term)) 81 | 82 | def testSVM(self): 83 | # loading NIH Reports 84 | nihdict = {'NCCAM': self.nihdict['NCCAM'], 'NCATS': self.nihdict['NCATS']} 85 | 86 | # svm 87 | topicmodeler = shorttext.generators.LDAModeler() 88 | topicmodeler.train(nihdict, 16) 89 | svm_classifier = shorttext.classifiers.TopicVectorSkLearnClassifier(topicmodeler, SVC()) 90 | svm_classifier.train(nihdict) 91 | print('before saving...') 92 | print('--'.join(svm_classifier.classlabels)) 93 | print('--'.join(svm_classifier.topicmodeler.classlabels)) 94 | svm_classifier.save_compact_model('./bio_svm2.bin') 95 | print('after saving...') 96 | print('--'.join(svm_classifier.classlabels)) 97 | print('--'.join(svm_classifier.topicmodeler.classlabels)) 98 | 99 | # load 100 | svm_classifier2 = smartload_compact_model('./bio_svm2.bin', None) 101 | print('second classifier...') 102 | print(','.join(svm_classifier2.classlabels)) 103 | print(','.join(svm_classifier2.topicmodeler.classlabels)) 104 | 105 | # compare 106 | terms = ['stem cell', 'grant', 'system biology'] 107 | for term in terms: 108 | print(term) 109 | topicvec = svm_classifier.getvector(term) 110 | topicvec2 = svm_classifier2.getvector(term) 111 | print(topicvec) 112 | print(topicvec2) 113 | for idx, classlabel in enumerate(svm_classifier.classlabels): 114 | print(str(idx)+' '+classlabel) 115 | print(svm_classifier.classifier.score([topicvec], [idx])) 116 | for idx, classlabel in enumerate(svm_classifier2.classlabels): 117 | print(str(idx) + ' ' + classlabel) 118 | print(svm_classifier2.classifier.score([topicvec2], [idx])) 119 | print({classlabel: svm_classifier.classifier.score([topicvec], [idx]) 120 | for idx, classlabel in enumerate(svm_classifier.classlabels)}) 121 | print({classlabel: svm_classifier2.classifier.score([topicvec], [idx]) 122 | for idx, classlabel in enumerate(svm_classifier2.classlabels)}) 123 | 124 | for term in terms: 125 | print(term) 126 | self.comparedict(svm_classifier.score(term), svm_classifier2.score(term)) 127 | 128 | 129 | if __name__ == '__main__': 130 | unittest.main() 131 | 132 | -------------------------------------------------------------------------------- /test/test_textpreprocessing.py: -------------------------------------------------------------------------------- 1 | 2 | import unittest 3 | 4 | import shorttext 5 | 6 | class TestTextPreprocessing(unittest.TestCase): 7 | def testStandardPipeline(self): 8 | preprocessor = shorttext.utils.standard_text_preprocessor_1() 9 | self.assertEqual(preprocessor('I love you.'), 'love') 10 | 
self.assertEqual(preprocessor('Natural language processing and text mining on fire.'), 'natur languag process text mine fire') 11 | self.assertEqual(preprocessor('I do not think.'), 'think') 12 | 13 | def testStandPipelineDifferentStopwords(self): 14 | preprocessor = shorttext.utils.standard_text_preprocessor_2() 15 | self.assertEqual(preprocessor('I love you.'), 'love') 16 | self.assertEqual(preprocessor('Natural language processing and text mining on fire.'), 'natur languag process text mine fire') 17 | self.assertEqual(preprocessor('I do not think.'), 'not think') 18 | 19 | 20 | if __name__ == '__main__': 21 | unittest.main() -------------------------------------------------------------------------------- /test/test_var_nn_embedded_vec_classifier.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | import unittest 4 | import urllib 5 | 6 | import shorttext 7 | 8 | 9 | class TestVarNNEmbeddedVecClassifier(unittest.TestCase): 10 | def setUp(self): 11 | print("Downloading word-embedding model....") 12 | link = "https://shorttext-data-northernvirginia.s3.amazonaws.com/trainingdata/test_w2v_model.bin" 13 | filename = "test_w2v_model.bin" 14 | if not os.path.isfile("test_w2v_model.bin"): 15 | urllib.request.urlretrieve(link, filename) 16 | self.w2v_model = shorttext.utils.load_word2vec_model(filename, binary=True) # load word2vec model 17 | self.trainclass_dict = shorttext.data.subjectkeywords() # load training data 18 | 19 | def tearDown(self): 20 | print("Removing word-embedding model") 21 | if os.path.isfile("test_w2v_model.bin"): 22 | os.remove('test_w2v_model.bin') 23 | 24 | def comparedict(self, dict1, dict2): 25 | self.assertTrue(len(dict1)==len(dict2)) 26 | print(dict1, dict2) 27 | for classlabel in dict1: 28 | self.assertTrue(classlabel in dict2) 29 | self.assertAlmostEqual(dict1[classlabel], dict2[classlabel], places=4) 30 | 31 | def testCNNWordEmbedWithoutGensim(self): 32 | print("Testing CNN...") 33 | # create keras model using `CNNWordEmbed` class 34 | print("\tKeras model") 35 | keras_model = shorttext.classifiers.frameworks.CNNWordEmbed(wvmodel=self.w2v_model, 36 | nb_labels=len(self.trainclass_dict.keys())) 37 | 38 | # create and train classifier using keras model constructed above 39 | print("\tTraining") 40 | main_classifier = shorttext.classifiers.VarNNEmbeddedVecClassifier(self.w2v_model) 41 | main_classifier.train(self.trainclass_dict, keras_model, nb_epoch=2) 42 | 43 | # compute classification score 44 | print("\tTesting") 45 | score_vals = main_classifier.score('artificial intelligence') 46 | self.assertAlmostEqual(score_vals['mathematics'] + score_vals['physics'] + score_vals['theology'], 1.0, 1) 47 | 48 | def testDoubleCNNWordEmbedWithoutGensim(self): 49 | print("Testing DoubleCNN...") 50 | # create keras model using `DoubleCNNWordEmbed` class 51 | print("\tKeras model") 52 | keras_model = shorttext.classifiers.frameworks.DoubleCNNWordEmbed(wvmodel=self.w2v_model, 53 | nb_labels=len(self.trainclass_dict.keys())) 54 | 55 | # create and train classifier using keras model constructed above 56 | print("\tTraining") 57 | main_classifier = shorttext.classifiers.VarNNEmbeddedVecClassifier(self.w2v_model) 58 | main_classifier.train(self.trainclass_dict, keras_model, nb_epoch=2) 59 | 60 | # compute classification score 61 | print("\tTesting") 62 | score_vals = main_classifier.score('artificial intelligence') 63 | self.assertAlmostEqual(score_vals['mathematics'] + score_vals['physics'] + score_vals['theology'], 1.0, 1) 64 | 
65 | def testCLSTMWordEmbedWithoutGensim(self): 66 | print("Testing CLSTM...") 67 | # create keras model using `CLSTMWordEmbed` class 68 | print("\tKeras model") 69 | keras_model = shorttext.classifiers.frameworks.CLSTMWordEmbed(wvmodel=self.w2v_model, 70 | nb_labels=len(self.trainclass_dict.keys())) 71 | 72 | # create and train classifier using keras model constructed above 73 | print("\tTraining") 74 | main_classifier = shorttext.classifiers.VarNNEmbeddedVecClassifier(self.w2v_model) 75 | main_classifier.train(self.trainclass_dict, keras_model, nb_epoch=2) 76 | 77 | # compute classification score 78 | print("\tTesting") 79 | score_vals = main_classifier.score('artificial intelligence') 80 | self.assertAlmostEqual(score_vals['mathematics'] + score_vals['physics'] + score_vals['theology'], 1.0, 1) 81 | 82 | def testAASumEmbed(self): 83 | print("Testing SumEmbed") 84 | classifier = shorttext.classifiers.SumEmbeddedVecClassifier(self.w2v_model) 85 | classdict = shorttext.data.subjectkeywords() 86 | classifier.train(classdict) 87 | 88 | # compute 89 | self.comparedict(classifier.score('linear algebra'), 90 | {'mathematics': 0.9044698253778962, 91 | 'physics': 0.7586816549044926, 92 | 'theology': 0.1817602793151848}) 93 | self.comparedict(classifier.score('learning'), 94 | {'mathematics': 0.9037142562255835, 95 | 'physics': 0.7588376500004107, 96 | 'theology': 0.18039468994239538}) 97 | self.comparedict(classifier.score('eschatology'), 98 | {'mathematics': 0.3658578123294476, 99 | 'physics': 0.5996711864493821, 100 | 'theology': 0.9694560847986978}) 101 | 102 | 103 | if __name__ == '__main__': 104 | unittest.main() 105 | -------------------------------------------------------------------------------- /test/test_wmd.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | import urllib 4 | 5 | from shorttext.metrics.wasserstein import word_mover_distance 6 | from shorttext.utils import load_word2vec_model 7 | 8 | 9 | class TestWMD(unittest.TestCase): 10 | def setUp(self): 11 | print("Downloading word-embedding model....") 12 | link = "https://shorttext-data-northernvirginia.s3.amazonaws.com/trainingdata/test_w2v_model.bin" 13 | filename = "test_w2v_model.bin" 14 | if not os.path.isfile("test_w2v_model.bin"): 15 | urllib.request.urlretrieve(link, filename) 16 | self.w2v_model = load_word2vec_model(filename, binary=True) # load word2vec model 17 | 18 | def tearDown(self): 19 | print("Removing word-embedding model") 20 | if os.path.isfile("test_w2v_model.bin"): 21 | os.remove('test_w2v_model.bin') 22 | 23 | def calculate_wmd(self, tokens1, tokens2, answer): 24 | wdistance = word_mover_distance(tokens1, tokens2, self.w2v_model) 25 | self.assertAlmostEqual(wdistance, answer, delta=1e-3) 26 | 27 | def test_metrics(self): 28 | tokens1 = ['president', 'speaks'] 29 | tokens2 = ['president', 'talks'] 30 | known_answer = 0.19936788082122803 31 | self.calculate_wmd(tokens1, tokens2, known_answer) 32 | 33 | tokens1 = ['fan', 'book'] 34 | tokens2 = ['apple', 'orange'] 35 | known_answer = 1.8019972145557404 36 | self.calculate_wmd(tokens1, tokens2, known_answer) 37 | 38 | 39 | if __name__ == '__main__': 40 | unittest.main() 41 | --------------------------------------------------------------------------------