├── .circleci └── config.yml ├── .gitignore ├── .readthedocs.yml ├── LICENSE ├── MANIFEST.in ├── README.md ├── docs ├── Makefile ├── codes.rst ├── conf.py ├── faq.rst ├── images │ ├── nnlib_clstm.png │ └── nnlib_cnn.png ├── index.rst ├── install.rst ├── intro.rst ├── links.rst ├── news.rst ├── refs.rst ├── requirements.txt ├── scripts.rst ├── tutorial.rst ├── tutorial_charbaseonehot.rst ├── tutorial_charbaseseq2seq.rst ├── tutorial_dataprep.rst ├── tutorial_dtm.rst ├── tutorial_maxent.rst ├── tutorial_metrics.rst ├── tutorial_nnlib.rst ├── tutorial_spell.rst ├── tutorial_stacking.rst ├── tutorial_sumvec.rst ├── tutorial_textpreprocessing.rst ├── tutorial_topic.rst └── tutorial_wordembed.rst ├── pyproject.toml ├── shorttext ├── __init__.py ├── classifiers │ ├── __init__.py │ ├── bow │ │ ├── __init__.py │ │ ├── maxent │ │ │ ├── MaxEntClassification.py │ │ │ └── __init__.py │ │ └── topic │ │ │ ├── SkLearnClassification.py │ │ │ ├── TopicVectorDistanceClassification.py │ │ │ └── __init__.py │ └── embed │ │ ├── __init__.py │ │ ├── nnlib │ │ ├── VarNNEmbedVecClassification.py │ │ ├── __init__.py │ │ └── frameworks.py │ │ └── sumvec │ │ ├── SumEmbedVecClassification.py │ │ ├── VarNNSumEmbedVecClassification.py │ │ ├── __init__.py │ │ └── frameworks.py ├── cli │ ├── __init__.py │ ├── categorization.py │ └── wordembedsim.py ├── data │ ├── __init__.py │ ├── data_retrieval.py │ └── shorttext_exampledata.csv ├── generators │ ├── __init__.py │ ├── bow │ │ ├── AutoEncodingTopicModeling.py │ │ ├── GensimTopicModeling.py │ │ ├── LatentTopicModeling.py │ │ └── __init__.py │ ├── charbase │ │ ├── __init__.py │ │ └── char2vec.py │ └── seq2seq │ │ ├── __init__.py │ │ ├── charbaseS2S.py │ │ └── s2skeras.py ├── metrics │ ├── __init__.py │ ├── dynprog │ │ ├── __init__.py │ │ ├── dldist.py │ │ ├── jaccard.py │ │ └── lcp.py │ ├── embedfuzzy │ │ ├── __init__.py │ │ └── jaccard.py │ ├── transformers │ │ ├── __init__.py │ │ └── bertscore.py │ └── wasserstein │ │ ├── __init__.py │ │ └── wordmoverdist.py ├── smartload.py ├── spell │ ├── __init__.py │ ├── basespellcorrector.py │ ├── binarize.py │ ├── editor.py │ ├── norvig.py │ └── sakaguchi.py ├── stack │ ├── __init__.py │ └── stacking.py └── utils │ ├── __init__.py │ ├── classification_exceptions.py │ ├── compactmodel_io.py │ ├── dtm.py │ ├── gensim_corpora.py │ ├── kerasmodel_io.py │ ├── misc.py │ ├── nonneg_stopwords.txt │ ├── stopwords.txt │ ├── textpreprocessing.py │ ├── transformers.py │ └── wordembed.py └── test ├── __init__.py ├── test_charonehot.py ├── test_dtm.py ├── test_fuzzylogic.py ├── test_norvigspell.py ├── test_sakaguchispell.py ├── test_stacking.py ├── test_textpreprocessing.py ├── test_var_nn_embedded_vec_classifier.py └── test_wmd.py /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | 4 | shared: &shared 5 | working_directory: ~/shorttext 6 | 7 | steps: 8 | - checkout 9 | 10 | - run: 11 | name: Apt Install 12 | command: | 13 | sudo apt-get update 14 | sudo apt-get install libc6 15 | sudo apt-get install python3-dev 16 | sudo apt-get install -y g++ 17 | 18 | - run: 19 | name: Installing Miniconda and Packages 20 | command: | 21 | pip install --upgrade --user pip 22 | pip install --upgrade --user google-compute-engine 23 | pip install --user . 
24 | 25 | - run: 26 | name: Run Unit Tests 27 | command: | 28 | pip install --user .[test] 29 | pytest 30 | 31 | 32 | jobs: 33 | py39: 34 | <<: *shared 35 | docker: 36 | - image: cimg/python:3.9 37 | 38 | py310: 39 | <<: *shared 40 | docker: 41 | - image: cimg/python:3.10 42 | 43 | py311: 44 | <<: *shared 45 | docker: 46 | - image: cimg/python:3.11 47 | 48 | py312: 49 | <<: *shared 50 | docker: 51 | - image: cimg/python:3.12 52 | 53 | 54 | workflows: 55 | version: 2 56 | build: 57 | jobs: 58 | - py39 59 | - py310 60 | - py311 61 | - py312 62 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Build documentation in the docs/ directory with Sphinx 9 | sphinx: 10 | configuration: docs/conf.py 11 | 12 | build: 13 | os: ubuntu-22.04 14 | tools: 15 | python: "3.10" 16 | 17 | # Build documentation with MkDocs 18 | #mkdocs: 19 | # configuration: mkdocs.yml 20 | 21 | # Optionally build your docs in additional formats such as PDF and ePub 22 | formats: all 23 | 24 | # Optionally set the version of Python and requirements required to build your docs 25 | python: 26 | install: 27 | - requirements: docs/requirements.txt 28 | 29 | # conda environment 30 | #conda: 31 | # environment: environment.yml -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2016 Kwan Yuet Stephen Ho 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include LICENSE 3 | include pyproject.toml 4 | include shorttext/data/shorttext_exampledata.csv 5 | include shorttext/utils/stopwords.txt 6 | include shorttext/utils/nonneg_stopwords.txt 7 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 
5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 21 | 22 | .PHONY: help 23 | help: 24 | @echo "Please use \`make ' where is one of" 25 | @echo " html to make standalone HTML files" 26 | @echo " dirhtml to make HTML files named index.html in directories" 27 | @echo " singlehtml to make a single large HTML file" 28 | @echo " pickle to make pickle files" 29 | @echo " json to make JSON files" 30 | @echo " htmlhelp to make HTML files and a HTML help project" 31 | @echo " qthelp to make HTML files and a qthelp project" 32 | @echo " applehelp to make an Apple Help Book" 33 | @echo " devhelp to make HTML files and a Devhelp project" 34 | @echo " epub to make an epub" 35 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 36 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 37 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 38 | @echo " text to make text files" 39 | @echo " man to make manual pages" 40 | @echo " texinfo to make Texinfo files" 41 | @echo " info to make Texinfo files and run them through makeinfo" 42 | @echo " gettext to make PO message catalogs" 43 | @echo " changes to make an overview of all changed/added/deprecated items" 44 | @echo " xml to make Docutils-native XML files" 45 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 46 | @echo " linkcheck to check all external links for integrity" 47 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 48 | @echo " coverage to run coverage check of the documentation (if enabled)" 49 | 50 | .PHONY: clean 51 | clean: 52 | rm -rf $(BUILDDIR)/* 53 | 54 | .PHONY: html 55 | html: 56 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 57 | @echo 58 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 59 | 60 | .PHONY: dirhtml 61 | dirhtml: 62 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 63 | @echo 64 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 65 | 66 | .PHONY: singlehtml 67 | singlehtml: 68 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 69 | @echo 70 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 71 | 72 | .PHONY: pickle 73 | pickle: 74 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 75 | @echo 76 | @echo "Build finished; now you can process the pickle files." 77 | 78 | .PHONY: json 79 | json: 80 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 81 | @echo 82 | @echo "Build finished; now you can process the JSON files." 
83 | 84 | .PHONY: htmlhelp 85 | htmlhelp: 86 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 87 | @echo 88 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 89 | ".hhp project file in $(BUILDDIR)/htmlhelp." 90 | 91 | .PHONY: qthelp 92 | qthelp: 93 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 94 | @echo 95 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 96 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 97 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/shorttext.qhcp" 98 | @echo "To view the help file:" 99 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/shorttext.qhc" 100 | 101 | .PHONY: applehelp 102 | applehelp: 103 | $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp 104 | @echo 105 | @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." 106 | @echo "N.B. You won't be able to view it unless you put it in" \ 107 | "~/Library/Documentation/Help or install it in your application" \ 108 | "bundle." 109 | 110 | .PHONY: devhelp 111 | devhelp: 112 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 113 | @echo 114 | @echo "Build finished." 115 | @echo "To view the help file:" 116 | @echo "# mkdir -p $$HOME/.local/share/devhelp/shorttext" 117 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/shorttext" 118 | @echo "# devhelp" 119 | 120 | .PHONY: epub 121 | epub: 122 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 123 | @echo 124 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 125 | 126 | .PHONY: latex 127 | latex: 128 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 129 | @echo 130 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 131 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 132 | "(use \`make latexpdf' here to do that automatically)." 133 | 134 | .PHONY: latexpdf 135 | latexpdf: 136 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 137 | @echo "Running LaTeX files through pdflatex..." 138 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 139 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 140 | 141 | .PHONY: latexpdfja 142 | latexpdfja: 143 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 144 | @echo "Running LaTeX files through platex and dvipdfmx..." 145 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 146 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 147 | 148 | .PHONY: text 149 | text: 150 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 151 | @echo 152 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 153 | 154 | .PHONY: man 155 | man: 156 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 157 | @echo 158 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 159 | 160 | .PHONY: texinfo 161 | texinfo: 162 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 163 | @echo 164 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 165 | @echo "Run \`make' in that directory to run these through makeinfo" \ 166 | "(use \`make info' here to do that automatically)." 167 | 168 | .PHONY: info 169 | info: 170 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 171 | @echo "Running Texinfo files through makeinfo..." 172 | make -C $(BUILDDIR)/texinfo info 173 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 
174 | 175 | .PHONY: gettext 176 | gettext: 177 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 178 | @echo 179 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 180 | 181 | .PHONY: changes 182 | changes: 183 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 184 | @echo 185 | @echo "The overview file is in $(BUILDDIR)/changes." 186 | 187 | .PHONY: linkcheck 188 | linkcheck: 189 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 190 | @echo 191 | @echo "Link check complete; look for any errors in the above output " \ 192 | "or in $(BUILDDIR)/linkcheck/output.txt." 193 | 194 | .PHONY: doctest 195 | doctest: 196 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 197 | @echo "Testing of doctests in the sources finished, look at the " \ 198 | "results in $(BUILDDIR)/doctest/output.txt." 199 | 200 | .PHONY: coverage 201 | coverage: 202 | $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage 203 | @echo "Testing of coverage in the sources finished, look at the " \ 204 | "results in $(BUILDDIR)/coverage/python.txt." 205 | 206 | .PHONY: xml 207 | xml: 208 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 209 | @echo 210 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 211 | 212 | .PHONY: pseudoxml 213 | pseudoxml: 214 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 215 | @echo 216 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 217 | -------------------------------------------------------------------------------- /docs/codes.rst: -------------------------------------------------------------------------------- 1 | API 2 | === 3 | 4 | API unlisted in tutorials are listed here. 5 | 6 | Shorttext Models Smart Loading 7 | ------------------------------ 8 | 9 | .. automodule:: shorttext.smartload 10 | :members: 11 | 12 | Supervised Classification using Word Embedding 13 | ---------------------------------------------- 14 | 15 | Module `shorttext.generators.seq2seq.s2skeras` 16 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 17 | 18 | .. automodule:: shorttext.generators.seq2seq.s2skeras 19 | :members: 20 | 21 | 22 | Module `shorttext.classifiers.embed.sumvec.VarNNSumEmbedVecClassification` 23 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 24 | 25 | .. automodule:: shorttext.classifiers.embed.sumvec.VarNNSumEmbedVecClassification 26 | :members: 27 | 28 | 29 | Neural Networks 30 | --------------- 31 | 32 | Module `shorttext.classifiers.embed.sumvec.frameworks` 33 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 34 | 35 | .. automodule:: shorttext.classifiers.embed.sumvec.frameworks 36 | :members: 37 | 38 | 39 | Utilities 40 | --------- 41 | 42 | Module `shorttext.utils.kerasmodel_io` 43 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 44 | 45 | .. automodule:: shorttext.utils.kerasmodel_io 46 | :members: 47 | 48 | Module `shorttext.utils.gensim_corpora` 49 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 50 | 51 | .. automodule:: shorttext.utils.gensim_corpora 52 | :members: 53 | 54 | Module `shorttext.utils.compactmodel_io` 55 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 56 | 57 | .. automodule:: shorttext.utils.compactmodel_io 58 | :members: 59 | 60 | 61 | Metrics 62 | ------- 63 | 64 | Module `shorttext.metrics.dynprog` 65 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 66 | 67 | .. automodule:: shorttext.metrics.dynprog.jaccard 68 | :members: 69 | 70 | .. 
automodule:: shorttext.metrics.dynprog.dldist 71 | :members: 72 | 73 | .. automodule:: shorttext.metrics.dynprog.lcp 74 | :members: 75 | 76 | Module `shorttext.metrics.wasserstein` 77 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 78 | 79 | .. automodule:: shorttext.metrics.wasserstein.wordmoverdist 80 | :members: word_mover_distance_linprog 81 | 82 | Spell Correction 83 | ---------------- 84 | 85 | Module `shorttext.spell` 86 | ^^^^^^^^^^^^^^^^^^^^^^^^ 87 | 88 | .. automodule:: shorttext.spell.basespellcorrector 89 | :members: 90 | 91 | 92 | 93 | 94 | 95 | 96 | Home: :doc:`index` -------------------------------------------------------------------------------- /docs/faq.rst: -------------------------------------------------------------------------------- 1 | Frequently Asked Questions (FAQ) 2 | ================================ 3 | 4 | **Q1. Can we use backends other than TensorFlow?** 5 | 6 | Ans: No. 7 | 8 | 9 | **Q2. Can we use word-embedding algorithms other than Word2Vec?** 10 | 11 | Ans: Yes. Besides Word2Vec, you can use FastText and Poincaré embedding. See: :doc:`tutorial_wordembed` . 12 | 13 | 14 | **Q3. Can this package work on Python 2?** 15 | 16 | Ans: No. 17 | 18 | 19 | 20 | **Q4. How should I cite `shorttext` if I use it in my research?** 21 | 22 | Ans: For the time being, you do not have to cite a particular paper for using this package. 23 | However, if you use any particular function or class, check out the docstring. If there is a paper (or papers) 24 | mentioned, cite those papers. For example, if you use `CNNWordEmbed` in `frameworks 25 | `_, 26 | according to the docstring, cite Yoon Kim's paper. Refer to this documentation for the references too. 27 | 28 | 29 | 30 | **Q5. I am having trouble installing `shorttext` on Google Cloud Platform. What should I do?** 31 | 32 | Ans: The header file "Python.h" is missing. Run `sudo apt-get install python3-dev` in the SSH shell of the VM instance. 33 | 34 | **Q8. My model files were created by `shorttext` version < 2.0.0. How do I make them readable for version >= 2.0.0?** 35 | 36 | Ans: Simply rename those files whose names end with `.h5` so that they end with `.weights.h5`. 37 | 38 | 39 | 40 | Home: :doc:`index` 41 | -------------------------------------------------------------------------------- /docs/images/nnlib_clstm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenhky/PyShortTextCategorization/a7caf4edeb86b3b69a56632d24fa7ee56d12621d/docs/images/nnlib_clstm.png -------------------------------------------------------------------------------- /docs/images/nnlib_cnn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenhky/PyShortTextCategorization/a7caf4edeb86b3b69a56632d24fa7ee56d12621d/docs/images/nnlib_cnn.png -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. shorttext documentation master file, created by 2 | sphinx-quickstart on Fri Nov 11 18:11:01 2016. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Homepage of `shorttext` 7 | ======================= 8 | 9 | This repository is a collection of algorithms for multi-class classification of short texts using Python. 10 | Modules are backward compatible unless otherwise specified.
Feel free to give suggestions or report 11 | issues through the Issue_ tab of the Github_ page. This is a PyPI_ project. It is an open-source 12 | project under the `MIT License 13 | `_ . 14 | 15 | Contents: 16 | 17 | .. toctree:: 18 | :maxdepth: 1 19 | 20 | intro 21 | install 22 | tutorial 23 | scripts 24 | codes 25 | faq 26 | refs 27 | links 28 | news 29 | 30 | .. _Github: https://github.com/stephenhky/PyShortTextCategorization 31 | .. _Issue: https://github.com/stephenhky/PyShortTextCategorization/issues 32 | .. _PyPI: https://pypi.org/project/shorttext/ 33 | 34 | Indices and tables 35 | ================== 36 | 37 | * :ref:`genindex` 38 | * :ref:`modindex` 39 | * :ref:`search` 40 | 41 | -------------------------------------------------------------------------------- /docs/install.rst: -------------------------------------------------------------------------------- 1 | Installation 2 | ============ 3 | 4 | PIP 5 | --- 6 | 7 | Package `shorttext` runs in Python 3.9, 3.10, 3.11, and 3.12. The backend 8 | of keras_ is Tensorflow_. 9 | 10 | To install the package in Linux or OS X, enter the following in the console: 11 | 12 | :: 13 | 14 | pip install shorttext 15 | 16 | It is possible that you have to do it as root, i.e., you have to add ``sudo`` in 17 | front of the command. 18 | 19 | On the other hand, to get the development version, you can install it directly from Github_: 20 | 21 | :: 22 | 23 | pip install git+https://github.com/stephenhky/PyShortTextCategorization@master 24 | 25 | 26 | Backend for Keras 27 | ----------------- 28 | 29 | We use TensorFlow for `keras`. 30 | 31 | Possible Solutions for Installation Failures 32 | -------------------------------------------- 33 | 34 | Most developers can install `shorttext` with the instructions above. If the installation fails, 35 | you may try one (or more) of the following: 36 | 37 | 1. Installing `python3-dev` by typing: 38 | 39 | 40 | :: 41 | 42 | apt-get install python3-dev 43 | 44 | 45 | 46 | 2. Installing `libc6` by entering 47 | 48 | :: 49 | 50 | apt-get install libc6 51 | 52 | 53 | 54 | .. _Github: https://github.com/stephenhky/PyShortTextCategorization 55 | 56 | 57 | Home: :doc:`index` 58 | 59 | .. _Numpy: http://www.numpy.org/ 60 | .. _SciPy: https://www.scipy.org/ 61 | .. _Scikit-Learn: http://scikit-learn.org/stable/ 62 | .. _Tensorflow: https://www.tensorflow.org/ 63 | .. _Theano: http://deeplearning.net/software/theano/ 64 | .. _CNTK: https://github.com/Microsoft/CNTK/wiki 65 | .. _keras: https://keras.io/ 66 | .. _gensim: https://radimrehurek.com/gensim/ 67 | .. _Pandas: http://pandas.pydata.org/ 68 | .. _snowballstemmer: https://github.com/snowballstem/snowball 69 | .. _Joblib: https://joblib.readthedocs.io/en/latest/ -------------------------------------------------------------------------------- /docs/intro.rst: -------------------------------------------------------------------------------- 1 | Introduction 2 | ============ 3 | 4 | The package `shorttext` is a Python package that facilitates supervised and unsupervised 5 | learning for short text categorization. Due to the sparseness of words and 6 | the lack of information carried in the short texts themselves, an intermediate 7 | representation of the texts and documents is needed before they are put into 8 | any classification algorithm. This package facilitates various types 9 | of these representations, including topic modeling and word-embedding algorithms.
10 | 11 | The package `shorttext` runs on Python 3.9, 3.10, 3.11, and 3.12. 12 | 13 | Characteristics: 14 | 15 | - example data provided (including subject keywords and NIH RePORT); (see :doc:`tutorial_dataprep`) 16 | - text preprocessing; (see :doc:`tutorial_textpreprocessing`) 17 | - pre-trained word-embedding support; (see :doc:`tutorial_wordembed`) 18 | - `gensim` topic models (LDA, LSI, Random Projections) and autoencoder; (see :doc:`tutorial_topic`) 19 | - topic model representation supported for supervised learning using `scikit-learn`; (see :doc:`tutorial_topic`) 20 | - cosine distance classification; (see :doc:`tutorial_topic`, :doc:`tutorial_sumvec`) 21 | - neural network classification (including ConvNet, and C-LSTM); (see :doc:`tutorial_nnlib`) 22 | - maximum entropy classification; (see :doc:`tutorial_maxent`) 23 | - metrics of phrases differences, including soft Jaccard score (using Damerau-Levenshtein distance), and Word Mover's distance (WMD); (see :doc:`tutorial_metrics`) 24 | - character-level sequence-to-sequence (seq2seq) learning; (see :doc:`tutorial_charbaseseq2seq`) 25 | - spell correction; (see :doc:`tutorial_spell`) 26 | - Sentence encodings and similarities based on BERT (see :doc:`tutorial_wordembed` and :doc:`tutorial_metrics`). 27 | 28 | Author: Kwan Yuet Stephen Ho (LinkedIn_, ResearchGate_, Twitter_) 29 | Other contributors: `Chinmaya Pancholi `_, `Minseo Kim `_ 30 | 31 | Home: :doc:`index` 32 | 33 | .. _LinkedIn: https://www.linkedin.com/in/kwan-yuet-ho-19882530 34 | .. _ResearchGate: https://www.researchgate.net/profile/Kwan-yuet_Ho 35 | .. _Twitter: https://twitter.com/stephenhky 36 | -------------------------------------------------------------------------------- /docs/links.rst: -------------------------------------------------------------------------------- 1 | Links 2 | ===== 3 | 4 | Project Codes and Package 5 | ------------------------- 6 | 7 | - Github_ 8 | - PyPI_ 9 | 10 | .. _Github: https://github.com/stephenhky/PyShortTextCategorization 11 | 12 | .. _PyPI: https://pypi.org/project/shorttext/ 13 | 14 | Issues 15 | ------ 16 | 17 | To report bugs and issues, please go to Issues_. 18 | 19 | .. _Issues: https://github.com/stephenhky/PyShortTextCategorization/issues 20 | 21 | Gensim Incubator 22 | ---------------- 23 | 24 | Chinmaya Pancholi, a student in Indian Institute of Technology, Kharagpur, is supported 25 | by Google Summer of Code (GSoC) project to support the open-source project for `gensim`. 26 | Part of his project is to employ the wrapping ideas in `shorttext` to integrate `keras`, 27 | `scikit-learn` and `gensim`. 28 | 29 | Chinmaya's blog posts: `https://rare-technologies.com/author/chinmaya/ 30 | `_ 31 | 32 | Chinmaya's proposal for GSoC: `https://github.com/numfocus/gsoc/blob/master/2017/proposals/Chinmaya_Pancholi.md 33 | `_ 34 | 35 | 36 | Blog Entries 37 | ------------ 38 | 39 | "R or Python on Text Mining," *Everything About Data Analytics*, WordPress (2015). [`WordPress 40 | `_] 41 | 42 | "Short Text Categorization using Deep Neural Networks and Word-Embedding Models," *Everything About Data Analytics*, WordPress (2015). [`WordPress 43 | `_] 44 | (A code demonstration can be found in an early version of the Github repository for this package: `here 45 | `_) 46 | 47 | "Toying with Word2Vec," *Everything About Data Analytics*, WordPress (2015). [`WordPress 48 | `_] 49 | 50 | "Probabilistic Theory of Word Embeddings: GloVe," *Everything About Data Analytics*, WordPress (2016). 
[`WordPress 51 | `_] 52 | 53 | "Word-Embedding Algorithms," *Everything About Data Analytics*, WordPress (2016). [`WordPress 54 | `_] 55 | 56 | "Python Package for Short Text Mining," *Everything About Data Analytics*, WordPress (2016). [`WordPress 57 | `_] 58 | 59 | "Short Text Mining using Advanced Keras Layers and Maxent: shorttext 0.4.1," *Everything About Data Analytics*, WordPress (2017). [`WordPress 60 | `_] 61 | 62 | "Word Mover’s Distance as a Linear Programming Problem," *Everything About Data Analytics*, WordPress (2017). [`WordPress 63 | `_] 64 | 65 | "Release of shorttext 0.5.4," *Everything About Data Analytics*, WordPress (2017). [`WordPress 66 | `_] 67 | 68 | "Document-Term Matrix: Text Mining in R and Python," *Everything About Data Analytics*, WordPress (2018). [`WordPress 69 | `_] 70 | 71 | "Package shorttext 1.0.0 Released," Medium (2018). [`Medium 72 | `_] 73 | 74 | Home: :doc:`index` -------------------------------------------------------------------------------- /docs/refs.rst: -------------------------------------------------------------------------------- 1 | References 2 | ========== 3 | 4 | Adam L. Berger, Stephen A. Della Pietra, Vincent J. Della Pietra, "A Maximum Entropy Approach to Natural Language Processing," *Computational Linguistics* 22(1): 39-72 (1996). [`ACM 5 | `_] 6 | 7 | Aurelien Geron, *Hands-On Machine Learning with Scikit-Learn and TensorFlow* (Sebastopol, CA: O'Reilly Media, 2017). [`O\'Reilly 8 | `_] 9 | 10 | Chinmaya Pancholi, "Gensim integration with scikit-learn and Keras," *Google Summer of Codes* (GSoC) proposal (2017). [`Github 11 | `_] 12 | 13 | Chinmaya Pancholi, "Chinmaya’s GSoC 2017 Summary: Integration with sklearn & Keras and implementing fastText," *RaRe Incubator* (September 2, 2017). [`RaRe 14 | `_] 15 | 16 | Christopher Manning, Hinrich Schütze, *Foundations of Statistical Natural Language Processing* (Cambridge, MA: MIT Press, 1999). [`MIT Press 17 | `_] 18 | 19 | Christopher D. Manning, Prabhakar Raghavan, Hinrich Schütze, *Introduction to Information Retrieval* (Cambridge, MA: Cambridge University Press, 2008). [`StanfordNLP 20 | `_] 21 | 22 | Chunting Zhou, Chonglin Sun, Zhiyuan Liu, Francis Lau, "A C-LSTM Neural Network for Text Classification," (arXiv:1511.08630). [`arXiv 23 | `_] 24 | 25 | Daniel E. Russ, Kwan-Yuet Ho, Calvin A. Johnson, Melissa C. Friesen, "Computer-Based Coding of Occupation Codes for Epidemiological Analyses," *2014 IEEE 27th International Symposium on Computer-Based Medical Systems* (CBMS), pp. 347-350. (2014) [`IEEE 26 | `_] 27 | 28 | Daniel E. Russ, Kwan-Yuet Ho, Joanne S. Colt, Karla R. Armenti, Dalsu Baris, Wong-Ho Chow, Faith Davis, Alison Johnson, Mark P. Purdue, Margaret R. Karagas, Kendra Schwartz, Molly Schwenn, Debra T. Silverman, Patricia A. Stewart, Calvin A. Johnson, Melissa C. Friesen, “Computer-based coding of free-text job descriptions to efficiently and reliably incorporate occupational risk factors into large-scale epidemiological studies”, *Occup. Environ. Med.* 73, 417-424 (2016). [`BMJ 29 | `_] 30 | 31 | Daniel Russ, Kwan-yuet Ho, Melissa Friesen, "It Takes a Village To Solve A Problem in Data Science," Data Science Maryland, presentation at Applied Physics Laboratory (APL), Johns Hopkins University, on June 19, 2017. (2017) [`Slideshare 32 | `_] 33 | 34 | David H. Wolpert, "Stacked Generalization," *Neural Netw* 5: 241-259 (1992). 35 | 36 | David M. Blei, "Probabilistic Topic Models," *Communications of the ACM* 55(4): 77-84 (2012). 
[`ACM 37 | `_] 38 | 39 | Francois Chollet, "A ten-minute introduction to sequence-to-sequence learning in Keras," *The Keras Blog*. [`Keras 40 | `_] 41 | 42 | Francois Chollet, "Building Autoencoders in Keras," *The Keras Blog*. [`Keras 43 | `_] 44 | 45 | Hsiang-Fu Yu, Chia-Hua Ho, Yu-Chin Juan, Chih-Jen Lin, "LibShortText: A Library for Short-text Classification." [`NTU 46 | `_] 47 | 48 | Ilya Sutskever, James Martens, Geoffrey Hinton, "Generating Text with Recurrent Neural Networks," *ICML* (2011). [`UToronto 49 | `_] 50 | 51 | Ilya Sutskever, Oriol Vinyals, Quoc V. Le, "Sequence to Sequence Learning with Neural Networks," arXiv:1409.3215 (2014). [`arXiv 52 | `_] 53 | 54 | Jayant Jain, "Implementing Poincaré Embeddings," RaRe Technologies (2017). [`RaRe 55 | `_] 56 | 57 | Jeffrey Pennington, Richard Socher, Christopher D. Manning, “GloVe: Global Vectors for Word Representation,” *Empirical Methods in Natural Language Processing (EMNLP)*, pp. 1532-1543 (2014). [`PDF 58 | `_] 59 | 60 | Keisuke Sakaguchi, Kevin Duh, Matt Post, Benjamin Van Durme, "Robsut Wrod Reocginiton via semi-Character Recurrent Neural Networ," arXiv:1608.02214 (2016). [`arXiv 61 | `_] 62 | 63 | "Keras 2.0 Release Notes." (2017) [`Github 64 | `_] 65 | 66 | Matt J. Kusner, Yu Sun, Nicholas I. Kolkin, Kilian Q. Weinberger, "From Word Embeddings to Document Distances," *ICML* (2015). 67 | 68 | Maximilian Nickel, Douwe Kiela, "Poincaré Embeddings for Learning Hierarchical Representations," arXiv:1705.08039 (2017). [`arXiv 69 | `_] 70 | 71 | Michael Czerny, "Modern Methods for Sentiment Analysis," *District Data Labs (2015). [`DistrictDataLabs 72 | `_] 73 | 74 | M. Paz Sesmero, Agapito I. Ledezma, Araceli Sanchis, "Generating ensembles of heterogeneous classifiers using Stacked Generalization," 75 | *WIREs Data Mining and Knowledge Discovery* 5: 21-34 (2015). 76 | 77 | Nal Kalchbrenner, Edward Grefenstette, Phil Blunsom, "A Convolutional Neural Network for Modelling Sentences," *Proceedings of the 52nd Annual Meeting of the Association for Computational Linguistics*, pp. 655-665 (2014). [`arXiv 78 | `_] 79 | 80 | Oriol Vinyals, Quoc Le, "A Neural Conversational Model," arXiv:1506.05869 (2015). [`arXiv 81 | `_] 82 | 83 | Peter Norvig, "How to write a spell corrector." (2016) [`Norvig 84 | `_] 85 | 86 | Piotr Bojanowski, Edouard Grave, Armand Joulin, Tomas Mikolov, "Enriching Word Vectors with Subword Information," arXiv:1607.04606 (2016). [`arXiv 87 | `_] 88 | 89 | Radim Rehurek, Petr Sojka, "Software Framework for Topic Modelling with Large Corpora," In Proceedings of LREC 2010 workshop New Challenges for NLP Frameworks (2010). [`ResearchGate 90 | `_] 91 | 92 | Sebastian Ruder, "An overview of gradient descent optimization algorithms," blog of Sebastian Ruder, arXiv:1609.04747 (2016). [`Ruder 93 | `_ or `arXiv 94 | `_] 95 | 96 | Tal Perry, "Convolutional Methods for Text," *Medium* (2017). [`Medium 97 | `_] 98 | 99 | Thomas W. Jones, "textmineR: Functions for Text Mining and Topic Modeling," CRAN Project. [`CRAN 100 | `_ or `Github 101 | `_] 102 | 103 | Tomas Mikolov, Kai Chen, Greg Corrado, Jeffrey Dean, “Efficient Estimation of Word Representations in Vector Space,” *ICLR* 2013 (2013). [`arXiv 104 | `_] 105 | 106 | Tom Young, Devamanyu Hazarika, Soujanya Poria, Erik Cambria, "Recent Trends in Deep Learning Based Natural Language Processing," arXiv:1708.02709 (2017). 
[`arXiv 107 | `_] 108 | 109 | Xuan Hieu Phan, Cam-Tu Nguyen, Dieu-Thu Le, Minh Le Nguyen, Susumu Horiguchi, Quang-Thuy Ha, 110 | "A Hidden Topic-Based Framework toward Building Applications with Short Web Documents," 111 | *IEEE Trans. Knowl. Data Eng.* 23(7): 961-976 (2011). 112 | 113 | Xuan Hieu Phan, Le-Minh Nguyen, Susumu Horiguchi, "Learning to Classify Short and Sparse Text & Web withHidden Topics from Large-scale Data Collections," 114 | WWW '08 Proceedings of the 17th international conference on World Wide Web. (2008) [`ACL 115 | `_] 116 | 117 | Yoon Kim, "Convolutional Neural Networks for Sentence Classification," *EMNLP* 2014, 1746-1751 (arXiv:1408.5882). [`arXiv 118 | `_] 119 | 120 | Zackary C. Lipton, John Berkowitz, "A Critical Review of Recurrent Neural Networks for Sequence Learning," arXiv:1506.00019 (2015). [`arXiv 121 | `_] 122 | 123 | 124 | Home: :doc:`index` -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==2.2.6 2 | scipy==1.15.3 3 | joblib==1.5.1 4 | scikit-learn==1.7.0 5 | tensorflow==2.19.0 6 | keras==3.10.0 7 | gensim==4.3.3 8 | pandas==2.3.0 9 | snowballstemmer==3.0.1 10 | transformers==4.52.4 11 | torch==2.7.1 12 | numba==0.61.2 13 | -------------------------------------------------------------------------------- /docs/scripts.rst: -------------------------------------------------------------------------------- 1 | Console Scripts 2 | =============== 3 | 4 | This package provides two scripts. 5 | 6 | The development of the scripts is *not stable* yet, and more scripts will be added. 7 | 8 | ShortTextCategorizerConsole 9 | --------------------------- 10 | 11 | :: 12 | 13 | usage: ShortTextCategorizerConsole [-h] [--wv WV] [--vecsize VECSIZE] 14 | [--topn TOPN] [--inputtext INPUTTEXT] 15 | [--type TYPE] 16 | model_filepath 17 | 18 | Perform prediction on short text with a given trained model. 19 | 20 | positional arguments: 21 | model_filepath Path of the trained (compact) model. 22 | 23 | options: 24 | -h, --help show this help message and exit 25 | --wv WV Path of the pre-trained Word2Vec model. (None if not 26 | needed) 27 | --vecsize VECSIZE Vector dimensions. (Default: 300) 28 | --topn TOPN Number of top-scored results displayed. (Default: 10) 29 | --inputtext INPUTTEXT 30 | single input text for classification. Run console if 31 | set to None. (Default: None) 32 | --type TYPE Type of word-embedding model (default: "word2vec"; 33 | other options: "fasttext", "poincare", 34 | "word2vec_nonbinary", "poincare_binary") 35 | 36 | 37 | ShortTextWordEmbedSimilarity 38 | ---------------------------- 39 | 40 | :: 41 | 42 | usage: ShortTextWordEmbedSimilarity [-h] [--type TYPE] modelpath 43 | 44 | Find the similarities between two short sentences using Word2Vec. 45 | 46 | positional arguments: 47 | modelpath Path of the Word2Vec model 48 | 49 | optional arguments: 50 | -h, --help show this help message and exit 51 | --type TYPE Type of word-embedding model (default: "word2vec"; other 52 | options: "fasttext", "poincare") 53 | 54 | 55 | Home: :doc:`index` 56 | -------------------------------------------------------------------------------- /docs/tutorial.rst: -------------------------------------------------------------------------------- 1 | Tutorial 2 | ======== 3 | 4 | After installation, you are ready to start testing the convenience and power 5 | of the package. 
6 | 7 | Before using, type 8 | 9 | >>> import shorttext 10 | 11 | .. toctree:: 12 | :maxdepth: 2 13 | 14 | tutorial_dataprep 15 | tutorial_textpreprocessing 16 | tutorial_dtm 17 | tutorial_charbaseonehot 18 | tutorial_topic 19 | tutorial_wordembed 20 | tutorial_sumvec 21 | tutorial_nnlib 22 | tutorial_maxent 23 | tutorial_charbaseseq2seq 24 | tutorial_stacking 25 | tutorial_metrics 26 | tutorial_spell 27 | 28 | 29 | Home: :doc:`index` 30 | -------------------------------------------------------------------------------- /docs/tutorial_charbaseonehot.rst: -------------------------------------------------------------------------------- 1 | Character to One-Hot Vector 2 | =========================== 3 | 4 | Since version 0.6.1, the package `shorttext` deals with character-based models. A first important 5 | component of a character-based model is to convert every character to a one-hot vector. We provide a class 6 | :class:`shorttext.generators.SentenceToCharVecEncoder` to deal with this. This class incorporates 7 | the `OneHotEncoder` in `scikit-learn` and `Dictionary` in `gensim`. 8 | 9 | To use this, import the packages first: 10 | 11 | >>> import numpy as np 12 | >>> import shorttext 13 | 14 | Then we incorporate a text file as the source of all characters to be coded. In this case, we choose 15 | the file `big.txt` on Peter Norvig's website: 16 | 17 | >>> from urllib.request import urlopen 18 | >>> textfile = urlopen('http://norvig.com/big.txt') 19 | 20 | Then instantiate the class using the function :func:`shorttext.generators.initSentenceToCharVecEncoder`: 21 | 22 | >>> chartovec_encoder = shorttext.generators.initSentenceToCharVecEncoder(textfile) 23 | 24 | Now, the object `chartovec_encoder` is an instance of :class:`shorttext.generators.SentenceToCharVecEncoder` . The 25 | default signal character is `\n`, which is also encoded, and can be checked by looking at the field: 26 | 27 | >>> chartovec_encoder.signalchar 28 | 29 | We can convert a sentence into a bunch of one-hot vectors in terms of a matrix. For example, 30 | 31 | >>> chartovec_encoder.encode_sentence('Maryland blue crab!', 100) 32 | <1x93 sparse matrix of type '' 33 | with 1 stored elements in Compressed Sparse Column format> 34 | 35 | This outputs a sparse matrix. Depending on your needs, you can add the signal character to the beginning 36 | or the end of the sentence in the output matrix by: 37 | 38 | >>> chartovec_encoder.encode_sentence('Maryland blue crab!', 100, startsig=True, endsig=False) 39 | >>> chartovec_encoder.encode_sentence('Maryland blue crab!', 100, startsig=False, endsig=True) 40 | 41 | We can also convert a list of sentences by 42 | 43 | >>> chartovec_encoder.encode_sentences(sentences, 100, startsig=False, endsig=True, sparse=False) 44 | 45 | You can decide whether or not to output a sparse matrix by specifying the parameter `sparse`. 46 | 47 | 48 | .. automodule:: shorttext.generators.charbase.char2vec 49 | :members: 50 | 51 | 52 | Reference 53 | --------- 54 | 55 | Aurelien Geron, *Hands-On Machine Learning with Scikit-Learn and TensorFlow* (Sebastopol, CA: O'Reilly Media, 2017).
[`O\'Reilly 56 | `_] 57 | 58 | Home: :doc:`index` -------------------------------------------------------------------------------- /docs/tutorial_charbaseseq2seq.rst: -------------------------------------------------------------------------------- 1 | Character-Based Sequence-to-Sequence (seq2seq) Models 2 | ===================================================== 3 | 4 | Since release 0.6.0, `shorttext` supports sequence-to-sequence (seq2seq) learning. While there is a general seq2seq class 5 | behind the scenes, the package provides a character-based seq2seq implementation. 6 | 7 | Creating One-hot Vectors 8 | ------------------------ 9 | 10 | To use it, create an instance of the class :class:`shorttext.generators.SentenceToCharVecEncoder`: 11 | 12 | >>> import numpy as np 13 | >>> import shorttext 14 | >>> from urllib.request import urlopen 15 | >>> chartovec_encoder = shorttext.generators.initSentenceToCharVecEncoder(urlopen('http://norvig.com/big.txt')) 16 | 17 | The above code is the same as :doc:`tutorial_charbaseonehot` . 18 | 19 | .. automodule:: shorttext.generators.charbase.char2vec 20 | :members: initSentenceToCharVecEncoder 21 | 22 | 23 | Training 24 | -------- 25 | 26 | Then we can train the model by creating an instance of :class:`shorttext.generators.CharBasedSeq2SeqGenerator`: 27 | 28 | >>> latent_dim = 100 29 | >>> seq2seqer = shorttext.generators.CharBasedSeq2SeqGenerator(chartovec_encoder, latent_dim, 120) 30 | 31 | And then train this neural network model, where `text` is a string holding the training text (e.g., the content of `big.txt`): 32 | 33 | >>> seq2seqer.train(text, epochs=100) 34 | 35 | This model takes several hours to train on a laptop. 36 | 37 | 38 | .. autoclass:: shorttext.generators.seq2seq.charbaseS2S.CharBasedSeq2SeqGenerator 39 | :members: 40 | 41 | Decoding 42 | -------- 43 | 44 | After training, we can use this class as a generative model 45 | to answer questions, like a chatbot: 46 | 47 | >>> seq2seqer.decode('Happy Holiday!') 48 | 49 | It does not give definite answers because there is stochasticity in the prediction. 50 | 51 | Model I/O 52 | --------- 53 | 54 | This model can be saved by entering: 55 | 56 | >>> seq2seqer.save_compact_model('/path/to/norvigtxt_iter5model.bin') 57 | 58 | And it can be loaded by: 59 | 60 | >>> seq2seqer2 = shorttext.generators.seq2seq.charbaseS2S.loadCharBasedSeq2SeqGenerator('/path/to/norvigtxt_iter5model.bin') 61 | 62 | .. automodule:: shorttext.generators.seq2seq.charbaseS2S 63 | :members: loadCharBasedSeq2SeqGenerator 64 | 65 | 66 | Reference 67 | --------- 68 | 69 | Aurelien Geron, *Hands-On Machine Learning with Scikit-Learn and TensorFlow* (Sebastopol, CA: O'Reilly Media, 2017). [`O\'Reilly 70 | `_] 71 | 72 | Ilya Sutskever, James Martens, Geoffrey Hinton, "Generating Text with Recurrent Neural Networks," *ICML* (2011). [`UToronto 73 | `_] 74 | 75 | Ilya Sutskever, Oriol Vinyals, Quoc V. Le, "Sequence to Sequence Learning with Neural Networks," arXiv:1409.3215 (2014). [`arXiv 76 | `_] 77 | 78 | Oriol Vinyals, Quoc Le, "A Neural Conversational Model," arXiv:1506.05869 (2015). [`arXiv 79 | `_] 80 | 81 | Tom Young, Devamanyu Hazarika, Soujanya Poria, Erik Cambria, "Recent Trends in Deep Learning Based Natural Language Processing," arXiv:1708.02709 (2017). [`arXiv 82 | `_] 83 | 84 | Zackary C. Lipton, John Berkowitz, "A Critical Review of Recurrent Neural Networks for Sequence Learning," arXiv:1506.00019 (2015).
[`arXiv 85 | `_] 86 | 87 | -------------------------------------------------------------------------------- /docs/tutorial_dataprep.rst: -------------------------------------------------------------------------------- 1 | Data Preparation 2 | ================ 3 | 4 | This package deals with short text. While the text data for predictions or 5 | classifications are simply `str` or lists of `str`, the training data does 6 | take a specific format: a `dict`, the Python dictionary (or hash 7 | map). The package provides a few example datasets. 8 | 9 | Example Training Data 1: Subject Keywords 10 | ----------------------------------------- 11 | 12 | The first example dataset is about the subject keywords, which can be loaded by: 13 | 14 | >>> trainclassdict = shorttext.data.subjectkeywords() 15 | 16 | This returns a dictionary, with keys being the labels and the values being lists of 17 | the subject keywords, as below: 18 | 19 | :: 20 | 21 | {'mathematics': ['linear algebra', 'topology', 'algebra', 'calculus', 22 | 'variational calculus', 'functional field', 'real analysis', 'complex analysis', 23 | 'differential equation', 'statistics', 'statistical optimization', 'probability', 24 | 'stochastic calculus', 'numerical analysis', 'differential geometry'], 25 | 'physics': ['renormalization', 'classical mechanics', 'quantum mechanics', 26 | 'statistical mechanics', 'functional field', 'path integral', 27 | 'quantum field theory', 'electrodynamics', 'condensed matter', 28 | 'particle physics', 'topological solitons', 'astrophysics', 29 | 'spontaneous symmetry breaking', 'atomic molecular and optical physics', 30 | 'quantum chaos'], 31 | 'theology': ['divine providence', 'soteriology', 'anthropology', 'pneumatology', 'Christology', 32 | 'Holy Trinity', 'eschatology', 'scripture', 'ecclesiology', 'predestination', 33 | 'divine degree', 'creedal confessionalism', 'scholasticism', 'prayer', 'eucharist']} 34 | 35 | 36 | .. automodule:: shorttext.data.data_retrieval 37 | :members: subjectkeywords 38 | 39 | Example Training Data 2: NIH RePORT 40 | ----------------------------------- 41 | 42 | The second example dataset is from NIH RePORT (Research Portfolio Online Reporting Tools). 43 | The data can be downloaded from its `ExPORTER 44 | `_ page. The current data in this package was directly 45 | adapted from Thomas Jones' `textMineR 46 | `_ R package. 47 | 48 | Enter: 49 | 50 | >>> trainclassdict = shorttext.data.nihreports() 51 | 52 | Upon the installation of the package, the NIH RePORT data are still not 53 | installed, but the first time this function is run, the data will be downloaded from the Internet. 54 | 55 | This will output a similar dictionary with FUNDING_IC (Institutes and Centers in NIH) 56 | as the class labels, and PROJECT_TITLE (titles of the funded projects) 57 | as the short texts under the corresponding labels. This dictionary has 512 projects in total, 58 | randomly drawn from the original data. 59 | 60 | However, there are other configurations: 61 | 62 | .. automodule:: shorttext.data.data_retrieval 63 | :members: nihreports 64 | 65 | 66 | Example Training Data 3: Inaugural Addresses 67 | -------------------------------------------- 68 | 69 | This contains all the Inaugural Addresses of all the Presidents of the United States, from 70 | George Washington to Barack Obama. Upon the installation of the package, the Inaugural Addresses 71 | data are still not installed, but the first time they are requested, they will be downloaded from the Internet.
72 | 73 | The addresses are available publicly, and I extracted them from the `nltk 74 | `_ package. 75 | 76 | Enter: 77 | 78 | >>> trainclassdict = shorttext.data.inaugural() 79 | 80 | .. automodule:: shorttext.data.data_retrieval 81 | :members: inaugural 82 | 83 | 84 | User-Provided Training Data 85 | --------------------------- 86 | 87 | Users can provide their own training data. If it is already in JSON format, it can be loaded easily 88 | with standard Python's `json` package, or by calling: 89 | 90 | >>> trainclassdict = shorttext.data.retrieve_jsondata_as_dict('/path/to/file.json') 91 | 92 | However, if it is in CSV format, it has to obey these rules: 93 | 94 | - there is a heading; and 95 | - there are at least two columns: first the labels, and second the short texts under the labels (everything beyond the second column will be neglected). 96 | 97 | An excerpt of this type of data is as follows: 98 | 99 | :: 100 | 101 | subject,content 102 | mathematics,linear algebra 103 | mathematics,topology 104 | mathematics,algebra 105 | ... 106 | physics,spontaneous symmetry breaking 107 | physics,atomic molecular and optical physics 108 | physics,quantum chaos 109 | ... 110 | theology,divine providence 111 | theology,soteriology 112 | theology,anthropology 113 | 114 | To load this data file, just enter: 115 | 116 | >>> trainclassdict = shorttext.data.retrieve_csvdata_as_dict('/path/to/file.csv') 117 | 118 | .. automodule:: shorttext.data.data_retrieval 119 | :members: retrieve_csvdata_as_dict 120 | 121 | 122 | Home: :doc:`index` 123 | -------------------------------------------------------------------------------- /docs/tutorial_dtm.rst: -------------------------------------------------------------------------------- 1 | Document-Term Matrix 2 | ==================== 3 | 4 | Preparing for the Corpus 5 | ------------------------ 6 | 7 | We can create and handle a document-term matrix (DTM) with `shorttext`. Use the dataset of Presidents' 8 | Inaugural Addresses as an example. 9 | 10 | >>> import shorttext 11 | >>> usprez = shorttext.data.inaugural() 12 | 13 | We have to make each president's address one document to achieve our purpose. Enter this: 14 | 15 | >>> docids = sorted(usprez.keys()) 16 | >>> usprez = [' '.join(usprez[docid]) for docid in docids] 17 | 18 | Now the variable `usprez` is a list of 56 Inaugural Addresses from George Washington (1789) to 19 | Barack Obama (2009), with the IDs stored in `docids`. We apply the standard text preprocessor and 20 | produce a list of lists (of tokens) (or a corpus in `gensim`): 21 | 22 | >>> preprocess = shorttext.utils.standard_text_preprocessor_1() 23 | >>> corpus = [preprocess(address).split(' ') for address in usprez] 24 | 25 | Now the variable `corpus` is a list of lists of tokens. For example, 26 | 27 | >>> corpus[0] # shows all the preprocessed tokens of the first Presidential Inaugural Address 28 | 29 | Using Class `DocumentTermMatrix` 30 | -------------------------------- 31 | 32 | With the corpus ready in this form, we can create a `DocumentTermMatrix` instance for the DTM by: 33 | 34 | >>> usprez_dtm = shorttext.utils.DocumentTermMatrix(corpus, docids=docids) 35 | 36 | ..
autoclass:: shorttext.utils.dtm.DocumentTermMatrix 37 | :members: 38 | 39 | One can get the document frequency of any token (the number of documents that the given 40 | token is in) by: 41 | 42 | >>> usprez_dtm.get_doc_frequency('peopl') # gives 54, the document frequency of the token "peopl" 43 | 44 | or the total term frequencies (the total number of occurrences of the given tokens in all documents) by: 45 | 46 | >>> usprez_dtm.get_total_termfreq('justic') # gives 134.0, the total term frequency of the token "justic" 47 | 48 | or the term frequency for a token in a given document by: 49 | 50 | >>> usprez_dtm.get_termfreq('2009-Obama', 'chang') # gives 2.0 51 | 52 | We can also query the number of occurrences of a particular word in all documents, 53 | stored in a dictionary, by: 54 | 55 | >>> usprez_dtm.get_token_occurences('god') 56 | 57 | Of course, we can always reweigh the counts above (except the document frequency) with 58 | tf-idf by setting `tfidf` to `True` when creating the instance of the class: 59 | 60 | >>> usprez_dtm = shorttext.utils.DocumentTermMatrix(corpus, docids=docids, tfidf=True) 61 | 62 | To save the class, enter: 63 | 64 | >>> usprez_dtm.save_compact_model('/path/to/whatever.bin') 65 | 66 | To load this class later, enter: 67 | 68 | >>> usprez_dtm2 = shorttext.utils.load_DocumentTermMatrix('/path/to/whatever.bin') 69 | 70 | .. automodule:: shorttext.utils.dtm 71 | :members: load_DocumentTermMatrix 72 | 73 | Reference 74 | --------- 75 | 76 | Christopher Manning, Hinrich Schuetze, *Foundations of Statistical Natural Language Processing* (Cambridge, MA: MIT Press, 1999). [`MIT Press 77 | `_] 78 | 79 | "Document-Term Matrix: Text Mining in R and Python," *Everything About Data Analytics*, WordPress (2018). [`WordPress 80 | `_] 81 | 82 | Home: :doc:`index` -------------------------------------------------------------------------------- /docs/tutorial_maxent.rst: -------------------------------------------------------------------------------- 1 | Maximum Entropy (MaxEnt) Classifier 2 | =================================== 3 | 4 | Maxent 5 | ------ 6 | 7 | The maximum entropy (maxent) classifier has been a popular text classifier. It parameterizes the model 8 | to achieve maximum categorical entropy, with the constraint that the probabilities the model 9 | assigns to the training data match the empirical distribution. 10 | 11 | The maxent classifier in `shorttext` is implemented with `keras`. The optimization algorithm is 12 | the Adam optimizer by default, although other gradient-based or momentum-based optimizers 13 | can be used. The traditional methods such as generalized iterative scaling (GIS) or 14 | L-BFGS cannot be used here.
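To make the above concrete, here is a minimal sketch of how a maxent classifier can be expressed in `keras` as a single softmax layer, i.e., a multinomial logistic regression trained with the Adam optimizer and the cross-entropy loss. This sketch is for illustration only and is not the actual implementation inside `shorttext`; the sizes `nb_features` and `nb_classes` are placeholders:

::

    import keras
    from keras import layers

    nb_features = 1000   # dimension of the input feature vectors (placeholder)
    nb_classes = 3       # number of class labels (placeholder)

    # a single dense softmax layer is equivalent to a maxent /
    # multinomial logistic regression model
    maxent_model = keras.Sequential([
        keras.Input(shape=(nb_features,)),
        layers.Dense(nb_classes, activation='softmax'),
    ])
    maxent_model.compile(optimizer='adam', loss='categorical_crossentropy')

Minimizing the cross-entropy on the training data maximizes the likelihood of the training labels under this model, which is what the maxent formulation above amounts to.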
15 | 16 | To use the maxent classifier, import the package: 17 | 18 | >>> import shorttext 19 | >>> from shorttext.classifiers import MaxEntClassifier 20 | 21 | Loading NIH reports as an example: 22 | 23 | >>> classdict = shorttext.data.nihreports() 24 | 25 | The classifier can be instantiated by: 26 | 27 | >>> classifier = MaxEntClassifier() 28 | 29 | Train the classifier: 30 | 31 | >>> classifier.train(classdict, nb_epochs=1000) 32 | 33 | After training, it can be used for classification, for example: 34 | 35 | >>> classifier.score('cancer immunology') # NCI tops the score 36 | >>> classifier.score('children health') # NIAID tops the score 37 | >>> classifier.score('Alzheimer disease and aging') # NIAID tops the score 38 | 39 | To save the model, 40 | 41 | >>> classifier.save_compact_model('/path/to/filename.bin') 42 | 43 | To load the model as a classifier, enter: 44 | 45 | >>> classifier2 = shorttext.classifiers.load_maxent_classifier('/path/to/filename.bin') 46 | 47 | 48 | .. automodule:: shorttext.classifiers.bow.maxent.MaxEntClassification 49 | :members: 50 | 51 | 52 | Reference 53 | --------- 54 | 55 | Adam L. Berger, Stephen A. Della Pietra, Vincent J. Della Pietra, "A Maximum Entropy Approach to Natural Language Processing," *Computational Linguistics* 22(1): 39-72 (1996). [`ACM 56 | `_] 57 | 58 | Daniel E. Russ, Kwan-Yuet Ho, Joanne S. Colt, Karla R. Armenti, Dalsu Baris, Wong-Ho Chow, Faith Davis, Alison Johnson, Mark P. Purdue, Margaret R. Karagas, Kendra Schwartz, Molly Schwenn, Debra T. Silverman, Patricia A. Stewart, Calvin A. Johnson, Melissa C. Friesen, “Computer-based coding of free-text job descriptions to efficiently and reliably incorporate occupational risk factors into large-scale epidemiological studies”, *Occup. Environ. Med.* 73, 417-424 (2016). [`BMJ 59 | `_] 60 | 61 | Daniel Russ, Kwan-yuet Ho, Melissa Friesen, "It Takes a Village To Solve A Problem in Data Science," Data Science Maryland, presentation at Applied Physics Laboratory (APL), Johns Hopkins University, on June 19, 2017. (2017) [`Slideshare 62 | `_] 63 | 64 | Home: :doc:`index` -------------------------------------------------------------------------------- /docs/tutorial_metrics.rst: -------------------------------------------------------------------------------- 1 | Metrics 2 | ======= 3 | 4 | The package `shorttext` provides a few metrics that measure distances of some kind. They are all 5 | under :mod:`shorttext.metrics`. The soft Jaccard score is based on spellings, and the Word Mover's 6 | distance (WMD) on embedded word vectors. 7 | 8 | Edit Distance and Soft Jaccard Score 9 | ------------------------------------ 10 | 11 | Edit distance, or Damerau-Levenshtein distance, measures the differences 12 | between two words due to insertion, deletion, transposition, substitution, etc. 13 | Each of these changes causes a distance of 1. The algorithm is implemented in Cython.
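To see how each of these operations contributes a distance of 1, here is a minimal pure-Python sketch of the underlying dynamic-programming recurrence (the optimal-string-alignment variant); it is for illustration only and is not the package's own Cython implementation:

::

    def dl_distance_sketch(word1, word2):
        # d[i][j] holds the distance between word1[:i] and word2[:j]
        d = [[0] * (len(word2) + 1) for _ in range(len(word1) + 1)]
        for i in range(len(word1) + 1):
            d[i][0] = i          # delete all characters of word1[:i]
        for j in range(len(word2) + 1):
            d[0][j] = j          # insert all characters of word2[:j]
        for i in range(1, len(word1) + 1):
            for j in range(1, len(word2) + 1):
                cost = 0 if word1[i-1] == word2[j-1] else 1
                d[i][j] = min(d[i-1][j] + 1,         # deletion
                              d[i][j-1] + 1,         # insertion
                              d[i-1][j-1] + cost)    # substitution
                if i > 1 and j > 1 and word1[i-1] == word2[j-2] and word1[i-2] == word2[j-1]:
                    d[i][j] = min(d[i][j], d[i-2][j-2] + 1)   # transposition
        return d[len(word1)][len(word2)]

For example, `dl_distance_sketch('diver', 'driver')` gives 1, in agreement with the `damerau_levenshtein` function demonstrated below.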
14 | 15 | First import the package: 16 | 17 | >>> from shorttext.metrics.dynprog.dldist import damerau_levenshtein 18 | >>> from shorttext.metrics.dynprog.lcp import longest_common_prefix 19 | >>> from shorttext.metrics.dynprog import similarity, soft_jaccard_score 20 | 21 | The distance can be calculated by: 22 | 23 | >>> damerau_levenshtein('diver', 'driver') # insertion, gives 1 24 | >>> damerau_levenshtein('driver', 'diver') # deletion, gives 1 25 | >>> damerau_levenshtein('topology', 'tooplogy') # transposition, gives 1 26 | >>> damerau_levenshtein('book', 'blok') # substitution, gives 1 27 | 28 | The longest common prefix finds the length of the common prefix of two words: 29 | 30 | >>> longest_common_prefix('topology', 'topological') # gives 7 31 | >>> longest_common_prefix('police', 'policewoman') # gives 6 32 | 33 | The similarity between words is defined as the larger of the following: 34 | 35 | :math:`s = 1 - \frac{\text{DL distance}}{\max( \text{len}(word1), \text{len}(word2) )}` 36 | and 37 | :math:`s = \frac{\text{longest common prefix}}{\max( \text{len}(word1), \text{len}(word2) )}` 38 | 39 | >>> similarity('topology', 'topological') # gives 0.6363636363636364 40 | >>> similarity('book', 'blok') # gives 0.75 41 | 42 | Given the similarity, we say that the intersection, for example, between 'book' and 'blok', has 0.75 elements, or the 43 | union has 1.25 elements. Then the similarity between two sets of tokens can be measured using the Jaccard index, with these 44 | "soft" numbers of intersection. Therefore, 45 | 46 | >>> soft_jaccard_score(['book', 'seller'], ['blok', 'sellers']) # gives 0.6716417910447762 47 | >>> soft_jaccard_score(['police', 'station'], ['policeman']) # gives 0.2857142857142858 48 | 49 | The functions `damerau_levenshtein` and `longest_common_prefix` are implemented using Cython_ . 50 | (Before release 0.7.2, they were interfaced to Python using SWIG_ (Simplified Wrapper and Interface Generator)). 51 | 52 | 53 | .. automodule:: shorttext.metrics.dynprog.jaccard 54 | :members: similarity, soft_jaccard_score 55 | 56 | 57 | Word Mover's Distance 58 | --------------------- 59 | 60 | Unlike the soft Jaccard score, which bases similarity on the words' spellings, Word Mover's distance (WMD) 61 | uses the embedded word vectors. WMD is a special case of Earth Mover's distance (EMD), or Wasserstein 62 | distance. The calculation of WMD in this package is based on linear programming, and the distance between 63 | words is the Euclidean distance by default (not the cosine distance), but the user can set it accordingly. 64 | 65 | Import the modules, and load the word-embedding models: 66 | 67 | >>> from shorttext.metrics.wasserstein import word_mover_distance 68 | >>> from shorttext.utils import load_word2vec_model 69 | >>> wvmodel = load_word2vec_model('/path/to/model_file.bin') 70 | 71 | Examples: 72 | 73 | >>> word_mover_distance(['police', 'station'], ['policeman'], wvmodel) # gives 3.060708999633789 74 | >>> word_mover_distance(['physician', 'assistant'], ['doctor', 'assistants'], wvmodel) # gives 2.276337146759033 75 | 76 | More examples can be found in this `IPython Notebook 77 | `_ . 78 | 79 | In `gensim`, the Word2Vec model allows the calculation of WMD if the user has installed the package PyEMD_. It is based on the 80 | scale invariant feature transform (SIFT), an algorithm for EMD based on L1-distance (Manhattan distance).
81 | For more details, 82 | please refer to their `tutorial 83 | `_ , and cite the two papers by Ofir Pele and Michael Werman 84 | if it is used. 85 | 86 | .. automodule:: shorttext.metrics.wasserstein.wordmoverdist 87 | :members: word_mover_distance 88 | 89 | Jaccard Index Due to Cosine Distances 90 | ------------------------------------- 91 | 92 | In the above section of edit distance, the Jaccard score was calculated by considering soft membership 93 | using spelling. However, we can also compute the soft membership by cosine similarity with 94 | 95 | >>> from shorttext.utils import load_word2vec_model 96 | >>> wvmodel = load_word2vec_model('/path/to/model_file.bin') 97 | >>> from shorttext.metrics.embedfuzzy import jaccardscore_sents 98 | 99 | For example, the number of words between the set containing 'doctor' and that containing 'physician' 100 | is 0.78060223420956831 (according to Google model), and therefore the Jaccard score is 101 | 102 | :math:`0.78060223420956831 / (2-0.78060223420956831) = 0.6401538990056869` 103 | 104 | And it can be seen by running it: 105 | 106 | >>> jaccardscore_sents('doctor', 'physician', wvmodel) # gives 0.6401538990056869 107 | >>> jaccardscore_sents('chief executive', 'computer cluster', wvmodel) # gives 0.0022515450768836143 108 | >>> jaccardscore_sents('topological data', 'data of topology', wvmodel) # gives 0.67588977344632573 109 | 110 | .. automodule:: shorttext.metrics.embedfuzzy.jaccard 111 | :members: 112 | 113 | 114 | BERTScore 115 | --------- 116 | 117 | BERTScore includes a category of metrics that is based on BERT model. 118 | This metrics measures the similarity between sentences. To use it, 119 | 120 | >>> from shorttext.metrics.transformers import BERTScorer 121 | >>> scorer = BERTScorer() # using default BERT model and tokenizer 122 | >>> scorer.recall_bertscore('The weather is cold.', 'It is freezing.') # 0.7223385572433472 123 | >>> scorer.precision_bertscore('The weather is cold.', 'It is freezing.') # 0.7700849175453186 124 | >>> scorer.f1score_bertscore('The weather is cold.', 'It is freezing.') # 0.7454479746418043 125 | 126 | For BERT models, please refer to :doc:`tutorial_wordembed` for more details. 127 | 128 | .. automodule:: shorttext.metrics.transformers.bertscore 129 | :members: 130 | 131 | Reference 132 | --------- 133 | 134 | "Damerau-Levenshtein Distance." [`Wikipedia 135 | `_] 136 | 137 | "Jaccard index." [`Wikipedia 138 | `_] 139 | 140 | Daniel E. Russ, Kwan-Yuet Ho, Calvin A. Johnson, Melissa C. Friesen, "Computer-Based Coding of Occupation Codes for Epidemiological Analyses," *2014 IEEE 27th International Symposium on Computer-Based Medical Systems* (CBMS), pp. 347-350. (2014) [`IEEE 141 | `_] 142 | 143 | Matt J. Kusner, Yu Sun, Nicholas I. Kolkin, Kilian Q. Weinberger, "From Word Embeddings to Document Distances," *ICML* (2015). 144 | 145 | Ofir Pele, Michael Werman, "A linear time histogram metric for improved SIFT matching," *Computer Vision - ECCV 2008*, 495-508 (2008). [`ACM 146 | `_] 147 | 148 | Ofir Pele, Michael Werman, "Fast and robust earth mover's distances," *Proc. 2009 IEEE 12th Int. Conf. on Computer Vision*, 460-467 (2009). [`IEEE 149 | `_] 150 | 151 | Tianyi Zhang, Varsha Kishore, Felix Wu, Kilian Q. Weinberger, Yoav Artzi, 152 | "BERTScore: Evaluating Text Generation with BERT," arXiv:1904.09675 (2019). [`arXiv 153 | `_] 154 | 155 | "Word Mover’s Distance as a Linear Programming Problem," *Everything About Data Analytics*, WordPress (2017). 
[`WordPress 156 | `_] 157 | 158 | 159 | Home: :doc:`index` 160 | 161 | .. _SWIG: http://www.swig.org/ 162 | .. _PyEMD: https://github.com/wmayner/pyemd 163 | .. _Cython: http://cython.org/ -------------------------------------------------------------------------------- /docs/tutorial_nnlib.rst: -------------------------------------------------------------------------------- 1 | Deep Neural Networks with Word-Embedding 2 | ======================================== 3 | 4 | Wrapper for Neural Networks for Word-Embedding Vectors 5 | ------------------------------------------------------ 6 | 7 | In this package, there is a class that serves a wrapper for various neural network algorithms 8 | for supervised short text categorization: 9 | :class:`shorttext.classifiers.VarNNEmbeddedVecClassifier`. 10 | Each class label has a few short sentences, where each token is converted 11 | to an embedded vector, given by a pre-trained word-embedding model (e.g., Google Word2Vec model). 12 | The sentences are represented by a matrix, or rank-2 array. 13 | The type of neural network has to be passed when training, and it has to be of 14 | type :class:`keras.models.Sequential`. The number of outputs of the models has to match 15 | the number of class labels in the training data. 16 | To perform prediction, the input short sentences is converted to a unit vector 17 | in the same way. The score is calculated according to the trained neural network model. 18 | 19 | Some of the neural networks can be found within the module :module:`shorttext.classifiers.embed.nnlib.frameworks` 20 | and they are good for short text or document classification. Of course, users can supply their 21 | own neural networks, written in `keras`. 22 | 23 | A pre-trained Google Word2Vec model can be downloaded `here 24 | `_, 25 | and a pre-trained Facebook FastText model can be downloaded `here 26 | `_. 27 | 28 | 29 | See: :doc:`tutorial_wordembed` . 30 | 31 | Import the package: 32 | 33 | >>> import shorttext 34 | 35 | To load the Word2Vec model, 36 | 37 | >>> wvmodel = shorttext.utils.load_word2vec_model('/path/to/GoogleNews-vectors-negative300.bin.gz') 38 | 39 | Then load the training data 40 | 41 | >>> trainclassdict = shorttext.data.subjectkeywords() 42 | 43 | Then we choose a neural network. We choose ConvNet: 44 | 45 | >>> kmodel = shorttext.classifiers.frameworks.CNNWordEmbed(len(trainclassdict.keys()), vecsize=300) 46 | 47 | Initialize the classifier: 48 | 49 | >>> classifier = shorttext.classifiers.VarNNEmbeddedVecClassifier(wvmodel) 50 | 51 | .. 
autoclass:: shorttext.classifiers.embed.nnlib.VarNNEmbedVecClassification.VarNNEmbeddedVecClassifier 52 | :members: 53 | 54 | 55 | Then train the classifier: 56 | 57 | >>> classifier.train(trainclassdict, kmodel) 58 | Epoch 1/10 59 | 45/45 [==============================] - 0s - loss: 1.0578 60 | Epoch 2/10 61 | 45/45 [==============================] - 0s - loss: 0.5536 62 | Epoch 3/10 63 | 45/45 [==============================] - 0s - loss: 0.3437 64 | Epoch 4/10 65 | 45/45 [==============================] - 0s - loss: 0.2282 66 | Epoch 5/10 67 | 45/45 [==============================] - 0s - loss: 0.1658 68 | Epoch 6/10 69 | 45/45 [==============================] - 0s - loss: 0.1273 70 | Epoch 7/10 71 | 45/45 [==============================] - 0s - loss: 0.1052 72 | Epoch 8/10 73 | 45/45 [==============================] - 0s - loss: 0.0961 74 | Epoch 9/10 75 | 45/45 [==============================] - 0s - loss: 0.0839 76 | Epoch 10/10 77 | 45/45 [==============================] - 0s - loss: 0.0743 78 | 79 | Then the model is ready for classification, like: 80 | 81 | >>> classifier.score('artificial intelligence') 82 | {'mathematics': 0.57749695, 'physics': 0.33749574, 'theology': 0.085007325} 83 | 84 | The trained model can be saved: 85 | 86 | >>> classifier.save_compact_model('/path/to/nnlibvec_convnet_subdata.bin') 87 | 88 | To load it, enter: 89 | 90 | >>> classifier2 = shorttext.classifiers.load_varnnlibvec_classifier(wvmodel, '/path/to/nnlibvec_convnet_subdata.bin') 91 | 92 | .. automodule:: shorttext.classifiers.embed.nnlib.VarNNEmbedVecClassification 93 | :members: load_varnnlibvec_classifier 94 | 95 | 96 | Provided Neural Networks 97 | ------------------------ 98 | 99 | There are three neural networks available in this package for the use in 100 | :class:`shorttext.classifiers.VarNNEmbeddedVecClassifier`, 101 | and they are available in the module `shorttext.classifiers.frameworks`. 102 | 103 | .. automodule:: shorttext.classifiers.embed.nnlib.frameworks 104 | :members: 105 | 106 | 107 | ConvNet (Convolutional Neural Network) 108 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 109 | 110 | This neural network for supervised learning is using convolutional neural network (ConvNet), 111 | as demonstrated in Kim's paper. 112 | 113 | .. image:: images/nnlib_cnn.png 114 | 115 | The function in the frameworks returns a :class:`keras.models.Sequential` or :class:`keras.models.Model`. Its input parameters are: 116 | 117 | The parameter `maxlen` defines the maximum length of the sentences. If the sentence has less than `maxlen` 118 | words, then the empty words will be filled with zero vectors. 119 | 120 | >>> kmodel = fr.CNNWordEmbed(len(trainclassdict.keys()), vecsize=wvmodel.vector_size) 121 | 122 | Double ConvNet 123 | ^^^^^^^^^^^^^^ 124 | 125 | This neural network is nothing more than two ConvNet layers. The function in the frameworks returns a :class:`keras.models.Sequential` or :class:`keras.models.Model`. Its input parameters are: 126 | 127 | The parameter `maxlen` defines the maximum length of the sentences. If the sentence has less than `maxlen` 128 | words, then the empty words will be filled with zero vectors. 129 | 130 | >>> kmodel = fr.DoubleCNNWordEmbed(len(trainclassdict.keys()), vecsize=wvmodel.vector_size) 131 | 132 | C-LSTM (Convolutional Long Short-Term Memory) 133 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 134 | 135 | This neural network for supervised learning is using C-LSTM, according to the paper 136 | written by Zhou *et. 
al.* It is a neural network with ConvNet as the first layer, 137 | and then followed by LSTM (long short-term memory), a type of recurrent neural network (RNN). 138 | 139 | .. image:: images/nnlib_clstm.png 140 | 141 | The function in the frameworks returns a :class:`keras.models.Sequential` or :class:`keras.models.Model`. 142 | 143 | The parameter `maxlen` defines the maximum length of the sentences. If the sentence has less than `maxlen` 144 | words, then the empty words will be filled with zero vectors. 145 | 146 | >>> kmodel = fr.CLSTMWordEmbed(len(trainclassdict.keys()), vecsize=wvmodel.vector_size) 147 | 148 | User-Defined Neural Network 149 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^ 150 | 151 | Users can define their own neural network for use in the classifier wrapped by 152 | :class:`shorttext.classifiers.VarNNEmbeddedVecClassifier` 153 | as long as the following criteria are met: 154 | 155 | - the input matrix is :class:`numpy.ndarray`, and of shape `(maxlen, vecsize)`, where 156 | `maxlen` is the maximum length of the sentence, and `vecsize` is the number of dimensions 157 | of the embedded vectors. The output is a one-dimensional array, of size equal to 158 | the number of classes provided by the training data. The order of the class labels is assumed 159 | to be the same as the order of the given training data (stored as a Python dictionary). 160 | 161 | Putting Word2Vec Model As an Input Keras Layer (Deprecated) 162 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 163 | 164 | This functionality is removed since release 0.5.11, due to the following reasons: 165 | 166 | * `keras` changed its code that produces this bug; 167 | * the layer is consuming memory; 168 | * only Word2Vec is supported; and 169 | * the results are incorrect. 170 | 171 | Reference 172 | --------- 173 | 174 | Chunting Zhou, Chonglin Sun, Zhiyuan Liu, Francis Lau, "A C-LSTM Neural Network for Text Classification," (arXiv:1511.08630). [`arXiv 175 | `_] 176 | 177 | "CS231n Convolutional Neural Networks for Visual Recognition," Stanford Online Course. [`link 178 | `_] 179 | 180 | Nal Kalchbrenner, Edward Grefenstette, Phil Blunsom, "A Convolutional Neural Network for Modelling Sentences," *Proceedings of the 52nd Annual Meeting of the Association for Computational Linguistics*, pp. 655-665 (2014). [`arXiv 181 | `_] 182 | 183 | Tal Perry, "Convolutional Methods for Text," *Medium* (2017). [`Medium 184 | `_] 185 | 186 | Yoon Kim, "Convolutional Neural Networks for Sentence Classification," *EMNLP* 2014, 1746-1751 (arXiv:1408.5882). [`arXiv 187 | `_] 188 | 189 | Zackary C. Lipton, John Berkowitz, "A Critical Review of Recurrent Neural Networks for Sequence Learning," arXiv:1506.00019 (2015). [`arXiv 190 | `_] 191 | 192 | Home: :doc:`index` -------------------------------------------------------------------------------- /docs/tutorial_spell.rst: -------------------------------------------------------------------------------- 1 | Spell Correctors 2 | ================ 3 | 4 | This package supports the use of spell correctors, because typos are very common in relatively short text data. 5 | 6 | There are two types of spell correctors provided: the one described by Peter Norvig (using n-grams Bayesian method), 7 | and another by Keisuke Sakaguchi and his colleagues (using semi-character level recurrent neural network). 8 | 9 | >>> import shorttext 10 | 11 | We use the Norvig's training corpus as an example. 
To load it, 12 | 13 | >>> from urllib.request import urlopen 14 | >>> text = urlopen('https://norvig.com/big.txt').read() 15 | 16 | The developer just has to instantiate the spell corrector, and then train it with a corpus to get a correction model. 17 | Then one can use it for correction. 18 | 19 | Norvig 20 | ------ 21 | 22 | Peter Norvig described a spell corrector based on a Bayesian approach and edit distance. You can refer to his blog for 23 | more information. 24 | 25 | >>> norvig_corrector = shorttext.spell.NorvigSpellCorrector() 26 | >>> norvig_corrector.train(text) 27 | >>> norvig_corrector.correct('oranhe') # gives "orange" 28 | 29 | .. automodule:: shorttext.spell.norvig 30 | :members: 31 | 32 | 33 | 34 | Sakaguchi (SCRNN - semi-character recurrent neural network) 35 | ----------------------------------------------------------- 36 | 37 | Keisuke Sakaguchi and his colleagues developed this spell corrector with the insight that most typos happen 38 | within the interior characters of a word. They developed a recurrent neural network that models the possible changes within the spellings. There are 39 | seven modes: 40 | 41 | - JUMBLE-WHOLE 42 | - JUMBLE-BEG 43 | - JUMBLE-END 44 | - JUMBLE-INT 45 | - NOISE-INSERT 46 | - NOISE-DELETE 47 | - NOISE-REPLACE 48 | 49 | The original intent of their work was not to invent a new spell corrector but to study the "Cmabrigde Uinervtisy" effect, 50 | but it is nice to see how it can be implemented as a spell corrector. 51 | 52 | >>> scrnn_corrector = shorttext.spell.SCRNNSpellCorrector('JUMBLE-WHOLE') 53 | >>> scrnn_corrector.train(text) 54 | >>> scrnn_corrector.correct('oranhe') # gives "orange" 55 | 56 | We can persist the SCRNN corrector for future use: 57 | 58 | >>> scrnn_corrector.save_compact_model('/path/to/spellscrnn.bin') 59 | 60 | To load, 61 | 62 | >>> corrector = shorttext.spell.loadSCRNNSpellCorrector('/path/to/spellscrnn.bin') 63 | 64 | .. automodule:: shorttext.spell.sakaguchi 65 | :members: 66 | 67 | 68 | Reference 69 | --------- 70 | 71 | Keisuke Sakaguchi, Kevin Duh, Matt Post, Benjamin Van Durme, "Robsut Wrod Reocginiton via semi-Character Recurrent Neural Network," arXiv:1608.02214 (2016). [`arXiv 72 | `_] 73 | 74 | Peter Norvig, "How to write a spell corrector." (2016) [`Norvig 75 | `_] 76 | -------------------------------------------------------------------------------- /docs/tutorial_stacking.rst: -------------------------------------------------------------------------------- 1 | Stacked Generalization 2 | ====================== 3 | 4 | "Stacking generates the members of the stacking ensemble using several learning algorithms and subsequently 5 | uses another algorithm to learn how to combine their outputs." In other words, it takes the classification results 6 | of several classifiers and merges them into one final prediction. 7 | 8 | Stacking is most commonly implemented using logistic regression. 9 | Suppose there are *K* classifiers, and *l* output labels. Then the stacked generalization 10 | is this logistic model: 11 | 12 | :math:`P ( y=c | x) = \frac{1}{\exp\left( - \sum_{k=1}^{K} w_{kc} x_{kc} + b_c \right) + 1}` 13 | 14 | Here we demonstrate stacking with two classifiers. 15 | 16 | Import the package, and employ the subject dataset as the training dataset. 17 | 18 | >>> import shorttext 19 | >>> subdict = shorttext.data.subjectkeywords() 20 | 21 | Train a C-LSTM model.
22 | 23 | >>> wvmodel = shorttext.utils.load_word2vec_model('/path/to/GoogleNews-vectors-negative300.bin.gz') 24 | >>> clstm_nnet = shorttext.classifiers.frameworks.CLSTMWordEmbed(len(subdict)) 25 | >>> clstm_classifier = shorttext.classifiers.VarNNEmbeddedVecClassifier(wvmodel) 26 | >>> clstm_classifier.train(subdict, clstm_nnet) 27 | 28 | A test of its classification: 29 | 30 | >>> clstm_classifier.score('linear algebra') 31 | {'mathematics': 1.0, 'physics': 3.3643366e-10, 'theology': 1.0713742e-13} 32 | >>> clstm_classifier.score('topological soliton') 33 | {'mathematics': 2.0036438e-11, 'physics': 1.0, 'theology': 4.4903334e-14} 34 | 35 | And we train an SVM, with topic vectors as the input vectors. The topic model is LDA with 128 topics. 36 | 37 | >>> # train the LDA topic model 38 | >>> lda128 = shorttext.classifiers.LDAModeler() 39 | >>> lda128.train(subdict, 128) 40 | >>> # train the SVM classifier 41 | >>> from sklearn.svm import SVC 42 | >>> lda128_svm_classifier = shorttext.classifiers.TopicVectorSkLearnClassifier(lda128, SVC()) 43 | >>> lda128_svm_classifier.train(subdict) 44 | 45 | A test of its classification: 46 | 47 | >>> lda128_svm_classifier.score('linear algebra') 48 | {'mathematics': 1.0, 'physics': 0.0, 'theology': 0.0} 49 | >>> lda128_svm_classifier.score('topological soliton') 50 | {'mathematics': 0.0, 'physics': 1.0, 'theology': 0.0} 51 | 52 | Then we can implement the stacked generalization using logistic regression by calling: 53 | 54 | >>> stacker = shorttext.stack.LogisticStackedGeneralization(intermediate_classifiers={'clstm': clstm_classifier, 'lda128': lda128_svm_classifier}) 55 | >>> stacker.train(subdict) 56 | 57 | Now the model is ready. As a result, we can do the stacked classification: 58 | 59 | >>> stacker.score('linear algebra') 60 | {'mathematics': 0.55439126, 'physics': 0.036988281, 'theology': 0.039665185} 61 | >>> stacker.score('quantum mechanics') 62 | {'mathematics': 0.059210967, 'physics': 0.55031472, 'theology': 0.04532773} 63 | >>> stacker.score('topological dynamics') 64 | {'mathematics': 0.17244603, 'physics': 0.19720334, 'theology': 0.035309207} 65 | >>> stacker.score('christology') 66 | {'mathematics': 0.094574735, 'physics': 0.053406414, 'theology': 0.3797417} 67 | 68 | The stacked generalization can be saved by calling: 69 | 70 | >>> stacker.save_compact_model('/path/to/logitmodel.bin') 71 | 72 | This only saves the stacked generalization model, but not the intermediate classifiers. 73 | The reason for this is for allowing flexibility for users to supply their own algorithms, 74 | as long as they have the `score` functions which output the same way as the classifiers 75 | offered in this package. To load them, initialize it in the same way: 76 | 77 | >>> stacker2 = shorttext.stack.LogisticStackedGeneralization(intermediate_classifiers={'clstm': clstm_classifier, 'lda128': lda128_svm_classifier}) 78 | >>> stacker2.load_compact_model('/path/to/logitmodel.bin') 79 | 80 | 81 | .. automodule:: shorttext.stack.stacking 82 | :members: 83 | 84 | 85 | Reference 86 | --------- 87 | 88 | "Combining the Best of All Worlds," *Everything About Data Analytics*, WordPress (2016). [`WordPress 89 | `_] 90 | 91 | David H. Wolpert, "Stacked Generalization," *Neural Netw* 5: 241-259 (1992). 92 | 93 | M. Paz Sesmero, Agapito I. Ledezma, Araceli Sanchis, "Generating ensembles of heterogeneous classifiers using Stacked Generalization," 94 | *WIREs Data Mining and Knowledge Discovery* 5: 21-34 (2015). 
95 | 96 | Home: :doc:`index` -------------------------------------------------------------------------------- /docs/tutorial_sumvec.rst: -------------------------------------------------------------------------------- 1 | Word-Embedding Cosine Similarity Classifier 2 | =========================================== 3 | 4 | Sum of Embedded Vectors 5 | ----------------------- 6 | 7 | Given a pre-trained word-embedding models like Word2Vec, a classifier 8 | based on cosine similarities can be built, which is 9 | :class:`shorttext.classifiers.SumEmbeddedVecClassifier`. 10 | In training the data, 11 | the embedded vectors in every word in that class are averaged. The 12 | score for a given text to each class is the cosine similarity between the averaged 13 | vector of the given text and the precalculated vector of that class. 14 | 15 | A pre-trained Google Word2Vec model can be downloaded `here 16 | `_. 17 | 18 | See: :doc:`tutorial_wordembed` . 19 | 20 | Import the package: 21 | 22 | >>> import shorttext 23 | 24 | To load the Word2Vec model, 25 | 26 | >>> from shorttext.utils import load_word2vec_model 27 | >>> wvmodel = load_word2vec_model('/path/to/GoogleNews-vectors-negative300.bin.gz') 28 | 29 | Then we load a set of data: 30 | 31 | >>> nihtraindata = shorttext.data.nihreports(sample_size=None) 32 | 33 | Then initialize the classifier: 34 | 35 | >>> classifier = shorttext.classifiers.SumEmbeddedVecClassifier(wvmodel) # for Google model, the vector size is 300 (default: 100) 36 | >>> classifier.train(nihtraindata) 37 | 38 | This classifier takes relatively little time to train compared with others 39 | in this package. Then we can perform classification: 40 | 41 | >>> classifier.score('bioinformatics') 42 | 43 | Or the result can be sorted and only the five top-scored results are displayed: 44 | 45 | >>> sorted(classifier.score('stem cell research').items(), key=lambda item: item[1], reverse=True)[:5] 46 | [('NIGMS', 0.44962596182682935), 47 | ('NIAID', 0.4494126990050461), 48 | ('NINDS', 0.43435236806719524), 49 | ('NIDCR', 0.43042338197002483), 50 | ('NHGRI', 0.42878346869968731)] 51 | >>> sorted(classifier.score('bioinformatics').items(), key=lambda item: item[1], reverse=True)[:5] 52 | [('NHGRI', 0.54200061864847038), 53 | ('NCATS', 0.49097267547279988), 54 | ('NIGMS', 0.47818129591411118), 55 | ('CIT', 0.46874987052158501), 56 | ('NLM', 0.46869259072562974)] 57 | >>> sorted(classifier.score('cancer immunotherapy').items(), key=lambda item: item[1], reverse=True)[:5] 58 | [('NCI', 0.53734097785976076), 59 | ('NIAID', 0.50616582142027433), 60 | ('NIDCR', 0.48596330887674788), 61 | ('NIDDK', 0.46875755765903215), 62 | ('NCCAM', 0.4642233792198418)] 63 | 64 | The trained model can be saved: 65 | 66 | >>> classifier.save_compact_model('/path/to/sumvec_nihdata_model.bin') 67 | 68 | And with the same pre-trained Word2Vec model, this classifier can be loaded: 69 | 70 | >>> classifier2 = shorttext.classifiers.load_sumword2vec_classifier(wvmodel, '/path/to/sumvec_nihdata_model.bin') 71 | 72 | .. 
autoclass:: shorttext.classifiers.embed.sumvec.SumEmbedVecClassification.SumEmbeddedVecClassifier 73 | :members: 74 | 75 | 76 | Appendix: Model I/O in Previous Versions 77 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 78 | 79 | In previous versions of `shorttext`, :class:`shorttext.classifiers.SumEmbeddedVecClassifier` has a `savemodel` method, 80 | which runs as follow: 81 | 82 | >>> classifier.savemodel('/path/to/nihdata') 83 | 84 | This produces the following file for this model: 85 | 86 | :: 87 | 88 | /path/to/nihdata_embedvecdict.pkl 89 | 90 | It can be loaded by: 91 | 92 | >>> classifier2 = shorttext.classifiers.load_sumword2vec_classifier(wvmodel, '/path/to/nihdata', compact=False) 93 | 94 | Reference 95 | --------- 96 | 97 | Michael Czerny, "Modern Methods for Sentiment Analysis," *District Data Labs (2015). [`DistrictDataLabs 98 | `_] 99 | 100 | Home: :doc:`index` -------------------------------------------------------------------------------- /docs/tutorial_textpreprocessing.rst: -------------------------------------------------------------------------------- 1 | Text Preprocessing 2 | ================== 3 | 4 | Standard Preprocessor 5 | --------------------- 6 | 7 | When the bag-of-words (BOW) model is used to represent the content, it is essential to 8 | specify how the text is preprocessed before it is passed to the trainers or the 9 | classifiers. 10 | 11 | This package provides a standard way of text preprocessing, which goes through the 12 | following steps: 13 | 14 | - removing special characters, 15 | - removing numerals, 16 | - converting all alphabets to lower cases, 17 | - removing stop words, and 18 | - stemming the words (using Snowball Porter stemmer). 19 | 20 | To do this, load the preprocesser generator: 21 | 22 | >>> from shorttext.utils import standard_text_preprocessor_1 23 | 24 | Then define the preprocessor, a function, by just calling: 25 | 26 | >>> preprocessor1 = standard_text_preprocessor_1() 27 | 28 | .. automodule:: shorttext.utils.textpreprocessing 29 | :members: standard_text_preprocessor_1 30 | 31 | It is a function that perform the preprocessing in the steps above: 32 | 33 | >>> preprocessor1('Maryland Blue Crab') # output: 'maryland blue crab' 34 | >>> preprocessor1('filing electronic documents and goes home. eat!!!') # output: 'file electron document goe home eat' 35 | 36 | Customized Text Preprocessor 37 | ---------------------------- 38 | 39 | The standard preprocessor is good for many general natural language processing tasks, 40 | but some users may want to define their own preprocessors for their own purposes. 41 | This preprocessor is used in topic modeling, and is desired to be *a function that takes 42 | a string, and returns a string*. 43 | 44 | If the user wants to develop a preprocessor that contains a few steps, he can make it by providing 45 | the pipeline, which is a list of functions that input a string and return a string. For example, 46 | let's develop a preprocessor that 1) convert it to base form if it is a verb, or keep it original; 47 | 2) convert it to upper case; and 3) tag the number of characters after each token. 48 | 49 | Load the function that generates the preprocessor function: 50 | 51 | >>> from shorttext.utils import text_preprocessor 52 | 53 | Initialize a WordNet lemmatizer using `nltk`: 54 | 55 | >>> from nltk.stem import WordNetLemmatizer 56 | >>> lemmatizer = WordNetLemmatizer() 57 | 58 | Define the pipeline. 
Functions for each of the steps are: 59 | 60 | >>> step1fcn = lambda s: ' '.join([lemmatizer.lemmatize(s1) for s1 in s.split(' ')]) 61 | >>> step2fcn = lambda s: s.upper() 62 | >>> step3fcn = lambda s: ' '.join([s1+'-'+str(len(s1)) for s1 in s.split(' ')]) 63 | 64 | Then the pipeline is: 65 | 66 | >>> pipeline = [step1fcn, step2fcn, step3fcn] 67 | 68 | The preprocessor function can be generated with the defined pipeline: 69 | 70 | >>> preprocessor2 = text_preprocessor(pipeline) 71 | 72 | The function `preprocessor2` is a function that input a string and returns a string. 73 | Some examples are: 74 | 75 | >>> preprocessor2('Maryland blue crab in Annapolis') # output: 'MARYLAND-8 BLUE-4 CRAB-4 IN-2 ANNAPOLIS-9' 76 | >>> preprocessor2('generative adversarial networks') # output: 'GENERATIVE-10 ADVERSARIAL-11 NETWORK-7' 77 | 78 | .. automodule:: shorttext.utils.textpreprocessing 79 | :members: text_preprocessor 80 | 81 | Tokenization 82 | ------------ 83 | 84 | Users are free to choose any tokenizer they wish. In `shorttext`, the tokenizer is 85 | simply the space delimiter, and can be called: 86 | 87 | >>> shorttext.utils.tokenize('Maryland blue crab') # output: ['Maryland', 'blue', 'crab'] 88 | 89 | Reference 90 | --------- 91 | 92 | Christopher Manning, Hinrich Schuetze, *Foundations of Statistical Natural Language Processing* (Cambridge, MA: MIT Press, 1999). [`MIT Press 93 | `_] 94 | 95 | "R or Python on Text Mining," *Everything About Data Analytics*, WordPress (2015). [`WordPress 96 | `_] 97 | 98 | Home: :doc:`index` -------------------------------------------------------------------------------- /docs/tutorial_wordembed.rst: -------------------------------------------------------------------------------- 1 | Word Embedding Models 2 | ===================== 3 | 4 | Word2Vec 5 | -------- 6 | 7 | The most commonly used word-embedding model is Word2Vec. Its model can be downloaded from 8 | their page. To load the model, call: 9 | 10 | >>> import shorttext 11 | >>> wvmodel = shorttext.utils.load_word2vec_model('/path/to/GoogleNews-vectors-negative300.bin.gz') 12 | 13 | It is a binary file, and the default is set to be `binary=True`. 14 | 15 | .. automodule:: shorttext.utils.wordembed 16 | :members: load_word2vec_model 17 | 18 | It is equivalent to calling, 19 | 20 | >>> import gensim 21 | >>> wvmodel = gensim.models.KeyedVectors.load_word2vec_format('/path/to/GoogleNews-vectors-negative300.bin.gz', binary=True) 22 | 23 | 24 | Word2Vec is a neural network model that embeds words into semantic vectors that carry semantic meaning. 25 | It is easy to extract the vector of a word, like for the word 'coffee': 26 | 27 | >>> wvmodel['coffee'] # an ndarray for the word will be output 28 | 29 | One can find the most similar words to 'coffee' according to this model: 30 | 31 | >>> wvmodel.most_similar('coffee') 32 | 33 | which outputs: 34 | 35 | :: 36 | 37 | [(u'coffees', 0.721267819404602), 38 | (u'gourmet_coffee', 0.7057087421417236), 39 | (u'Coffee', 0.6900454759597778), 40 | (u'o_joe', 0.6891065835952759), 41 | (u'Starbucks_coffee', 0.6874972581863403), 42 | (u'coffee_beans', 0.6749703884124756), 43 | (u'latt\xe9', 0.664122462272644), 44 | (u'cappuccino', 0.662549614906311), 45 | (u'brewed_coffee', 0.6621608138084412), 46 | (u'espresso', 0.6616827249526978)] 47 | 48 | Or if you want to find the cosine similarity between 'coffee' and 'tea', enter: 49 | 50 | >>> wvmodel.similarity('coffee', 'tea') # outputs: 0.56352921707810621 51 | 52 | Semantic meaning can be reflected by their differences. 
For example, we can vaguely 53 | say `France` - `Paris` = `Taiwan` - `Taipei`, or `man` - `actor` = `woman` - `actress`. 54 | First define the cosine similarity for readability: 55 | 56 | >>> from scipy.spatial.distance import cosine 57 | >>> similarity = lambda u, v: 1-cosine(u, v) 58 | 59 | Then 60 | 61 | >>> similarity(wvmodel['France'] + wvmodel['Taipei'] - wvmodel['Taiwan'], wvmodel['Paris']) # outputs: 0.70574580801216202 62 | >>> similarity(wvmodel['woman'] + wvmodel['actor'] - wvmodel['man'], wvmodel['actress']) # outputs: 0.876354245612604 63 | 64 | GloVe 65 | ----- 66 | 67 | The Stanford NLP Group developed a similar word-embedding algorithm, with a good theory explaining how 68 | it works. Its usage is very similar to that of Word2Vec. 69 | 70 | One can convert a text-format GloVe model into a text-format Word2Vec model. More information can be found 71 | in the documentation of `gensim`: `Converting GloVe to Word2Vec 72 | `_ 73 | 74 | FastText 75 | -------- 76 | 77 | FastText is a similar word-embedding model from Facebook. You can download pre-trained models here: 78 | 79 | `Pre-trained word vectors 80 | `_ 81 | 82 | To load a pre-trained FastText model, run: 83 | 84 | >>> import shorttext 85 | >>> ftmodel = shorttext.utils.load_fasttext_model('/path/to/model.bin') 86 | 87 | And it is used exactly the same way as Word2Vec. 88 | 89 | .. automodule:: shorttext.utils.wordembed 90 | :members: load_fasttext_model 91 | 92 | Poincaré Embeddings 93 | ------------------- 94 | 95 | The Poincaré embedding is an embedding that learns both semantic similarity and hierarchical structures. To load a 96 | pre-trained model, run: 97 | 98 | >>> import shorttext 99 | >>> pemodel = shorttext.utils.load_poincare_model('/path/to/model.txt') 100 | 101 | For preloaded word-embedding models, please refer to :doc:`tutorial_wordembed`. 102 | 103 | .. automodule:: shorttext.utils.wordembed 104 | :members: load_poincare_model 105 | 106 | BERT 107 | ---- 108 | 109 | BERT_ (Bidirectional Encoder Representations from Transformers) 110 | is a transformer-based language model. This package supports token 111 | and sentence embeddings using pre-trained language models, provided 112 | by the package written by HuggingFace_. In `shorttext`, to run: 113 | 114 | >>> from shorttext.utils import WrappedBERTEncoder 115 | >>> encoder = WrappedBERTEncoder() # the default model and tokenizer are loaded 116 | >>> sentences_embedding, tokens_embedding, tokens = encoder.encode_sentences(['The car should turn right.', 'The answer is right.']) 117 | 118 | The third line returns the embeddings of all sentences, the embeddings of all tokens in each sentence, 119 | and the tokens themselves (with `CLS` and `SEP` included). Unlike previous embeddings, 120 | token embeddings depend on the context; in the above example, the embeddings of the 121 | two "right"'s are different as they have different meanings. 122 | 123 | The default BERT model and tokenizer are `bert-base-uncased`. 124 | If you want to use others, refer to `HuggingFace's model list 125 | `_ . 126 | 127 | .. autoclass:: shorttext.utils.transformers.BERTObject 128 | :members: 129 | 130 | .. autoclass:: shorttext.utils.transformers.WrappedBERTEncoder 131 | :members: 132 | 133 | 134 | Other Functions 135 | --------------- 136 | 137 | .. 
automodule:: shorttext.utils.wordembed 138 | :members: shorttext_to_avgvec 139 | 140 | 141 | Links 142 | ----- 143 | 144 | - Word2Vec_ 145 | - GloVe_ 146 | - FastText_ 147 | - BERT_ 148 | - HuggingFace_ 149 | 150 | Reference 151 | --------- 152 | 153 | Jacob Devlin, Ming-Wei Chang, Kenton Lee, Kristina Toutanova, "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding," arXiv:1810.04805 (2018). [`arXiv 154 | `_] 155 | 156 | Jayant Jain, "Implementing Poincaré Embeddings," RaRe Technologies (2017). [`RaRe 157 | `_] 158 | 159 | Jeffrey Pennington, Richard Socher, Christopher D. Manning, “GloVe: Global Vectors for Word Representation,” *Empirical Methods in Natural Language Processing (EMNLP)*, pp. 1532-1543 (2014). [`PDF 160 | `_] 161 | 162 | Maximilian Nickel, Douwe Kiela, "Poincaré Embeddings for Learning Hierarchical Representations," arXiv:1705.08039 (2017). [`arXiv 163 | `_] 164 | 165 | Piotr Bojanowski, Edouard Grave, Armand Joulin, Tomas Mikolov, "Enriching Word Vectors with Subword Information," arXiv:1607.04606 (2016). [`arXiv 166 | `_] 167 | 168 | Tomas Mikolov, Kai Chen, Greg Corrado, Jeffrey Dean, “Efficient Estimation of Word Representations in Vector Space,” *ICLR* 2013 (2013). [`arXiv 169 | `_] 170 | 171 | Radim Řehůřek, "Making sense of word2vec," RaRe Technologies (2014). [`RaRe 172 | `_] 173 | 174 | "Probabilistic Theory of Word Embeddings: GloVe," *Everything About Data Analytics*, WordPress (2016). [`WordPress 175 | `_] 176 | 177 | "Toying with Word2Vec," *Everything About Data Analytics*, WordPress (2015). [`WordPress 178 | `_] 179 | 180 | "Word-Embedding Algorithms," *Everything About Data Analytics*, WordPress (2016). [`WordPress 181 | `_] 182 | 183 | Home: :doc:`index` 184 | 185 | .. _Word2Vec: https://code.google.com/archive/p/word2vec/ 186 | .. _GloVe: http://nlp.stanford.edu/projects/glove/ 187 | .. _FastText: https://github.com/facebookresearch/fastText 188 | .. _BERT: https://arxiv.org/abs/1810.04805 189 | .. 
_HuggingFace: https://huggingface.co/ -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "shorttext" 7 | version = "2.2.1" 8 | authors = [ 9 | {name = "Kwan Yuet Stephen Ho", email = "stephenhky@yahoo.com.hk"} 10 | ] 11 | description = "Short Text Mining" 12 | readme = {file = "README.md", content-type = "text/markdown"} 13 | license = {text = "MIT"} 14 | keywords = ["shorttext", "natural language processing", "text mining"] 15 | requires-python = ">=3.9" 16 | classifiers = [ 17 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 18 | "Topic :: Scientific/Engineering :: Mathematics", 19 | "Topic :: Text Processing :: Linguistic", 20 | "Topic :: Software Development :: Libraries :: Python Modules", 21 | "Programming Language :: Python :: 3.9", 22 | "Programming Language :: Python :: 3.10", 23 | "Programming Language :: Python :: 3.11", 24 | "Programming Language :: Python :: 3.12", 25 | "Natural Language :: English", 26 | "License :: OSI Approved :: MIT License", 27 | "Intended Audience :: Developers", 28 | "Intended Audience :: Education", 29 | "Intended Audience :: Information Technology", 30 | "Intended Audience :: Science/Research" 31 | ] 32 | dependencies = [ 33 | "numpy>=1.23.3", 34 | "scipy>=1.12.0", 35 | "joblib>=1.3.0", 36 | "scikit-learn>=1.2.0", 37 | "tensorflow>=2.13.0", 38 | "keras>=2.13.0", 39 | "gensim>=4.0.0", 40 | "pandas>=1.2.0", 41 | "snowballstemmer>=3.0.0", 42 | "transformers>=4.39.0", 43 | "torch>=2.0.0", 44 | "numba>=0.57.0", 45 | "deprecation>=2.0.0" 46 | ] 47 | 48 | [project.urls] 49 | Repository = "https://github.com/stephenhky/PyShortTextCategorization" 50 | Issues = "https://github.com/stephenhky/PyShortTextCategorization/issues" 51 | Documentation = "https://shorttext.readthedocs.io" 52 | 53 | [tool.setuptools] 54 | packages = [ 55 | "shorttext", 56 | "shorttext.cli", 57 | "shorttext.utils", 58 | "shorttext.classifiers", 59 | "shorttext.classifiers.embed", 60 | "shorttext.classifiers.embed.nnlib", 61 | "shorttext.classifiers.embed.sumvec", 62 | "shorttext.classifiers.bow", 63 | "shorttext.classifiers.bow.topic", 64 | "shorttext.classifiers.bow.maxent", 65 | "shorttext.data", 66 | "shorttext.stack", 67 | "shorttext.generators", 68 | "shorttext.generators.bow", 69 | "shorttext.generators.charbase", 70 | "shorttext.generators.seq2seq", 71 | "shorttext.metrics", 72 | "shorttext.metrics.dynprog", 73 | "shorttext.metrics.wasserstein", 74 | "shorttext.metrics.transformers", 75 | "shorttext.metrics.embedfuzzy", 76 | "shorttext.spell" 77 | ] 78 | zip-safe = false 79 | 80 | [project.scripts] 81 | ShortTextCategorizerConsole = "shorttext.cli.categorization:main" 82 | ShortTextWordEmbedSimilarity = "shorttext.cli.wordembedsim:main" 83 | 84 | [project.optional-dependencies] 85 | test = ["unittest2", "pytest", "simplerepresentations>=0.0.4"] 86 | -------------------------------------------------------------------------------- /shorttext/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from . import metrics 3 | from . import classifiers 4 | from . import data 5 | from . import generators 6 | from . import spell 7 | from . import stack 8 | from . 
import utils 9 | -------------------------------------------------------------------------------- /shorttext/classifiers/__init__.py: -------------------------------------------------------------------------------- 1 | from .embed import * 2 | from .embed import SumEmbeddedVecClassifier, load_sumword2vec_classifier 3 | from .embed import VarNNEmbeddedVecClassifier, load_varnnlibvec_classifier 4 | from .embed import frameworks 5 | from .embed.sumvec import frameworks as sumvecframeworks 6 | 7 | from .bow.topic.TopicVectorDistanceClassification import TopicVecCosineDistanceClassifier as TopicVectorCosineDistanceClassifier 8 | from .bow.topic.TopicVectorDistanceClassification import train_autoencoder_cosineClassifier, train_gensimtopicvec_cosineClassifier 9 | from .bow.topic.TopicVectorDistanceClassification import load_autoencoder_cosineClassifier, load_gensimtopicvec_cosineClassifier 10 | 11 | from .bow.topic.SkLearnClassification import TopicVectorSkLearnClassifier 12 | from .bow.topic.SkLearnClassification import train_gensim_topicvec_sklearnclassifier, train_autoencoder_topic_sklearnclassifier 13 | from .bow.topic.SkLearnClassification import load_gensim_topicvec_sklearnclassifier, load_autoencoder_topic_sklearnclassifier 14 | 15 | from .bow.maxent.MaxEntClassification import MaxEntClassifier, load_maxent_classifier -------------------------------------------------------------------------------- /shorttext/classifiers/bow/__init__.py: -------------------------------------------------------------------------------- 1 | from . import topic 2 | from . import maxent -------------------------------------------------------------------------------- /shorttext/classifiers/bow/maxent/__init__.py: -------------------------------------------------------------------------------- 1 | from . import MaxEntClassification -------------------------------------------------------------------------------- /shorttext/classifiers/bow/topic/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from . import TopicVectorDistanceClassification 3 | from . import SkLearnClassification -------------------------------------------------------------------------------- /shorttext/classifiers/embed/__init__.py: -------------------------------------------------------------------------------- 1 | from . import nnlib 2 | from . import sumvec 3 | 4 | from .nnlib import frameworks 5 | from .nnlib.VarNNEmbedVecClassification import VarNNEmbeddedVecClassifier 6 | from .nnlib.VarNNEmbedVecClassification import load_varnnlibvec_classifier 7 | from .nnlib.frameworks import CNNWordEmbed, DoubleCNNWordEmbed, CLSTMWordEmbed 8 | from .sumvec.frameworks import DenseWordEmbed 9 | from .sumvec.SumEmbedVecClassification import SumEmbeddedVecClassifier 10 | from .sumvec.SumEmbedVecClassification import load_sumword2vec_classifier 11 | from .sumvec.VarNNSumEmbedVecClassification import VarNNSumEmbeddedVecClassifier 12 | -------------------------------------------------------------------------------- /shorttext/classifiers/embed/nnlib/__init__.py: -------------------------------------------------------------------------------- 1 | from . import VarNNEmbedVecClassification 2 | from . 
import frameworks -------------------------------------------------------------------------------- /shorttext/classifiers/embed/sumvec/SumEmbedVecClassification.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | from collections import defaultdict 3 | 4 | import numpy as np 5 | from scipy.spatial.distance import cosine 6 | 7 | from ....utils.classification_exceptions import ModelNotTrainedException 8 | from ....utils import shorttext_to_avgvec 9 | from ....utils.compactmodel_io import CompactIOMachine 10 | 11 | 12 | class SumEmbeddedVecClassifier(CompactIOMachine): 13 | """ 14 | This is a supervised classification algorithm for short text categorization. 15 | Each class label has a few short sentences, where each token is converted 16 | to an embedded vector, given by a pre-trained word-embedding model (e.g., Google Word2Vec model). 17 | They are then summed up and normalized to a unit vector for that particular class labels. 18 | To perform prediction, the input short sentences is converted to a unit vector 19 | in the same way. The similarity score is calculated by the cosine similarity. 20 | 21 | A pre-trained Google Word2Vec model can be downloaded `here 22 | `_. 23 | """ 24 | 25 | def __init__(self, wvmodel, vecsize=None, simfcn=lambda u, v: 1-cosine(u, v)): 26 | """ Initialize the classifier. 27 | 28 | :param wvmodel: Word2Vec model 29 | :param vecsize: length of the embedded vectors in the model (Default: None, directly extracted from word-embedding model) 30 | :param simfcn: similarity function (Default: cosine similarity) 31 | :type wvmodel: gensim.models.keyedvectors.KeyedVectors 32 | :type vecsize: int 33 | :type simfcn: function 34 | """ 35 | CompactIOMachine.__init__(self, {'classifier': 'sumvec'}, 'sumvec', ['_embedvecdict.pkl']) 36 | self.wvmodel = wvmodel 37 | self.vecsize = self.wvmodel.vector_size if vecsize == None else vecsize 38 | self.simfcn = simfcn 39 | self.trained = False 40 | 41 | def train(self, classdict): 42 | """ Train the classifier. 43 | 44 | If this has not been run, or a model was not loaded by :func:`~loadmodel`, 45 | a `ModelNotTrainedException` will be raised while performing prediction or saving 46 | the model. 47 | 48 | :param classdict: training data 49 | :return: None 50 | :type classdict: dict 51 | """ 52 | self.addvec = defaultdict(lambda : np.zeros(self.vecsize)) 53 | for classtype in classdict: 54 | self.addvec[classtype] = np.sum([self.shorttext_to_embedvec(shorttext) 55 | for shorttext in classdict[classtype]], 56 | axis=0) 57 | self.addvec[classtype] /= np.linalg.norm(self.addvec[classtype]) 58 | self.addvec = dict(self.addvec) 59 | self.trained = True 60 | 61 | def savemodel(self, nameprefix): 62 | """ Save the trained model into files. 63 | 64 | Given the prefix of the file paths, save the model into files, with name given by the prefix, 65 | and add "_embedvecdict.pickle" at the end. If there is no trained model, a `ModelNotTrainedException` 66 | will be thrown. 67 | 68 | :param nameprefix: prefix of the file path 69 | :return: None 70 | :type nameprefix: str 71 | :raise: ModelNotTrainedException 72 | """ 73 | if not self.trained: 74 | raise ModelNotTrainedException() 75 | pickle.dump(self.addvec, open(nameprefix+'_embedvecdict.pkl', 'wb')) 76 | 77 | def loadmodel(self, nameprefix): 78 | """ Load a trained model from files. 79 | 80 | Given the prefix of the file paths, load the model from files with name given by the prefix 81 | followed by "_embedvecdict.pickle". 
82 | 83 | If this has not been run, or a model was not trained by :func:`~train`, 84 | a `ModelNotTrainedException` will be raised while performing prediction and saving the model. 85 | 86 | :param nameprefix: prefix of the file path 87 | :return: None 88 | :type nameprefix: str 89 | """ 90 | self.addvec = pickle.load(open(nameprefix+'_embedvecdict.pkl', 'rb')) 91 | self.trained = True 92 | 93 | def shorttext_to_embedvec(self, shorttext): 94 | """ Convert the short text into an averaged embedded vector representation. 95 | 96 | Given a short sentence, it converts all the tokens into embedded vectors according to 97 | the given word-embedding model, sums 98 | them up, and normalize the resulting vector. It returns the resulting vector 99 | that represents this short sentence. 100 | 101 | :param shorttext: a short sentence 102 | :return: an embedded vector that represents the short sentence 103 | :type shorttext: str 104 | :rtype: numpy.ndarray 105 | """ 106 | return shorttext_to_avgvec(shorttext, self.wvmodel) 107 | 108 | def score(self, shorttext): 109 | """ Calculate the scores for all the class labels for the given short sentence. 110 | 111 | Given a short sentence, calculate the classification scores for all class labels, 112 | returned as a dictionary with key being the class labels, and values being the scores. 113 | If the short sentence is empty, or if other numerical errors occur, the score will be `numpy.nan`. 114 | 115 | If neither :func:`~train` nor :func:`~loadmodel` was run, it will raise `ModelNotTrainedException`. 116 | 117 | :param shorttext: a short sentence 118 | :return: a dictionary with keys being the class labels, and values being the corresponding classification scores 119 | :type shorttext: str 120 | :rtype: dict 121 | :raise: ModelNotTrainedException 122 | """ 123 | if not self.trained: 124 | raise ModelNotTrainedException() 125 | vec = self.shorttext_to_embedvec(shorttext) 126 | scoredict = {} 127 | for classtype in self.addvec: 128 | try: 129 | scoredict[classtype] = self.simfcn(vec, self.addvec[classtype]) 130 | except ValueError: 131 | scoredict[classtype] = np.nan 132 | return scoredict 133 | 134 | 135 | def load_sumword2vec_classifier(wvmodel, name, compact=True, vecsize=None): 136 | """ Load a :class:`shorttext.classifiers.SumEmbeddedVecClassifier` instance from file, given the pre-trained Word2Vec model. 137 | 138 | :param wvmodel: Word2Vec model 139 | :param name: name (if compact=True) or prefix (if compact=False) of the file path 140 | :param compact whether model file is compact (Default: True) 141 | :param vecsize: length of embedded vectors in the model (Default: None, directly extracted from word-embedding model) 142 | :return: the classifier 143 | :type wvmodel: gensim.models.keyedvectors.KeyedVectors 144 | :type name: str 145 | :type compact: bool 146 | :type vecsize: int 147 | :rtype: SumEmbeddedVecClassifier 148 | """ 149 | classifier = SumEmbeddedVecClassifier(wvmodel, vecsize=vecsize) 150 | if compact: 151 | classifier.load_compact_model(name) 152 | else: 153 | classifier.loadmodel(name) 154 | return classifier -------------------------------------------------------------------------------- /shorttext/classifiers/embed/sumvec/__init__.py: -------------------------------------------------------------------------------- 1 | from . import SumEmbedVecClassification 2 | from . import VarNNSumEmbedVecClassification 3 | from . 
import frameworks -------------------------------------------------------------------------------- /shorttext/classifiers/embed/sumvec/frameworks.py: -------------------------------------------------------------------------------- 1 | 2 | from tensorflow.keras.layers import Dense, Activation 3 | from tensorflow.keras.models import Sequential 4 | from tensorflow.keras.regularizers import l2 5 | 6 | from ....utils.classification_exceptions import UnequalArrayLengthsException 7 | 8 | 9 | def DenseWordEmbed(nb_labels, 10 | dense_nb_nodes=[], 11 | dense_actfcn=[], 12 | vecsize=300, 13 | reg_coef=0.1, 14 | final_activiation='softmax', 15 | optimizer='adam'): 16 | """ Return layers of dense neural network. 17 | 18 | Return layers of dense neural network. This assumes the input to be a rank-1 vector. 19 | 20 | :param nb_labels: number of class labels 21 | :param dense_nb_nodes: number of nodes in each later (Default: []) 22 | :param dense_actfcn: activation functions for each layer (Default: []) 23 | :param vecsize: length of the embedded vectors in the model (Default: 300) 24 | :param reg_coef: regularization coefficient (Default: 0.1) 25 | :param final_activiation: activation function of the final layer (Default: softmax) 26 | :param optimizer: optimizer for gradient descent. Options: sgd, rmsprop, adagrad, adadelta, adam, adamax, nadam. (Default: adam) 27 | :return: keras sequential model for dense neural network 28 | :type nb_labels: int 29 | :type dense_nb_nodes: list 30 | :type dense_actfcn: list 31 | :type vecsize: int 32 | :type reg_coef: float 33 | :type final_activiation: str 34 | :type optimizer: str 35 | :rtype: keras.models.Model 36 | """ 37 | if len(dense_nb_nodes)!=len(dense_actfcn): 38 | raise UnequalArrayLengthsException(dense_nb_nodes, dense_actfcn) 39 | nb_layers = len(dense_nb_nodes) 40 | 41 | model = Sequential() 42 | if nb_layers==0: 43 | model.add(Dense(nb_labels, input_shape=(vecsize,), kernel_regularizer=l2(reg_coef))) 44 | else: 45 | model.add(Dense(dense_nb_nodes[0], 46 | input_shape=(vecsize,), 47 | activation=dense_actfcn[0], 48 | kernel_regularizer=l2(reg_coef)) 49 | ) 50 | for nb_nodes, activation in zip(dense_nb_nodes[1:], dense_actfcn[1:]): 51 | model.add(Dense(nb_nodes, activation=activation, kernel_regularizer=l2(reg_coef))) 52 | model.add(Dense(nb_labels, kernel_regularizer=l2(reg_coef))) 53 | 54 | # final activation layer 55 | model.add(Activation(final_activiation)) 56 | model.compile(loss='categorical_crossentropy', optimizer=optimizer) 57 | 58 | return model -------------------------------------------------------------------------------- /shorttext/cli/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenhky/PyShortTextCategorization/a7caf4edeb86b3b69a56632d24fa7ee56d12621d/shorttext/cli/__init__.py -------------------------------------------------------------------------------- /shorttext/cli/categorization.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | from functools import partial 4 | import argparse 5 | import logging 6 | 7 | from ..utils.compactmodel_io import get_model_classifier_name 8 | from ..utils.classification_exceptions import AlgorithmNotExistException, WordEmbeddingModelNotExistException 9 | from ..utils import load_word2vec_model, load_fasttext_model, load_poincare_model 10 | from ..smartload import smartload_compact_model 11 | from ..classifiers import TopicVectorCosineDistanceClassifier 12 | 13 | 
logging.basicConfig(level=logging.INFO) 14 | logger = logging.getLogger(__name__) 15 | 16 | allowed_classifiers = [ 17 | 'ldatopic', 'lsitopic', 'rptopic', 'kerasautoencoder', 18 | 'topic_sklearn', 'nnlibvec', 'sumvec', 'maxent' 19 | ] 20 | needembedded_classifiers = ['nnlibvec', 'sumvec'] 21 | topicmodels = ['ldatopic', 'lsitopic', 'rptopic', 'kerasautoencoder'] 22 | 23 | load_word2vec_nonbinary_model = partial(load_word2vec_model, binary=False) 24 | load_poincare_binary_model = partial(load_poincare_model, binary=True) 25 | 26 | typedict = { 27 | 'word2vec': load_word2vec_model, 28 | 'word2vec_nonbinary': load_word2vec_nonbinary_model, 29 | 'fasttext': load_fasttext_model, 30 | 'poincare': load_poincare_model, 31 | 'poincare_binary': load_poincare_binary_model 32 | } 33 | 34 | 35 | def get_argparser(): 36 | parser = argparse.ArgumentParser( 37 | description='Perform prediction on short text with a given trained model.' 38 | ) 39 | parser.add_argument('model_filepath', help='Path of the trained (compact) model.') 40 | parser.add_argument('--wv', default='', help='Path of the pre-trained Word2Vec model.') 41 | parser.add_argument('--vecsize', default=300, type=int, help='Vector dimensions. (Default: 300)') 42 | parser.add_argument('--topn', type=int, default=10, help='Number of top results to show.') 43 | parser.add_argument('--inputtext', default=None, help='Single input text for classification. If omitted, will enter console mode.') 44 | parser.add_argument('--type', default='word2vec', choices=typedict.keys(), 45 | help='Type of word-embedding model (default: word2vec)') 46 | return parser 47 | 48 | # main block 49 | def main(): 50 | # argument parsing 51 | args = get_argparser().parse_args() 52 | 53 | # check if the model file is given 54 | if not os.path.exists(args.model_filepath): 55 | raise IOError(f'Model file "{args.model_filepath}" not found!') 56 | 57 | # get the name of the classifier 58 | logger.info('Retrieving classifier name...') 59 | classifier_name = get_model_classifier_name(args.model_filepath) 60 | 61 | if classifier_name not in allowed_classifiers: 62 | raise AlgorithmNotExistException(classifier_name) 63 | 64 | # load the Word2Vec model if necessary 65 | wvmodel = None 66 | if classifier_name in needembedded_classifiers: 67 | # check if the word embedding model is available 68 | if not os.path.exists(args.wv): 69 | raise WordEmbeddingModelNotExistException(args.wv) 70 | # if there, load it 71 | logger.info(f'Loading word-embedding model from {args.wv}...') 72 | wvmodel = typedict[args.type](args.wv) 73 | 74 | # load the classifier 75 | logger.info('Initializing the classifier...') 76 | if classifier_name in topicmodels: 77 | topicmodel = smartload_compact_model(args.model_filepath, wvmodel, vecsize=args.vecsize) 78 | classifier = TopicVectorCosineDistanceClassifier(topicmodel) 79 | else: 80 | classifier = smartload_compact_model(args.model_filepath, wvmodel, vecsize=args.vecsize) 81 | 82 | # predict single input or run in console mode 83 | if args.inputtext is not None: 84 | if len(args.inputtext.strip()) == 0: 85 | print('No input text provided.') 86 | return 87 | scoredict = classifier.score(args.inputtext) 88 | for label, score in sorted(scoredict.items(), key=lambda x: x[1], reverse=True)[:args.topn]: 89 | print(f'{label} : {score:.4f}') 90 | else: 91 | # Console 92 | print('Enter text to classify (empty input to quit):') 93 | while True: 94 | shorttext = input('text> ').strip() 95 | if not shorttext: 96 | break 97 | scoredict = classifier.score(shorttext) 98 | 
for label, score in sorted(scoredict.items(), key=lambda x: x[1], reverse=True)[:args.topn]: 99 | print(f'{label} : {score:.4f}') 100 | print('Done.') 101 | 102 | if __name__ == "__main__": 103 | main() 104 | -------------------------------------------------------------------------------- /shorttext/cli/wordembedsim.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse 3 | import time 4 | 5 | from scipy.spatial.distance import cosine 6 | 7 | from ..metrics.embedfuzzy import jaccardscore_sents 8 | from ..utils import tokenize, load_word2vec_model, load_fasttext_model, load_poincare_model 9 | from ..utils import shorttext_to_avgvec 10 | from ..metrics.wasserstein import word_mover_distance 11 | from ..metrics.dynprog.jaccard import soft_jaccard_score 12 | 13 | 14 | typedict = { 15 | 'word2vec': load_word2vec_model, 16 | 'fasttext': load_fasttext_model, 17 | 'poincare': load_poincare_model 18 | } 19 | 20 | 21 | def getargparser(): 22 | parser = argparse.ArgumentParser(description='Find the similarities between two short sentences using Word2Vec.') 23 | parser.add_argument('modelpath', help='Path of the Word2Vec model') 24 | parser.add_argument('--type', default='word2vec', 25 | help='Type of word-embedding model (default: "word2vec"; other options: "fasttext", "poincare")') 26 | return parser 27 | 28 | 29 | def main(): 30 | # argument parsing 31 | args = getargparser().parse_args() 32 | 33 | # preload tokenizer 34 | tokenize('Mogu is cute.') 35 | 36 | time0 = time.time() 37 | print("Loading "+args.type+" model: "+args.modelpath) 38 | wvmodel = typedict[args.type](args.modelpath) 39 | time1 = time.time() 40 | end = False 41 | print("... loading time: "+str(time1 - time0)+" seconds") 42 | 43 | while not end: 44 | sent1 = input('sent1> ') 45 | if len(sent1)==0: 46 | end = True 47 | else: 48 | sent2 = input('sent2> ') 49 | 50 | # output results 51 | print("Cosine Similarity = %.4f" % (1 - cosine(shorttext_to_avgvec(sent1, wvmodel), shorttext_to_avgvec(sent2, wvmodel)))) 52 | print("Word-embedding Jaccard Score Similarity = %.4f" % jaccardscore_sents(sent1, sent2, wvmodel)) 53 | print("Word Mover's Distance = %.4f" % word_mover_distance(tokenize(sent1), tokenize(sent2), wvmodel)) 54 | print("Soft Jaccard Score (edit distance) = %.4f" % soft_jaccard_score(tokenize(sent1), tokenize(sent2))) 55 | 56 | -------------------------------------------------------------------------------- /shorttext/data/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .data_retrieval import subjectkeywords, nihreports, inaugural, retrieve_jsondata_as_dict, retrieve_csvdata_as_dict, yield_crossvalidation_classdicts 3 | -------------------------------------------------------------------------------- /shorttext/data/data_retrieval.py: -------------------------------------------------------------------------------- 1 | 2 | import random 3 | from collections import defaultdict 4 | import json 5 | import os 6 | import zipfile 7 | import sys 8 | import csv 9 | from urllib.request import urlretrieve 10 | 11 | import pandas as pd 12 | import numpy as np 13 | 14 | 15 | def retrieve_csvdata_as_dict(filepath): 16 | """ Retrieve the training data in a CSV file. 17 | 18 | Retrieve the training data in a CSV file, with the first column being the 19 | class labels, and second column the text data. It returns a dictionary with 20 | the class labels as keys, and a list of short texts as the value for each key. 
21 | 22 | :param filepath: path of the training data (CSV) 23 | :return: a dictionary with class labels as keys, and lists of short texts 24 | :type filepath: str 25 | :rtype: dict 26 | """ 27 | datafile = open(filepath, 'r') 28 | reader = csv.reader(datafile) 29 | headerread = False 30 | shorttextdict = defaultdict(lambda: []) 31 | for label, content in reader: 32 | if headerread: 33 | if isinstance(content, str): 34 | shorttextdict[label] += [content] 35 | else: 36 | headerread = True 37 | return dict(shorttextdict) 38 | 39 | 40 | def retrieve_jsondata_as_dict(filepath): 41 | """ Retrieve the training data in a JSON file. 42 | 43 | Retrieve the training data in a JSON file, with 44 | the class labels as keys, and a list of short texts as the value for each key. 45 | It returns the corresponding dictionary. 46 | 47 | :param filepath: path of the training data (JSON) 48 | :return: a dictionary with class labels as keys, and lists of short texts 49 | :type filepath: str 50 | :rtype: dict 51 | """ 52 | return json.load(open(filepath, 'r')) 53 | 54 | 55 | def subjectkeywords(): 56 | """ Return an example data set of subjects. 57 | 58 | Return an example data set, with three subjects and corresponding keywords. 59 | This is in the format of the training input. 60 | 61 | :return: example data set 62 | :rtype: dict 63 | """ 64 | this_dir, _ = os.path.split(__file__) 65 | return retrieve_csvdata_as_dict(os.path.join(this_dir, 'shorttext_exampledata.csv')) 66 | 67 | 68 | def inaugural(): 69 | """ Return an example dataset, which is the Inaugural Addresses of all Presidents of 70 | the United States from George Washington to Barack Obama. 71 | 72 | Each key is the year, a dash, and the last name of the president. The content is 73 | the list of all the sentences 74 | 75 | :return: example data set 76 | :rtype: dict 77 | """ 78 | zfile = zipfile.ZipFile(get_or_download_data("USInaugural.zip", 79 | "https://shorttext-data-northernvirginia.s3.amazonaws.com/trainingdata/USInaugural.zip", 80 | asbytes=True), 81 | ) 82 | address_jsonstr = zfile.open("addresses.json").read() 83 | zfile.close() 84 | return json.loads(address_jsonstr.decode('utf-8')) 85 | 86 | 87 | def nihreports(txt_col='PROJECT_TITLE', label_col='FUNDING_ICs', sample_size=512): 88 | """ Return an example data set, sampled from NIH RePORT (Research Portfolio 89 | Online Reporting Tools). 90 | 91 | Return an example data set from NIH (National Institutes of Health), 92 | data publicly available from their RePORT 93 | website. (`link 94 | `_). 95 | The data is with `txt_col` being either project titles ('PROJECT_TITLE') 96 | or proposal abstracts ('ABSTRACT_TEXT'), and label_col being the names of the ICs (Institutes or Centers), 97 | with 'IC_NAME' the whole form, and 'FUNDING_ICs' the abbreviated form). 98 | 99 | Dataset directly adapted from the NIH data from `R` package `textmineR 100 | `_. 101 | 102 | :param txt_col: column for the text (Default: 'PROJECT_TITLE') 103 | :param label_col: column for the labels (Default: 'FUNDING_ICs') 104 | :param sample_size: size of the sample. Set to None if all rows. (Default: 512) 105 | :return: example data set 106 | :type txt_col: str 107 | :type label_col: str 108 | :type sample_size: int 109 | :rtype: dict 110 | """ 111 | # validation 112 | # txt_col = 'PROJECT_TITLE' or 'ABSTRACT_TEXT' 113 | # label_col = 'FUNDING_ICs' or 'IC_NAME' 114 | if not (txt_col in ['PROJECT_TITLE', 'ABSTRACT_TEXT']): 115 | raise KeyError('Undefined text column: '+txt_col+'. 
Must be PROJECT_TITLE or ABSTRACT_TEXT.') 116 | if not (label_col in ['FUNDING_ICs', 'IC_NAME']): 117 | raise KeyError('Undefined label column: '+label_col+'. Must be FUNDING_ICs or IC_NAME.') 118 | 119 | zfile = zipfile.ZipFile(get_or_download_data('nih_full.csv.zip', 120 | 'https://shorttext-data-northernvirginia.s3.amazonaws.com/trainingdata/nih_full.csv.zip', 121 | asbytes=True), 122 | 'r', 123 | zipfile.ZIP_DEFLATED) 124 | nih = pd.read_csv(zfile.open('nih_full.csv'), na_filter=False, usecols=[label_col, txt_col], encoding='cp437') 125 | zfile.close() 126 | nb_data = len(nih) 127 | sample_size = nb_data if sample_size==None else min(nb_data, sample_size) 128 | 129 | classdict = defaultdict(lambda : []) 130 | 131 | for rowidx in np.random.randint(nb_data, size=min(nb_data, sample_size)): 132 | label = nih.iloc[rowidx, nih.columns.get_loc(label_col)] 133 | if label_col=='FUNDING_ICs': 134 | if label=='': 135 | label = 'OTHER' 136 | else: 137 | endpos = label.index(':') 138 | label = label[:endpos] 139 | classdict[label] += [nih.iloc[rowidx, nih.columns.get_loc(txt_col)]] 140 | 141 | return dict(classdict) 142 | 143 | 144 | def mergedict(dicts): 145 | """ Merge data dictionary. 146 | 147 | Merge dictionaries of the data in the training data format. 148 | 149 | :param dicts: dicts to merge 150 | :return: merged dict 151 | :type dicts: list 152 | :rtype: dict 153 | """ 154 | mdict = defaultdict(lambda : []) 155 | for thisdict in dicts: 156 | for label in thisdict: 157 | mdict[label] += thisdict[label] 158 | return dict(mdict) 159 | 160 | 161 | def yield_crossvalidation_classdicts(classdict, nb_partitions, shuffle=False): 162 | """ Yielding test data and training data for cross validation by partitioning it. 163 | 164 | Given a training data, partition the data into portions, each will be used as test 165 | data set, while the other training data set. It returns a generator. 
166 | 167 | :param classdict: training data 168 | :param nb_partitions: number of partitions 169 | :param shuffle: whether to shuffle the data before partitioning 170 | :return: generator, producing a test data set and a training data set each time 171 | :type classdict: dict 172 | :type nb_partitions: int 173 | :type shuffle: bool 174 | :rtype: generator 175 | """ 176 | crossvaldicts = [] 177 | for _ in range(nb_partitions): 178 | crossvaldicts.append(defaultdict(lambda: [])) 179 | 180 | for label in classdict: 181 | nb_data = len(classdict[label]) 182 | partsize = nb_data // nb_partitions 183 | sentences = classdict[label] if not shuffle else random.sample(classdict[label], nb_data) 184 | for i in range(nb_partitions): 185 | crossvaldicts[i][label] += sentences[i * partsize:min(nb_data, (i + 1) * partsize)] 186 | crossvaldicts = [dict(crossvaldict) for crossvaldict in crossvaldicts] 187 | 188 | for i in range(nb_partitions): 189 | testdict = crossvaldicts[i] 190 | traindict = mergedict([crossvaldicts[j] for j in range(nb_partitions) if j != i]) 191 | yield testdict, traindict 192 | 193 | 194 | def get_or_download_data(filename, origin, asbytes=False): 195 | # determine path 196 | homedir = os.path.expanduser('~') 197 | datadir = os.path.join(homedir, '.shorttext') 198 | if not os.path.exists(datadir): 199 | os.makedirs(datadir) 200 | 201 | targetfilepath = os.path.join(datadir, filename) 202 | # download if not exist 203 | if not os.path.exists(os.path.join(datadir, filename)): 204 | print('Downloading...') 205 | print('Source: ', origin) 206 | print('Target: ', targetfilepath) 207 | try: 208 | urlretrieve(origin, targetfilepath) 209 | except Exception: 210 | print('Failure to download file!') 211 | print(sys.exc_info()) 212 | os.remove(targetfilepath) 213 | 214 | # return 215 | return open(targetfilepath, 'rb' if asbytes else 'r') 216 | -------------------------------------------------------------------------------- /shorttext/data/shorttext_exampledata.csv: -------------------------------------------------------------------------------- 1 | subject,content 2 | mathematics,linear algebra 3 | mathematics,topology 4 | mathematics,algebra 5 | mathematics,calculus 6 | mathematics,variational calculus 7 | mathematics,functional field 8 | mathematics,real analysis 9 | mathematics,complex analysis 10 | mathematics,differential equation 11 | mathematics,statistics 12 | mathematics,statistical optimization 13 | mathematics,probability 14 | mathematics,stochastic calculus 15 | mathematics,numerical analysis 16 | mathematics,differential geometry 17 | physics,renormalization 18 | physics,classical mechanics 19 | physics,quantum mechanics 20 | physics,statistical mechanics 21 | physics,functional field 22 | physics,path integral 23 | physics,quantum field theory 24 | physics,electrodynamics 25 | physics,condensed matter 26 | physics,particle physics 27 | physics,topological solitons 28 | physics,astrophysics 29 | physics,spontaneous symmetry breaking 30 | physics,atomic molecular and optical physics 31 | physics,quantum chaos 32 | theology,divine providence 33 | theology,soteriology 34 | theology,anthropology 35 | theology,pneumatology 36 | theology,Christology 37 | theology,Holy Trinity 38 | theology,eschatology 39 | theology,scripture 40 | theology,ecclesiology 41 | theology,predestination 42 | theology,divine degree 43 | theology,creedal confessionalism 44 | theology,scholasticism 45 | theology,prayer 46 | theology,eucharist --------------------------------------------------------------------------------
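The CSV above is the example training data that `subjectkeywords()` loads. A minimal usage sketch of the retrieval and cross-validation helpers defined in `data_retrieval.py` (illustrative only, not part of the package source; it assumes the package is installed as `shorttext`, and the counts in the comments follow from the 45-row example file above):

from shorttext.data import subjectkeywords, yield_crossvalidation_classdicts

# class label -> list of short texts, read from shorttext_exampledata.csv
classdict = subjectkeywords()
print(sorted(classdict.keys()))      # ['mathematics', 'physics', 'theology']
print(len(classdict['physics']))     # 15 short texts per class in the example file

# 3-fold cross validation: each iteration yields a held-out test dict and a
# training dict merged from the remaining partitions.
for testdict, traindict in yield_crossvalidation_classdicts(classdict, 3, shuffle=True):
    print({label: len(texts) for label, texts in testdict.items()})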
/shorttext/generators/__init__.py: -------------------------------------------------------------------------------- 1 | from .bow.GensimTopicModeling import load_gensimtopicmodel 2 | from .bow.AutoEncodingTopicModeling import load_autoencoder_topicmodel 3 | 4 | from .bow.GensimTopicModeling import LatentTopicModeler, GensimTopicModeler, LDAModeler, LSIModeler, RPModeler 5 | from .bow.AutoEncodingTopicModeling import AutoencodingTopicModeler 6 | 7 | from .charbase.char2vec import SentenceToCharVecEncoder, initSentenceToCharVecEncoder 8 | from .seq2seq.s2skeras import Seq2SeqWithKeras, loadSeq2SeqWithKeras 9 | from .seq2seq.charbaseS2S import CharBasedSeq2SeqGenerator, loadCharBasedSeq2SeqGenerator 10 | -------------------------------------------------------------------------------- /shorttext/generators/bow/LatentTopicModeling.py: -------------------------------------------------------------------------------- 1 | 2 | from abc import ABC, abstractmethod 3 | 4 | import numpy as np 5 | 6 | from ...utils import textpreprocessing as textpreprocess, gensim_corpora as gc, classification_exceptions as e 7 | from ...utils.textpreprocessing import tokenize 8 | 9 | # abstract class 10 | class LatentTopicModeler(ABC): 11 | """ 12 | Abstract class for various topic modeler. 13 | """ 14 | def __init__(self, 15 | preprocessor=textpreprocess.standard_text_preprocessor_1(), 16 | normalize=True): 17 | """ Initialize the modeler. 18 | 19 | :param preprocessor: function that preprocesses the text. (Default: `shorttext.utils.textpreprocess.standard_text_preprocessor_1`) 20 | :param normalize: whether the retrieved topic vectors are normalized. (Default: True) 21 | :type preprocessor: function 22 | :type normalize: bool 23 | """ 24 | self.preprocessor = preprocessor 25 | self.normalize = normalize 26 | self.trained = False 27 | 28 | def generate_corpus(self, classdict): 29 | """ Calculate the gensim dictionary and corpus, and extract the class labels 30 | from the training data. Called by :func:`~train`. 31 | 32 | :param classdict: training data 33 | :return: None 34 | :type classdict: dict 35 | """ 36 | self.dictionary, self.corpus, self.classlabels = gc.generate_gensim_corpora(classdict, 37 | preprocess_and_tokenize=lambda sent: tokenize(self.preprocessor(sent))) 38 | @abstractmethod 39 | def train(self, classdict, nb_topics, *args, **kwargs): 40 | """ Train the modeler. 41 | 42 | This is an abstract method of this abstract class, which raise the `NotImplementedException`. 43 | 44 | :param classdict: training data 45 | :param nb_topics: number of latent topics 46 | :param args: arguments to be passed into the wrapped training functions 47 | :param kwargs: arguments to be passed into the wrapped training functions 48 | :return: None 49 | :raise: NotImplementedException 50 | :type classdict: dict 51 | :type nb_topics: int 52 | """ 53 | self.nb_topics = nb_topics 54 | raise e.NotImplementedException() 55 | 56 | def retrieve_bow(self, shorttext): 57 | """ Calculate the gensim bag-of-words representation of the given short text. 58 | 59 | :param shorttext: text to be represented 60 | :return: corpus representation of the text 61 | :type shorttext: str 62 | :rtype: list 63 | """ 64 | return self.dictionary.doc2bow(tokenize(self.preprocessor(shorttext))) 65 | 66 | def retrieve_bow_vector(self, shorttext, normalize=True): 67 | """ Calculate the vector representation of the bag-of-words in terms of numpy.ndarray. 
68 | 69 | :param shorttext: short text 70 | :param normalize: whether the retrieved topic vectors are normalized. (Default: True) 71 | :return: vector represtation of the text 72 | :type shorttext: str 73 | :type normalize: bool 74 | :rtype: numpy.ndarray 75 | """ 76 | bow = self.retrieve_bow(shorttext) 77 | vec = np.zeros(len(self.dictionary)) 78 | for id, val in bow: 79 | vec[id] = val 80 | if normalize: 81 | vec /= np.linalg.norm(vec) 82 | return vec 83 | 84 | @abstractmethod 85 | def retrieve_topicvec(self, shorttext): 86 | """ Calculate the topic vector representation of the short text. 87 | 88 | This is an abstract method of this abstract class, which raise the `NotImplementedException`. 89 | 90 | :param shorttext: short text 91 | :return: topic vector 92 | :raise: NotImplementedException 93 | :type shorttext: str 94 | :rtype: numpy.ndarray 95 | """ 96 | raise e.NotImplementedException() 97 | 98 | @abstractmethod 99 | def get_batch_cos_similarities(self, shorttext): 100 | """ Calculate the cosine similarities of the given short text and all the class labels. 101 | 102 | This is an abstract method of this abstract class, which raise the `NotImplementedException`. 103 | 104 | :param shorttext: short text 105 | :return: topic vector 106 | :raise: NotImplementedException 107 | :type shorttext: str 108 | :rtype: numpy.ndarray 109 | """ 110 | raise e.NotImplementedException() 111 | 112 | def __getitem__(self, shorttext): 113 | return self.retrieve_topicvec(shorttext) 114 | 115 | def __contains__(self, shorttext): 116 | if not self.trained: 117 | raise e.ModelNotTrainedException() 118 | return True 119 | 120 | @abstractmethod 121 | def loadmodel(self, nameprefix): 122 | """ Load the model from files. 123 | 124 | This is an abstract method of this abstract class, which raise the `NotImplementedException`. 125 | 126 | :param nameprefix: prefix of the paths of the model files 127 | :return: None 128 | :raise: NotImplementedException 129 | :type nameprefix: str 130 | """ 131 | raise e.NotImplementedException() 132 | 133 | @abstractmethod 134 | def savemodel(self, nameprefix): 135 | """ Save the model to files. 136 | 137 | This is an abstract method of this abstract class, which raise the `NotImplementedException`. 138 | 139 | :param nameprefix: prefix of the paths of the model files 140 | :return: None 141 | :raise: NotImplementedException 142 | :type nameprefix: str 143 | """ 144 | raise e.NotImplementedException() -------------------------------------------------------------------------------- /shorttext/generators/bow/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from . import AutoEncodingTopicModeling 3 | from . import GensimTopicModeling 4 | from . import LatentTopicModeling 5 | -------------------------------------------------------------------------------- /shorttext/generators/charbase/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from . 
import char2vec 3 | 4 | -------------------------------------------------------------------------------- /shorttext/generators/charbase/char2vec.py: -------------------------------------------------------------------------------- 1 | 2 | from functools import partial 3 | 4 | import numpy as np 5 | from scipy.sparse import csc_matrix 6 | from gensim.corpora import Dictionary 7 | from sklearn.preprocessing import OneHotEncoder 8 | 9 | from ...utils.misc import textfile_generator 10 | 11 | 12 | class SentenceToCharVecEncoder: 13 | """ A class that facilitates one-hot encoding from characters to vectors. 14 | 15 | """ 16 | def __init__(self, dictionary, signalchar='\n'): 17 | """ Initialize the one-hot encoding class. 18 | 19 | :param dictionary: a gensim dictionary 20 | :param signalchar: signal character, useful for seq2seq models (Default: '\n') 21 | :type dictionary: gensim.corpora.Dictionary 22 | :type signalchar: str 23 | """ 24 | self.dictionary = dictionary 25 | self.signalchar = signalchar 26 | numchars = len(self.dictionary) 27 | self.onehot_encoder = OneHotEncoder() 28 | self.onehot_encoder.fit(np.arange(numchars).reshape((numchars, 1))) 29 | 30 | def calculate_prelim_vec(self, sent): 31 | """ Convert the sentence to a one-hot vector. 32 | 33 | :param sent: sentence 34 | :return: a one-hot vector, with each element the code of that character 35 | :type sent: str 36 | :rtype: numpy.array 37 | """ 38 | return self.onehot_encoder.transform( 39 | np.array([self.dictionary.token2id[c] for c in sent]).reshape((len(sent), 1)) 40 | ) 41 | 42 | def encode_sentence(self, sent, maxlen, startsig=False, endsig=False): 43 | """ Encode one sentence to a sparse matrix, with each row the expanded vector of each character. 44 | 45 | :param sent: sentence 46 | :param maxlen: maximum length of the sentence 47 | :param startsig: signal character at the beginning of the sentence (Default: False) 48 | :param endsig: signal character at the end of the sentence (Default: False) 49 | :return: matrix representing the sentence 50 | :type sent: str 51 | :type maxlen: int 52 | :type startsig: bool 53 | :type endsig: bool 54 | :rtype: scipy.sparse.csc_matrix 55 | """ 56 | cor_sent = (self.signalchar if startsig else '') + sent[:min(maxlen, len(sent))] + (self.signalchar if endsig else '') 57 | sent_vec = self.calculate_prelim_vec(cor_sent).tocsc() 58 | if sent_vec.shape[0] == maxlen + startsig + endsig: 59 | return sent_vec 60 | else: 61 | return csc_matrix((sent_vec.data, sent_vec.indices, sent_vec.indptr), 62 | shape=(maxlen + startsig + endsig, sent_vec.shape[1]), 63 | dtype=np.float64) 64 | 65 | def encode_sentences(self, sentences, maxlen, sparse=True, startsig=False, endsig=False): 66 | """ Encode many sentences into a rank-3 tensor. 
67 | 68 | :param sentences: sentences 69 | :param maxlen: maximum length of one sentence 70 | :param sparse: whether to return a sparse matrix (Default: True) 71 | :param startsig: signal character at the beginning of the sentence (Default: False) 72 | :param endsig: signal character at the end of the sentence (Default: False) 73 | :return: rank-3 tensor of the sentences 74 | :type sentences: list 75 | :type maxlen: int 76 | :type sparse: bool 77 | :type startsig: bool 78 | :type endsig: bool 79 | :rtype: scipy.sparse.csc_matrix or numpy.array 80 | """ 81 | encode_sent_func = partial(self.encode_sentence, startsig=startsig, endsig=endsig, maxlen=maxlen) 82 | list_encoded_sentences_map = map(encode_sent_func, sentences) 83 | if sparse: 84 | return list(list_encoded_sentences_map) 85 | else: 86 | return np.array([sparsevec.toarray() for sparsevec in list_encoded_sentences_map]) 87 | 88 | def __len__(self): 89 | return len(self.dictionary) 90 | 91 | 92 | def initSentenceToCharVecEncoder(textfile, encoding=None): 93 | """ Instantiate a class of SentenceToCharVecEncoder from a text file. 94 | 95 | :param textfile: text file 96 | :param encoding: encoding of the text file (Default: None) 97 | :return: an instance of SentenceToCharVecEncoder 98 | :type textfile: file 99 | :type encoding: str 100 | :rtype: SentenceToCharVecEncoder 101 | """ 102 | dictionary = Dictionary(map(lambda line: [c for c in line], textfile_generator(textfile, encoding=encoding))) 103 | return SentenceToCharVecEncoder(dictionary) 104 | -------------------------------------------------------------------------------- /shorttext/generators/seq2seq/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from . import s2skeras 3 | from . import charbaseS2S 4 | -------------------------------------------------------------------------------- /shorttext/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from . import dynprog 3 | from . import embedfuzzy 4 | from . import transformers 5 | from . import wasserstein 6 | -------------------------------------------------------------------------------- /shorttext/metrics/dynprog/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from . import dldist 3 | from . import jaccard 4 | from . import lcp 5 | -------------------------------------------------------------------------------- /shorttext/metrics/dynprog/dldist.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import numba as nb 4 | 5 | 6 | @nb.njit 7 | def damerau_levenshtein(word1: str, word2: str) -> int: 8 | """ Calculate the Demarau-Levenshtein (DL) distance between two words. 
9 | 10 | :param word1: first word 11 | :param word2: second word 12 | :return: Damerau-Levenshtein (DL) distance 13 | :type word1: str 14 | :type word2: str 15 | :rtype: int 16 | """ 17 | len1 = len(word1) 18 | len2 = len(word2) 19 | matrix = np.zeros((len1+1, len2+1), dtype=np.int64) 20 | 21 | for i in range(len1+1): 22 | matrix[i, 0] = i 23 | for j in range(len2+1): 24 | matrix[0, j] = j 25 | 26 | for i in range(1, len1+1): 27 | for j in range(1, len2+1): 28 | cost = 0 29 | if word1[i-1] != word2[j-1]: 30 | cost = 1 31 | delcost = matrix[i-1, j] + 1 32 | inscost = matrix[i, j-1] + 1 33 | subcost = matrix[i-1, j-1] + cost 34 | score = min(min(delcost, inscost), subcost) 35 | if ((i > 1) & (j > 1) & (word1[i - 1] == word2[j - 2]) & (word1[i - 2] == word2[j - 1])): 36 | score = min(score, matrix[i-2, j-2] + cost) 37 | matrix[i, j] = score 38 | 39 | 40 | 41 | return matrix[len1, len2] 42 | -------------------------------------------------------------------------------- /shorttext/metrics/dynprog/jaccard.py: -------------------------------------------------------------------------------- 1 | 2 | from itertools import product 3 | 4 | from .dldist import damerau_levenshtein 5 | from .lcp import longest_common_prefix 6 | 7 | 8 | def similarity(word1, word2): 9 | """ Return the similarity between the two words. 10 | 11 | Return the similarity between the two words, between 0 and 1 inclusively. 12 | The similarity is the maximum of the two values: 13 | - 1 - Damerau-Levenshtein distance between two words / maximum length of the two words 14 | - longest common prefix of the two words / maximum length of the two words 15 | 16 | Reference: Daniel E. Russ, Kwan-Yuet Ho, Calvin A. Johnson, Melissa C. Friesen, "Computer-Based Coding of Occupation Codes for Epidemiological Analyses," *2014 IEEE 27th International Symposium on Computer-Based Medical Systems* (CBMS), pp. 347-350. (2014) [`IEEE 17 | `_] 18 | 19 | :param word1: a word 20 | :param word2: a word 21 | :return: similarity, between 0 and 1 inclusively 22 | :type word1: str 23 | :type word2: str 24 | :rtype: float 25 | """ 26 | maxlen = max(len(word1), len(word2)) 27 | editdistance = damerau_levenshtein(word1, word2) 28 | lcp = longest_common_prefix(word1, word2) 29 | return max(1. - float(editdistance)/maxlen, float(lcp)/maxlen) 30 | 31 | 32 | def soft_intersection_list(tokens1, tokens2): 33 | """ Return the soft number of intersections between two lists of tokens. 34 | 35 | :param tokens1: list of tokens. 36 | :param tokens2: list of tokens. 37 | :return: set of matched token pairs with their similarities. 38 | :type tokens1: list 39 | :type tokens2: list 40 | :rtype: set 41 | """ 42 | intersected_list = [((token1, token2), similarity(token1, token2)) for token1, token2 in product(tokens1, tokens2)] 43 | intersected_list = sorted(intersected_list, key=lambda item: item[1], reverse=True) 44 | 45 | included_list = set() 46 | used_tokens1 = set() 47 | used_tokens2 = set() 48 | for (token1, token2), sim in intersected_list: 49 | if (not (token1 in used_tokens1)) and (not (token2 in used_tokens2)): 50 | included_list.add(((token1, token2), sim)) 51 | used_tokens1.add(token1) 52 | used_tokens2.add(token2) 53 | 54 | return included_list 55 | 56 | 57 | def soft_jaccard_score(tokens1, tokens2): 58 | """ Return the soft Jaccard score of the two lists of tokens, between 0 and 1 inclusively. 59 | 60 | Reference: Daniel E. Russ, Kwan-Yuet Ho, Calvin A. 
Friesen, "Computer-Based Coding of Occupation Codes for Epidemiological Analyses," *2014 IEEE 27th International Symposium on Computer-Based Medical Systems* (CBMS), pp. 347-350. (2014) [`IEEE 61 | `_] 62 | 63 | :param tokens1: list of tokens. 64 | :param tokens2: list of tokens. 65 | :return: soft Jaccard score, between 0 and 1 inclusively. 66 | :type tokens1: list 67 | :type tokens2: list 68 | :rtype: float 69 | """ 70 | intersection_list = soft_intersection_list(tokens1, tokens2) 71 | num_intersections = sum([item[1] for item in intersection_list]) 72 | num_unions = len(tokens1) + len(tokens2) - num_intersections 73 | return float(num_intersections)/float(num_unions) 74 | -------------------------------------------------------------------------------- /shorttext/metrics/dynprog/lcp.py: -------------------------------------------------------------------------------- 1 | 2 | import numba as nb 3 | 4 | 5 | @nb.njit 6 | def longest_common_prefix(word1: str, word2: str) -> int: 7 | """ Calculate the longest common prefix (LCP) between two words. 8 | 9 | :param word1: first word 10 | :param word2: seccond word 11 | :return: longest common prefix (LCP) 12 | :type word1: str 13 | :type word2: str 14 | :rtype: int 15 | """ 16 | lcp = 0 17 | for i in range(min(len(word1), len(word2))): 18 | if word1[i] == word2[i]: 19 | lcp += 1 20 | else: 21 | break 22 | return lcp 23 | -------------------------------------------------------------------------------- /shorttext/metrics/embedfuzzy/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .jaccard import jaccardscore_sents -------------------------------------------------------------------------------- /shorttext/metrics/embedfuzzy/jaccard.py: -------------------------------------------------------------------------------- 1 | 2 | from itertools import product 3 | 4 | import numpy as np 5 | from scipy.spatial.distance import cosine 6 | 7 | from ...utils import tokenize 8 | 9 | 10 | def jaccardscore_sents(sent1, sent2, wvmodel, sim_words=lambda vec1, vec2: 1-cosine(vec1, vec2)): 11 | """ Compute the Jaccard score between sentences based on their word similarities. 12 | 13 | :param sent1: first sentence 14 | :param sent2: second sentence 15 | :param wvmodel: word-embeding model 16 | :param sim_words: function for calculating the similarities between a pair of word vectors (default: cosine) 17 | :return: soft Jaccard score 18 | :type sent1: str 19 | :type sent2: str 20 | :type wvmodel: gensim.models.keyedvectors.KeyedVectors 21 | :type sim_words: function 22 | :rtype: float 23 | """ 24 | tokens1 = tokenize(sent1) 25 | tokens2 = tokenize(sent2) 26 | tokens1 = list(filter(lambda w: w in wvmodel, tokens1)) 27 | tokens2 = list(filter(lambda w: w in wvmodel, tokens2)) 28 | allowable1 = [True] * len(tokens1) 29 | allowable2 = [True] * len(tokens2) 30 | 31 | simdict = {(i, j): sim_words(wvmodel[tokens1[i]], wvmodel[tokens2[j]]) 32 | for i, j in product(range(len(tokens1)), range(len(tokens2)))} 33 | 34 | intersection = 0.0 35 | simdictitems = sorted(simdict.items(), key=lambda s: s[1], reverse=True) 36 | for idxtuple, sim in simdictitems: 37 | i, j = idxtuple 38 | if allowable1[i] and allowable2[j]: 39 | intersection += sim 40 | allowable1[i] = False 41 | allowable2[j] = False 42 | 43 | union = len(tokens1) + len(tokens2) - intersection 44 | 45 | if union > 0: 46 | return intersection / union 47 | elif intersection == 0: 48 | return 1. 
49 | else: 50 | return np.inf 51 | -------------------------------------------------------------------------------- /shorttext/metrics/transformers/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .bertscore import BERTScorer 3 | -------------------------------------------------------------------------------- /shorttext/metrics/transformers/bertscore.py: -------------------------------------------------------------------------------- 1 | 2 | from itertools import product 3 | 4 | import numpy as np 5 | import torch 6 | from ...utils.transformers import WrappedBERTEncoder 7 | 8 | 9 | class BERTScorer: 10 | """ This is the class that compute the BERTScores between sentences. BERTScores 11 | include recall BERTScores, precision BERTScores, and F1 BERTSscores. 12 | For more information, please refer to this paper: 13 | 14 | Tianyi Zhang, Varsha Kishore, Felix Wu, Kilian Q. Weinberger, Yoav Artzi, 15 | "BERTScore: Evaluating Text Generation with BERT," arXiv:1904.09675 (2019). [`arXiv 16 | `_] 17 | 18 | """ 19 | def __init__( 20 | self, 21 | model=None, 22 | tokenizer=None, 23 | max_length=48, 24 | nbencodinglayers=4, 25 | device='cpu' 26 | ): 27 | """ It is the class that compute the BERTScores between sentences. 28 | 29 | :param model: BERT model (default: None, with model `bert-base-uncase` to be used) 30 | :param tokenizer: BERT tokenizer (default: None, with model `bert-base-uncase` to be used) 31 | :param max_length: maximum number of tokens of each sentence (default: 48) 32 | :param nbencodinglayers: number of encoding layers (taking the last layers to encode the sentences, default: 4) 33 | :param device: device the language model is stored (default: `cpu`) 34 | :type model: str 35 | :type tokenizer: str 36 | :type max_length: int 37 | :type device: str 38 | """ 39 | self.encoder = WrappedBERTEncoder( 40 | model=model, 41 | tokenizer=tokenizer, 42 | max_length=max_length, 43 | nbencodinglayers=nbencodinglayers, 44 | device=device) 45 | self.device = self.encoder.device 46 | self.cosine_fcn = torch.nn.CosineSimilarity(dim=0).to(self.device) 47 | 48 | def compute_matrix(self, sentence_a, sentence_b): 49 | """ Compute the table of similarities between all pairs of tokens. This is used 50 | for calculating the BERTScores. 51 | 52 | :param sentence_a: first sentence 53 | :param sentence_b: second sentence 54 | :return: similarity matrix of between tokens in two sentences 55 | :type sentence_a: str 56 | :type sentence_b: str 57 | :rtype: numpy.ndarray 58 | """ 59 | cos = self.cosine_fcn 60 | _, sentence_a_tokens_embeddings, sentence_a_tokens = self.encoder.encode_sentences([sentence_a]) 61 | _, sentence_b_tokens_embeddings, sentence_b_tokens = self.encoder.encode_sentences([sentence_b]) 62 | 63 | similarity_matrix = torch.zeros((len(sentence_a_tokens[0])-2, len(sentence_b_tokens[0])-2), 64 | device=self.device) 65 | 66 | for i, j in product(range(len(sentence_a_tokens[0])-2), range(len(sentence_b_tokens[0])-2)): 67 | similarity_matrix[i, j] = cos(sentence_a_tokens_embeddings[0][i+1], 68 | sentence_b_tokens_embeddings[0][j+1]) 69 | 70 | return similarity_matrix 71 | 72 | def recall_bertscore(self, reference_sentence, test_sentence): 73 | """ Compute the recall BERTScore between two sentences. 
74 | 75 | :param reference_sentence: reference sentence 76 | :param test_sentence: test sentence 77 | :return: recall BERTScore between the two sentences 78 | :type reference_sentence: str 79 | :type test_sentence: str 80 | :rtype: float 81 | """ 82 | similarity_matrix = self.compute_matrix(reference_sentence, test_sentence) 83 | recall = torch.mean(torch.max(similarity_matrix, dim=1).values) 84 | return float(recall.detach().cpu().numpy()) 85 | 86 | def precision_bertscore(self, reference_sentence, test_sentence): 87 | """ Compute the precision BERTScore between two sentences. 88 | 89 | :param reference_sentence: reference sentence 90 | :param test_sentence: test sentence 91 | :return: precision BERTScore between the two sentences 92 | :type reference_sentence: str 93 | :type test_sentence: str 94 | :rtype: float 95 | """ 96 | similarity_matrix = self.compute_matrix(reference_sentence, test_sentence) 97 | precision = torch.mean(torch.max(similarity_matrix, dim=0).values) 98 | return float(precision.detach().cpu().numpy()) 99 | 100 | def f1score_bertscore(self, reference_sentence, test_sentence): 101 | """ Compute the F1 BERTScore between two sentences. 102 | 103 | :param reference_sentence: reference sentence 104 | :param test_sentence: test sentence 105 | :return: F1 BERTScore between the two sentences 106 | :type reference_sentence: str 107 | :type test_sentence: str 108 | :rtype: float 109 | """ 110 | recall = self.recall_bertscore(reference_sentence, test_sentence) 111 | precision = self.precision_bertscore(reference_sentence, test_sentence) 112 | return 2*recall*precision/(recall+precision) 113 | -------------------------------------------------------------------------------- /shorttext/metrics/wasserstein/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .wordmoverdist import word_mover_distance_linprog, word_mover_distance -------------------------------------------------------------------------------- /shorttext/metrics/wasserstein/wordmoverdist.py: -------------------------------------------------------------------------------- 1 | 2 | from itertools import product 3 | import warnings 4 | 5 | import numpy as np 6 | from scipy.spatial.distance import euclidean 7 | from scipy.sparse import csr_matrix 8 | from scipy.optimize import linprog 9 | 10 | from ...utils.gensim_corpora import tokens_to_fracdict 11 | 12 | 13 | def word_mover_distance_linprog(first_sent_tokens, second_sent_tokens, wvmodel, distancefunc=euclidean): 14 | """ Compute the Word Mover's distance (WMD) between the two given lists of tokens, and return the LP problem class. 15 | 16 | Using methods of linear programming, supported by `scipy.optimize.linprog`, calculate the WMD between two lists of words. A word-embedding 17 | model has to be provided. The whole `scipy.optimize.OptimizeResult` object is returned. 18 | 19 | Reference: Matt J. Kusner, Yu Sun, Nicholas I. Kolkin, Kilian Q. Weinberger, "From Word Embeddings to Document Distances," *ICML* (2015). 20 | 21 | :param first_sent_tokens: first list of tokens. 22 | :param second_sent_tokens: second list of tokens. 23 | :param wvmodel: word-embedding models. 24 | :param distancefunc: distance function that takes two numpy ndarray. 
25 | :return: the whole result of the linear programming problem 26 | :type first_sent_tokens: list 27 | :type second_sent_tokens: list 28 | :type wvmodel: gensim.models.keyedvectors.KeyedVectors 29 | :type distancefunc: function 30 | :rtype: scipy.optimize.OptimizeResult 31 | """ 32 | nb_tokens_first_sent = len(first_sent_tokens) 33 | nb_tokens_second_sent = len(second_sent_tokens) 34 | 35 | all_tokens = list(set(first_sent_tokens+second_sent_tokens)) 36 | wordvecs = {token: wvmodel[token] for token in all_tokens} 37 | 38 | first_sent_buckets = tokens_to_fracdict(first_sent_tokens) 39 | second_sent_buckets = tokens_to_fracdict(second_sent_tokens) 40 | 41 | collapsed_idx_func = lambda i, j: i*nb_tokens_second_sent + j 42 | 43 | # assigning T 44 | T = np.zeros(nb_tokens_first_sent*nb_tokens_second_sent) 45 | for i, j in product(range(nb_tokens_first_sent), range(nb_tokens_second_sent)): 46 | T[collapsed_idx_func(i, j)] = distancefunc(wordvecs[first_sent_tokens[i]], 47 | wordvecs[second_sent_tokens[j]]) 48 | 49 | # assigning Aeq and beq 50 | Aeq = csr_matrix( 51 | (nb_tokens_first_sent+nb_tokens_second_sent, 52 | nb_tokens_first_sent*nb_tokens_second_sent) 53 | ) 54 | beq = np.zeros(nb_tokens_first_sent+nb_tokens_second_sent) 55 | for i in range(nb_tokens_first_sent): 56 | for j in range(nb_tokens_second_sent): 57 | Aeq[i, collapsed_idx_func(i, j)] = 1. 58 | beq[i] = first_sent_buckets[first_sent_tokens[i]] 59 | for j in range(nb_tokens_second_sent): 60 | for i in range(nb_tokens_first_sent): 61 | Aeq[j+nb_tokens_first_sent, collapsed_idx_func(i, j)] = 1. 62 | beq[j+nb_tokens_first_sent] = second_sent_buckets[second_sent_tokens[j]] 63 | 64 | return linprog(T, A_eq=Aeq, b_eq=beq) 65 | 66 | 67 | def word_mover_distance(first_sent_tokens, second_sent_tokens, wvmodel, distancefunc=euclidean, lpFile=None): 68 | """ Compute the Word Mover's distance (WMD) between the two given lists of tokens. 69 | 70 | Using methods of linear programming, calculate the WMD between two lists of words. A word-embedding 71 | model has to be provided. WMD is returned. 72 | 73 | Reference: Matt J. Kusner, Yu Sun, Nicholas I. Kolkin, Kilian Q. Weinberger, "From Word Embeddings to Document Distances," *ICML* (2015). 74 | 75 | :param first_sent_tokens: first list of tokens. 76 | :param second_sent_tokens: second list of tokens. 77 | :param wvmodel: word-embedding models. 78 | :param distancefunc: distance function that takes two numpy ndarray. 79 | :param lpFile: deprecated, kept for backward incompatibility. (default: None) 80 | :return: Word Mover's distance (WMD) 81 | :type first_sent_tokens: list 82 | :type second_sent_tokens: list 83 | :type wvmodel: gensim.models.keyedvectors.KeyedVectors 84 | :type distancefunc: function 85 | :type lpFile: str 86 | :rtype: float 87 | """ 88 | linprog_result = word_mover_distance_linprog(first_sent_tokens, second_sent_tokens, wvmodel, 89 | distancefunc=distancefunc) 90 | if lpFile is not None: 91 | warnings.warn('The parameter `lpFile` (value: {}) is not used; parameter is deprecated as ' + \ 92 | 'the package `pulp` is no longer used. 
Check your code if there is a dependency on ' + \ 93 | 'this parameter.') 94 | return linprog_result['fun'] 95 | -------------------------------------------------------------------------------- /shorttext/smartload.py: -------------------------------------------------------------------------------- 1 | 2 | from .utils import standard_text_preprocessor_1 3 | from .utils import compactmodel_io as cio 4 | from .utils import classification_exceptions as e 5 | from .utils import load_DocumentTermMatrix 6 | from .classifiers import load_varnnlibvec_classifier, load_sumword2vec_classifier 7 | from .generators import load_autoencoder_topicmodel, load_gensimtopicmodel 8 | from .generators import loadSeq2SeqWithKeras, loadCharBasedSeq2SeqGenerator 9 | from .classifiers import load_autoencoder_topic_sklearnclassifier, load_gensim_topicvec_sklearnclassifier 10 | from .classifiers import load_maxent_classifier 11 | from .spell import loadSCRNNSpellCorrector 12 | 13 | 14 | def smartload_compact_model(filename, wvmodel, preprocessor=standard_text_preprocessor_1(), vecsize=None): 15 | """ Load appropriate classifier or model from the binary model. 16 | 17 | The second parameter, `wvmodel`, can be set to `None` if no Word2Vec model is needed. 18 | 19 | :param filename: path of the compact model file 20 | :param wvmodel: Word2Vec model 21 | :param preprocessor: text preprocessor (Default: `shorttext.utils.textpreprocess.standard_text_preprocessor_1`) 22 | :param vecsize: length of embedded vectors in the model (Default: None, extracted directly from the word-embedding model) 23 | :return: appropriate classifier or model 24 | :raise: AlgorithmNotExistException 25 | :type filename: str 26 | :type wvmodel: gensim.models.keyedvectors.KeyedVectors 27 | :type preprocessor: function 28 | :type vecsize: int 29 | """ 30 | classifier_name = cio.get_model_classifier_name(filename) 31 | if classifier_name in ['ldatopic', 'lsitopic', 'rptopic']: 32 | return load_gensimtopicmodel(filename, preprocessor=preprocessor, compact=True) 33 | elif classifier_name in ['kerasautoencoder']: 34 | return load_autoencoder_topicmodel(filename, preprocessor=preprocessor, compact=True) 35 | elif classifier_name in ['topic_sklearn']: 36 | topicmodel = cio.get_model_config_field(filename, 'topicmodel') 37 | if topicmodel in ['ldatopic', 'lsitopic', 'rptopic']: 38 | return load_gensim_topicvec_sklearnclassifier(filename, preprocessor=preprocessor, compact=True) 39 | elif topicmodel in ['kerasautoencoder']: 40 | return load_autoencoder_topic_sklearnclassifier(filename, preprocessor=preprocessor, compact=True) 41 | else: 42 | raise e.AlgorithmNotExistException(topicmodel) 43 | elif classifier_name in ['nnlibvec']: 44 | return load_varnnlibvec_classifier(wvmodel, filename, compact=True, vecsize=vecsize) 45 | elif classifier_name in ['sumvec']: 46 | return load_sumword2vec_classifier(wvmodel, filename, compact=True, vecsize=vecsize) 47 | elif classifier_name in ['maxent']: 48 | return load_maxent_classifier(filename, compact=True) 49 | elif classifier_name in ['dtm']: 50 | return load_DocumentTermMatrix(filename, compact=True) 51 | elif classifier_name in ['kerasseq2seq']: 52 | return loadSeq2SeqWithKeras(filename, compact=True) 53 | elif classifier_name in ['charbases2s']: 54 | return loadCharBasedSeq2SeqGenerator(filename, compact=True) 55 | elif classifier_name in ['scrnn_spell']: 56 | return loadSCRNNSpellCorrector(filename, compact=True) 57 | else: 58 | raise e.AlgorithmNotExistException(classifier_name) 
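# Example usage sketch (comment only; illustrative and not part of the original module,
# with hypothetical placeholder file paths):
#
#     from shorttext.smartload import smartload_compact_model
#     from shorttext.utils import load_word2vec_model
#
#     # classifiers that need no word embeddings (e.g. 'maxent' or the topic models)
#     maxent_classifier = smartload_compact_model('my_maxent_classifier.bin', None)
#
#     # embedding-based classifiers ('nnlibvec', 'sumvec') need a word-embedding model
#     wvmodel = load_word2vec_model('/path/to/word2vec_model.bin')
#     sumvec_classifier = smartload_compact_model('my_sumvec_classifier.bin', wvmodel)
#     print(sumvec_classifier.score('quantum field theory'))   # dict of label -> score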
-------------------------------------------------------------------------------- /shorttext/spell/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .basespellcorrector import SpellCorrector 3 | 4 | from .norvig import NorvigSpellCorrector 5 | from .sakaguchi import SCRNNSpellCorrector, loadSCRNNSpellCorrector 6 | 7 | -------------------------------------------------------------------------------- /shorttext/spell/basespellcorrector.py: -------------------------------------------------------------------------------- 1 | 2 | from abc import ABC, abstractmethod 3 | 4 | from ..utils.classification_exceptions import NotImplementedException 5 | 6 | 7 | class SpellCorrector(ABC): 8 | """ Base class for all spell corrector. 9 | 10 | This class is not implemented; this is an "abstract class." 11 | 12 | """ 13 | @abstractmethod 14 | def train(self, text): 15 | """ Train the spell corrector with the given corpus. 16 | 17 | :param text: training corpus 18 | :type text: str 19 | """ 20 | raise NotImplementedException() 21 | 22 | @abstractmethod 23 | def correct(self, word): 24 | """ Recommend a spell correction to given the word. 25 | 26 | :param word: word to be checked 27 | :return: recommended correction 28 | :type word: str 29 | :rtype: str 30 | """ 31 | return word 32 | -------------------------------------------------------------------------------- /shorttext/spell/editor.py: -------------------------------------------------------------------------------- 1 | 2 | import numba as nb 3 | 4 | 5 | @nb.njit 6 | def compute_set_edits1(word): 7 | letters = 'abcdefghijklmnopqrstuvwxyz' 8 | 9 | splits = [(word[:i], word[i:]) for i in range(len(word) + 1)] 10 | deletes = [L + R[1:] for L, R in splits if R] 11 | transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1] 12 | replaces = [L + c + R[1:] for L, R in splits if R for c in letters] 13 | inserts = [L + c + R for L, R in splits for c in letters] 14 | 15 | returned_set = set(deletes + transposes + replaces + inserts) 16 | 17 | return returned_set 18 | 19 | 20 | @nb.njit 21 | def compute_set_edits2(word): 22 | return (e2 for e1 in compute_set_edits1(word) for e2 in compute_set_edits1(e1)) 23 | -------------------------------------------------------------------------------- /shorttext/spell/norvig.py: -------------------------------------------------------------------------------- 1 | 2 | # reference: https://norvig.com/spell-correct.html 3 | 4 | import re 5 | from collections import Counter 6 | 7 | from . import SpellCorrector 8 | from .editor import compute_set_edits1, compute_set_edits2 9 | 10 | 11 | class NorvigSpellCorrector(SpellCorrector): 12 | """ Spell corrector described by Peter Norvig in his blog. (https://norvig.com/spell-correct.html) 13 | 14 | """ 15 | def __init__(self): 16 | """ Instantiate the class 17 | 18 | """ 19 | self.train('') 20 | 21 | def train(self, text): 22 | """ Given the text, train the spell corrector. 23 | 24 | :param text: training corpus 25 | :type text: str 26 | """ 27 | self.words = re.findall('\\w+', text.lower()) 28 | self.WORDS = Counter(self.words) 29 | self.N = sum(self.WORDS.values()) 30 | 31 | def P(self, word): 32 | """ Compute the probability of the words randomly sampled from the training corpus. 
33 | 34 | :param word: a word 35 | :return: probability of the word sampled randomly in the corpus 36 | :type word: str 37 | :rtype: float 38 | """ 39 | return self.WORDS[word] / float(self.N) 40 | 41 | def correct(self, word): 42 | """ Recommend a spelling correction to the given word 43 | 44 | :param word: a word 45 | :return: recommended correction 46 | :type word: str 47 | :rtype: str 48 | """ 49 | return max(self.candidates(word), key=self.P) 50 | 51 | def known(self, words): 52 | """ Filter away the words that are not found in the training corpus. 53 | 54 | :param words: list of words 55 | :return: list of words that can be found in the training corpus 56 | :type words: list 57 | :rtype: list 58 | """ 59 | return set(w for w in words if w in self.WORDS) 60 | 61 | def candidates(self, word): 62 | """ List potential candidates for corrected spelling to the given words. 63 | 64 | :param word: a word 65 | :return: list of recommended corrections 66 | :type word: str 67 | :rtype: list 68 | """ 69 | return (self.known([word]) or self.known(compute_set_edits1(word)) or self.known(compute_set_edits2(word)) or [word]) 70 | 71 | -------------------------------------------------------------------------------- /shorttext/stack/__init__.py: -------------------------------------------------------------------------------- 1 | from .stacking import StackedGeneralization, LogisticStackedGeneralization -------------------------------------------------------------------------------- /shorttext/utils/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from . import misc 3 | from . import kerasmodel_io 4 | from . import classification_exceptions 5 | from . import gensim_corpora 6 | from . import textpreprocessing 7 | from . import compactmodel_io 8 | from . import dtm 9 | 10 | from .textpreprocessing import tokenize, stemword 11 | from .textpreprocessing import text_preprocessor, standard_text_preprocessor_1, standard_text_preprocessor_2 12 | 13 | from .wordembed import load_word2vec_model, load_fasttext_model, load_poincare_model, shorttext_to_avgvec 14 | from .wordembed import RESTfulKeyedVectors 15 | from .dtm import load_DocumentTermMatrix 16 | 17 | from .dtm import DocumentTermMatrix, load_DocumentTermMatrix 18 | 19 | from .transformers import WrappedBERTEncoder 20 | 21 | -------------------------------------------------------------------------------- /shorttext/utils/classification_exceptions.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | class ModelNotTrainedException(Exception): 6 | def __init__(self): 7 | self.message = 'Model not trained.' 8 | 9 | 10 | class AlgorithmNotExistException(Exception): 11 | def __init__(self, algoname): 12 | self.message = 'Algorithm '+algoname+' not exist.' 13 | 14 | 15 | class WordEmbeddingModelNotExistException(Exception): 16 | def __init__(self, path): 17 | self.message = 'Given path of the word-embedding model not exist: '+path 18 | 19 | 20 | class UnequalArrayLengthsException(Exception): 21 | def __init__(self, arr1, arr2): 22 | self.message = 'Unequal lengths: '+str(len(arr1))+" and "+str(len(arr2)) 23 | 24 | 25 | class NotImplementedException(Exception): 26 | def __init__(self): 27 | self.message = 'Method not implemented.' 
28 | 29 | 30 | class IncorrectClassificationModelFileException(Exception): 31 | def __init__(self, expectedname, actualname): 32 | self.message = 'Incorrect model (expected: '+expectedname+' ; actual: '+actualname+')' 33 | 34 | 35 | class OperationNotDefinedException(Exception): 36 | def __init__(self, opname): 37 | self.message = 'Operation '+opname+' not defined' 38 | -------------------------------------------------------------------------------- /shorttext/utils/dtm.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | from gensim.corpora import Dictionary 4 | from gensim.models import TfidfModel 5 | from scipy.sparse import dok_matrix 6 | 7 | import pickle 8 | 9 | from .compactmodel_io import CompactIOMachine 10 | from .classification_exceptions import NotImplementedException 11 | 12 | 13 | dtm_suffices = ['_docids.pkl', '_dictionary.dict', '_dtm.pkl'] 14 | 15 | class DocumentTermMatrix(CompactIOMachine): 16 | """ Document-term matrix for corpus. 17 | 18 | This is a class that handles the document-term matrix (DTM). With a given corpus, users can 19 | retrieve term frequency, document frequency, and total term frequency. Weighing using tf-idf 20 | can be applied. 21 | """ 22 | def __init__(self, corpus, docids=None, tfidf=False): 23 | """ Initialize the document-term matrix (DTM) class with a given corpus. 24 | 25 | If document IDs (docids) are given, they will be stored and output as appropriate. 26 | If not, the documents are indexed by numbers. 27 | 28 | Users can choose to weigh by tf-idf. The default is not to weigh. 29 | 30 | The corpus has to be a list of lists, with each of the inside lists containing all the tokens 31 | in each document. 32 | 33 | :param corpus: corpus. 34 | :param docids: list of designated document IDs. (Default: None) 35 | :param tfidf: whether to weigh using tf-idf. (Default: False) 36 | :type corpus: list 37 | :type docids: list 38 | :type tfidf: bool 39 | """ 40 | CompactIOMachine.__init__(self, {'classifier': 'dtm'}, 'dtm', dtm_suffices) 41 | if docids is None: 42 | self.docid_dict = {i: i for i in range(len(corpus))} 43 | self.docids = list(range(len(corpus))) 44 | else: 45 | if len(docids) == len(corpus): 46 | self.docid_dict = {docid: i for i, docid in enumerate(docids)} 47 | self.docids = docids 48 | elif len(docids) > len(corpus): 49 | self.docid_dict = {docid: i for i, docid in zip(range(len(corpus)), docids[:len(corpus)])} 50 | self.docids = docids[:len(corpus)] 51 | else: 52 | self.docid_dict = {docid: i for i, docid in enumerate(docids)} 53 | self.docid_dict.update({i: i for i in range(len(docids), len(corpus))}) 54 | self.docids = list(docids) + list(range(len(docids), len(corpus))) 55 | # generate DTM 56 | self.generate_dtm(corpus, tfidf=tfidf) 57 | 58 | def generate_dtm(self, corpus, tfidf=False): 59 | """ Generate the inside document-term matrix and other peripheral information 60 | objects. This is run when the class is instantiated. 61 | 62 | :param corpus: corpus. 63 | :param tfidf: whether to weigh using tf-idf. 
(Default: False) 64 | :return: None 65 | :type corpus: list 66 | :type tfidf: bool 67 | """ 68 | self.dictionary = Dictionary(corpus) 69 | self.dtm = dok_matrix((len(corpus), len(self.dictionary)), dtype=np.float_) 70 | bow_corpus = [self.dictionary.doc2bow(doctokens) for doctokens in corpus] 71 | if tfidf: 72 | weighted_model = TfidfModel(bow_corpus) 73 | bow_corpus = weighted_model[bow_corpus] 74 | for docid in self.docids: 75 | for tokenid, count in bow_corpus[self.docid_dict[docid]]: 76 | self.dtm[self.docid_dict[docid], tokenid] = count 77 | 78 | def get_termfreq(self, docid, token): 79 | """ Retrieve the term frequency of a given token in a particular document. 80 | 81 | Given a token and a particular document ID, compute the term frequency for this 82 | token. If `tfidf` is set to `True` while instantiating the class, it returns the weighted 83 | term frequency. 84 | 85 | :param docid: document ID 86 | :param token: term or token 87 | :return: term frequency or weighted term frequency of the given token in this document (designated by docid) 88 | :type docid: any 89 | :type token: str 90 | :rtype: numpy.float 91 | """ 92 | return self.dtm[self.docid_dict[docid], self.dictionary.token2id[token]] 93 | 94 | def get_total_termfreq(self, token): 95 | """ Retrieve the total occurrences of the given token. 96 | 97 | Compute the total occurrences of the term in all documents. If `tfidf` is set to `True` 98 | while instantiating the class, it returns the sum of weighted term frequency. 99 | 100 | :param token: term or token 101 | :return: total occurrences of the given token 102 | :type token: str 103 | :rtype: numpy.float 104 | """ 105 | return sum(self.dtm[:, self.dictionary.token2id[token]].values()) 106 | 107 | def get_doc_frequency(self, token): 108 | """ Retrieve the document frequency of the given token. 109 | 110 | Compute the document frequency of the given token, i.e., the number of documents 111 | that this token can be found. 112 | 113 | :param token: term or token 114 | :return: document frequency of the given token 115 | :type token: str 116 | :rtype: int 117 | """ 118 | return len(self.dtm[:, self.dictionary.token2id[token]].values()) 119 | 120 | def get_token_occurences(self, token): 121 | """ Retrieve the term frequencies of a given token in all documents. 122 | 123 | Compute the term frequencies of the given token for all the documents. If `tfidf` is 124 | set to be `True` while instantiating the class, it returns the weighted term frequencies. 125 | 126 | This method returns a dictionary of term frequencies with the corresponding document IDs 127 | as the keys. 128 | 129 | :param token: term or token 130 | :return: a dictionary of term frequencies with the corresponding document IDs as the keys 131 | :type token: str 132 | :rtype: dict 133 | """ 134 | return {self.docids[docidx]: count for (docidx, _), count in self.dtm[:, self.dictionary.token2id[token]].items()} 135 | 136 | def get_doc_tokens(self, docid): 137 | """ Retrieve the term frequencies of all tokens in the given document. 138 | 139 | Compute the term frequencies of all tokens for the given document. If `tfidf` is 140 | set to be `True` while instantiating the class, it returns the weighted term frequencies. 141 | 142 | This method returns a dictionary of term frequencies with the tokens as the keys. 
143 | 144 | :param docid: document ID 145 | :return: a dictionary of term frequencies with the tokens as the keys 146 | :type docid: any 147 | :rtype: dict 148 | """ 149 | return {self.dictionary[tokenid]: count for (_, tokenid), count in self.dtm[self.docid_dict[docid], :].items()} 150 | 151 | def generate_dtm_dataframe(self): 152 | """ Generate the data frame of the document-term matrix. (shorttext <= 1.0.3) 153 | 154 | Now it raises exception. 155 | 156 | :return: data frame of the document-term matrix 157 | :rtype: pandas.DataFrame 158 | :raise: NotImplementedException 159 | """ 160 | raise NotImplementedException() 161 | 162 | def savemodel(self, prefix): 163 | """ Save the model. 164 | 165 | :param prefix: prefix of the files 166 | :return: None 167 | :type prefix: str 168 | """ 169 | pickle.dump(self.docids, open(prefix+'_docids.pkl', 'wb')) 170 | self.dictionary.save(prefix+'_dictionary.dict') 171 | pickle.dump(self.dtm, open(prefix+'_dtm.pkl', 'wb')) 172 | 173 | def loadmodel(self, prefix): 174 | """ Load the model. 175 | 176 | :param prefix: prefix of the files 177 | :return: None 178 | :type prefix: str 179 | """ 180 | self.docids = pickle.load(open(prefix+'_docids.pkl', 'rb')) 181 | self.docid_dict = {docid: i for i, docid in enumerate(self.docids)} 182 | self.dictionary = Dictionary.load(prefix+'_dictionary.dict') 183 | self.dtm = pickle.load(open(prefix+'_dtm.pkl', 'rb')) 184 | 185 | 186 | def load_DocumentTermMatrix(filename, compact=True): 187 | """ Load presaved Document-Term Matrix (DTM). 188 | 189 | Given the file name (if `compact` is `True`) or the prefix (if `compact` is `False`), 190 | return the document-term matrix. 191 | 192 | :param filename: file name or prefix 193 | :param compact: whether it is a compact model. (Default: `True`) 194 | :return: document-term matrix 195 | :type filename: str 196 | :type compact: bool 197 | :rtype: DocumentTermMatrix 198 | """ 199 | dtm = DocumentTermMatrix([[]]) 200 | if compact: 201 | dtm.load_compact_model(filename) 202 | else: 203 | dtm.loadmodel(filename) 204 | return dtm -------------------------------------------------------------------------------- /shorttext/utils/gensim_corpora.py: -------------------------------------------------------------------------------- 1 | 2 | from collections import defaultdict 3 | 4 | import gensim 5 | 6 | from .textpreprocessing import tokenize 7 | 8 | 9 | def generate_gensim_corpora(classdict, preprocess_and_tokenize=tokenize): 10 | """ Generate gensim bag-of-words dictionary and corpus. 11 | 12 | Given a text data, a dict with keys being the class labels, and the values 13 | being the list of short texts, in the same format output by `shorttext.data.data_retrieval`, 14 | return a gensim dictionary and corpus. 
15 | 16 | :param classdict: text data, a dict with keys being the class labels, and each value is a list of short texts 17 | :param proprocess_and_tokenize: preprocessor function, that takes a short sentence, and return a list of tokens (Default: `shorttext.utils.tokenize`) 18 | :return: a tuple, consisting of a gensim dictionary, a corpus, and a list of class labels 19 | :type classdict: dict 20 | :type proprocess_and_tokenize: function 21 | :rtype: (gensim.corpora.Dictionary, list, list) 22 | """ 23 | classlabels = sorted(classdict.keys()) 24 | doc = [preprocess_and_tokenize(' '.join(classdict[classlabel])) for classlabel in classlabels] 25 | dictionary = gensim.corpora.Dictionary(doc) 26 | corpus = [dictionary.doc2bow(doctokens) for doctokens in doc] 27 | return dictionary, corpus, classlabels 28 | 29 | 30 | def save_corpus(dictionary, corpus, prefix): 31 | """ Save gensim corpus and dictionary. 32 | 33 | :param dictionary: dictionary to save 34 | :param corpus: corpus to save 35 | :param prefix: prefix of the files to save 36 | :return: None 37 | :type dictionary: gensim.corpora.Dictionary 38 | :type corpus: list 39 | :type prefix: str 40 | """ 41 | dictionary.save(prefix+'_dictionary.dict') 42 | gensim.corpora.MmCorpus.serialize(prefix+'_corpus.mm', corpus) 43 | 44 | 45 | def load_corpus(prefix): 46 | """ Load gensim corpus and dictionary. 47 | 48 | :param prefix: prefix of the file to load 49 | :return: corpus and dictionary 50 | :type prefix: str 51 | :rtype: tuple 52 | """ 53 | corpus = gensim.corpora.MmCorpus(prefix+'_corpus.mm') 54 | dictionary = gensim.corpora.Dictionary.load(prefix+'_dictionary.dict') 55 | return corpus, dictionary 56 | 57 | 58 | def update_corpus_labels(dictionary, corpus, newclassdict, preprocess_and_tokenize=tokenize): 59 | """ Update corpus with additional training data. 60 | 61 | With the additional training data, the dictionary and corpus are updated. 62 | 63 | :param dictionary: original dictionary 64 | :param corpus: original corpus 65 | :param newclassdict: additional training data 66 | :param preprocess_and_tokenize: preprocessor function, that takes a short sentence, and return a list of tokens (Default: `shorttext.utils.tokenize`) 67 | :return: a tuple, an updated corpus, and the new corpus (for updating model) 68 | :type dictionary: gensim.corpora.Dictionary 69 | :type corpus: list 70 | :type newclassdict: dict 71 | :type preprocess_and_tokenize: function 72 | :rtype: tuple 73 | """ 74 | 75 | newdoc = [preprocess_and_tokenize(' '.join(newclassdict[classlabel])) for classlabel in sorted(newclassdict.keys())] 76 | newcorpus = [dictionary.doc2bow(doctokens) for doctokens in newdoc] 77 | corpus += newcorpus 78 | 79 | return corpus, newcorpus 80 | 81 | 82 | def tokens_to_fracdict(tokens): 83 | """ Return normalized bag-of-words (BOW) vectors. 84 | 85 | :param tokens: list of tokens. 86 | :type tokens: list 87 | :return: normalized vectors of counts of tokens as a `dict` 88 | :rtype: dict 89 | """ 90 | cntdict = defaultdict(lambda : 0) 91 | for token in tokens: 92 | cntdict[token] += 1 93 | totalcnt = sum(cntdict.values()) 94 | return {token: float(cnt)/totalcnt for token, cnt in cntdict.items()} -------------------------------------------------------------------------------- /shorttext/utils/kerasmodel_io.py: -------------------------------------------------------------------------------- 1 | 2 | from tensorflow.keras.models import model_from_json 3 | 4 | 5 | def save_model(nameprefix, model): 6 | """ Save a keras sequential model into files. 
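# A minimal round-trip sketch for save_model/load_model; the tiny keras model and the
# file prefix '/tmp/tiny_model' are invented for illustration.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense

tiny_model = Sequential([Input(shape=(8,)), Dense(4, activation='relu'), Dense(2, activation='softmax')])
save_model('/tmp/tiny_model', tiny_model)        # writes /tmp/tiny_model.json and /tmp/tiny_model.weights.h5
reloaded_model = load_model('/tmp/tiny_model')   # rebuilds the architecture and reloads the weights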
7 | 8 | Given a keras sequential model, save the model with the given file path prefix. 9 | It saves the model into a JSON file, and an HDF5 file (.h5). 10 | 11 | :param nameprefix: Prefix of the paths of the model files 12 | :param model: keras sequential model to be saved 13 | :return: None 14 | :type nameprefix: str 15 | :type model: keras.models.Model 16 | """ 17 | model_json = model.to_json() 18 | open(nameprefix+'.json', 'w').write(model_json) 19 | model.save_weights(nameprefix+'.weights.h5') 20 | 21 | 22 | def load_model(nameprefix): 23 | """ Load a keras sequential model from files. 24 | 25 | Given the prefix of the file paths, load a keras sequential model from 26 | a JSON file and an HDF5 file. 27 | 28 | :param nameprefix: Prefix of the paths of the model files 29 | :return: keras sequential model 30 | :type nameprefix: str 31 | :rtype: keras.models.Model 32 | """ 33 | model = model_from_json(open(nameprefix+'.json', 'r').read()) 34 | model.load_weights(nameprefix+'.weights.h5') 35 | return model -------------------------------------------------------------------------------- /shorttext/utils/misc.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | def textfile_generator(textfile, linebreak=True, encoding=None): 4 | """ Return a generator that reads lines in a text file. 5 | 6 | :param textfile: file object of a text file 7 | :param linebreak: whether to return a line break at the end of each line (Default: True) 8 | :param encoding: encoding of the text file (Default: None) 9 | :return: a generator that reads lines in a text file 10 | :type textfile: file 11 | :type linebreak: bool 12 | :type encoding: str 13 | :rtype: generator 14 | """ 15 | for t in textfile: 16 | if len(t) > 0: 17 | if encoding is None: 18 | yield t.strip() + ('\n' if linebreak else '') 19 | else: 20 | yield t.decode(encoding).strip() + ('\n' if linebreak else '') 21 | 22 | 23 | class SinglePoolExecutor: 24 | """ It is a wrapper for Python `map` functions. 25 | 26 | """ 27 | def map(self, func, *iterables): 28 | """ Refer to Python `map` documentation. 29 | 30 | :param func: function 31 | :param iterables: iterables to loop 32 | :return: generator for the map 33 | :type func: function 34 | :type iterables: iterables 35 | :rtype: map 36 | """ 37 | return map(func, *iterables) 38 | -------------------------------------------------------------------------------- /shorttext/utils/textpreprocessing.py: -------------------------------------------------------------------------------- 1 | 2 | import re 3 | import os 4 | import codecs 5 | 6 | import snowballstemmer 7 | 8 | # tokenizer 9 | def tokenize(s: str) -> list[str]: 10 | return s.split(' ') 11 | 12 | 13 | # stemmer 14 | class StemmerSingleton: 15 | def __new__(cls): 16 | if not hasattr(cls, 'instance'): 17 | cls.instance = super(StemmerSingleton, cls).__new__(cls) 18 | cls.stemmer = snowballstemmer.stemmer('english') 19 | return cls.instance 20 | 21 | def __call__(cls, s: str) -> str: 22 | return cls.stemmer.stemWord(s) 23 | 24 | def stemword(s: str) -> str: 25 | return StemmerSingleton()(s) 26 | 27 | 28 | def preprocess_text(text, pipeline): 29 | """ Preprocess the text according to the given pipeline. 30 | 31 | Given the pipeline, which is a list of functions that process an 32 | input text to another text (e.g., stemming, lemmatizing, removing punctuations etc.), 33 | preprocess the text. 
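# A minimal pipeline sketch: each stage is a str -> str function, applied left to right.
# The two lambdas below are invented for illustration.
toy_pipeline = [lambda s: s.lower(), lambda s: s.replace('-', ' ')]
preprocess_text('Short-Text Mining', toy_pipeline)    # -> 'short text mining'
toy_preprocessor = text_preprocessor(toy_pipeline)    # text_preprocessor (defined below) wraps the same pipeline
toy_preprocessor('Short-Text Mining')                 # -> 'short text mining'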
34 | 35 | :param text: text to be preprocessed 36 | :param pipeline: a list of functions that convert a text to another text 37 | :return: preprocessed text 38 | :type text: str 39 | :type pipeline: list 40 | :rtype: str 41 | """ 42 | return text if len(pipeline)==0 else preprocess_text(pipeline[0](text), pipeline[1:]) 43 | 44 | 45 | def text_preprocessor(pipeline): 46 | """ Return the function that preprocesses text according to the pipeline. 47 | 48 | Given the pipeline, which is a list of functions that process an 49 | input text to another text (e.g., stemming, lemmatizing, removing punctuations etc.), 50 | return a function that preprocesses an input text outlined by the pipeline, essentially 51 | a function that runs :func:`~preprocess_text` with the specified pipeline. 52 | 53 | :param pipeline: a list of functions that convert a text to another text 54 | :return: a function that preprocesses text according to the pipeline 55 | :type pipeline: list 56 | :rtype: function 57 | """ 58 | return lambda text: preprocess_text(text, pipeline) 59 | 60 | 61 | def oldschool_standard_text_preprocessor(stopwordsfile): 62 | """ Return a commonly used text preprocessor. 63 | 64 | Return a text preprocessor that is commonly used, with the following steps: 65 | 66 | - removing special characters, 67 | - removing numerals, 68 | - converting all alphabets to lower cases, 69 | - removing stop words, and 70 | - stemming the words (using Porter stemmer). 71 | 72 | This function calls :func:`~text_preprocessor`. 73 | 74 | :param stopwordsfile: file object of the list of stop words 75 | :type stopwordsfile: file 76 | :return: a function that preprocesses text according to the pipeline 77 | :rtype: function 78 | """ 79 | # load stop words file 80 | stopwordset = set([stopword.strip() for stopword in stopwordsfile]) 81 | stopwordsfile.close() 82 | 83 | # the pipeline 84 | pipeline = [lambda s: re.sub('[^\w\s]', '', s), 85 | lambda s: re.sub('[\d]', '', s), 86 | lambda s: s.lower(), 87 | lambda s: ' '.join(filter(lambda s: not (s in stopwordset), tokenize(s))), 88 | lambda s: ' '.join([stemword(stemmed_token) for stemmed_token in tokenize(s)]) 89 | ] 90 | return text_preprocessor(pipeline) 91 | 92 | 93 | def standard_text_preprocessor_1(): 94 | """ Return a commonly used text preprocessor. 95 | 96 | Return a text preprocessor that is commonly used, with the following steps: 97 | 98 | - removing special characters, 99 | - removing numerals, 100 | - converting all alphabets to lower cases, 101 | - removing stop words (NLTK list), and 102 | - stemming the words (using Porter stemmer). 103 | 104 | This function calls :func:`~oldschool_standard_text_preprocessor`. 105 | 106 | :return: a function that preprocesses text according to the pipeline 107 | :rtype: function 108 | """ 109 | # load stop words 110 | this_dir, _ = os.path.split(__file__) 111 | stopwordsfile = codecs.open(os.path.join(this_dir, 'stopwords.txt'), 'r', 'utf-8') 112 | 113 | return oldschool_standard_text_preprocessor(stopwordsfile) 114 | 115 | 116 | def standard_text_preprocessor_2(): 117 | """ Return a commonly used text preprocessor. 118 | 119 | Return a text preprocessor that is commonly used, with the following steps: 120 | 121 | - removing special characters, 122 | - removing numerals, 123 | - converting all alphabets to lower cases, 124 | - removing stop words (NLTK list minus negation terms), and 125 | - stemming the words (using Porter stemmer). 126 | 127 | This function calls :func:`~oldschool_standard_text_preprocessor`. 
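# Behaviour of the two standard preprocessors, as exercised in test/test_textpreprocessing.py:
import shorttext
p1 = shorttext.utils.standard_text_preprocessor_1()
p2 = shorttext.utils.standard_text_preprocessor_2()
p1('I do not think.')   # -> 'think'      (negation words removed along with the other stop words)
p2('I do not think.')   # -> 'not think'  (negation words kept)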
128 | 129 | :return: a function that preprocesses text according to the pipeline 130 | :rtype: function 131 | """ 132 | # load stop words 133 | this_dir, _ = os.path.split(__file__) 134 | stopwordsfile = codecs.open(os.path.join(this_dir, 'nonneg_stopwords.txt'), 'r', 'utf-8') 135 | 136 | return oldschool_standard_text_preprocessor(stopwordsfile) 137 | -------------------------------------------------------------------------------- /shorttext/utils/transformers.py: -------------------------------------------------------------------------------- 1 | 2 | # reference: https://towardsdatascience.com/word-embeddings-in-2020-review-with-code-examples-11eb39a1ee6d 3 | 4 | import warnings 5 | 6 | import numpy as np 7 | import torch 8 | from transformers import BertTokenizer, BertModel 9 | 10 | 11 | class BERTObject: 12 | """ The base class for BERT model that contains the embedding model and the tokenizer. 13 | 14 | For more information, please refer to the following paper: 15 | 16 | Jacob Devlin, Ming-Wei Chang, Kenton Lee, Kristina Toutanova, "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding," arXiv:1810.04805 (2018). [`arXiv 17 | `_] 18 | 19 | """ 20 | def __init__(self, model=None, tokenizer=None, trainable=False, device='cpu'): 21 | """ The base class for BERT model that contains the embedding model and the tokenizer. 22 | 23 | :param model: BERT model (default: None, with model `bert-base-uncase` to be used) 24 | :param tokenizer: BERT tokenizer (default: None, with model `bert-base-uncase` to be used) 25 | :param device: device the language model is stored (default: `cpu`) 26 | :type model: str 27 | :type tokenizer: str 28 | :type device: str 29 | """ 30 | if device == 'cuda': 31 | if torch.cuda.is_available(): 32 | self.device = torch.device('cuda') 33 | else: 34 | warnings.warn("CUDA is not available. Device set to 'cpu'.") 35 | self.device = torch.device('cpu') 36 | else: 37 | self.device = torch.device(device) 38 | 39 | self.trainable = trainable 40 | 41 | if model is None: 42 | self.model = BertModel.from_pretrained('bert-base-uncased', 43 | output_hidden_states=True)\ 44 | .to(self.device) 45 | else: 46 | self.model = model.to(self.device) 47 | 48 | if self.trainable: 49 | self.model.train() 50 | 51 | if tokenizer is None: 52 | self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True) 53 | else: 54 | self.tokenizer = tokenizer 55 | 56 | self.number_hidden_layers = self.model.config.num_hidden_layers 57 | 58 | 59 | class WrappedBERTEncoder(BERTObject): 60 | """ This is the class that encodes sentences with BERT models. 61 | 62 | For more information, please refer to the following paper: 63 | 64 | Jacob Devlin, Ming-Wei Chang, Kenton Lee, Kristina Toutanova, "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding," arXiv:1810.04805 (2018). [`arXiv 65 | `_] 66 | 67 | """ 68 | def __init__( 69 | self, 70 | model=None, 71 | tokenizer=None, 72 | max_length=48, 73 | nbencodinglayers=4, 74 | trainable=False, 75 | device='cpu' 76 | ): 77 | """ This is the constructor of the class that encodes sentences with BERT models. 
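# A minimal encoding sketch using the defaults documented below (bert-base-uncased,
# max_length=48, CPU); the example sentence is invented, and the first call downloads
# the pretrained weights.
encoder = WrappedBERTEncoder()
sent_vecs, token_vecs, tokens = encoder.encode_sentences(
    ['Short text mining is fun.'], numpy=True)
# For bert-base-uncased, sent_vecs should have shape (1, 768) and token_vecs
# (1, 48, 3072), i.e. the last four hidden layers concatenated per token;
# tokens holds the corresponding WordPiece token lists.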
78 | 79 | :param model: BERT model (default: None, with model `bert-base-uncase` to be used) 80 | :param tokenizer: BERT tokenizer (default: None, with model `bert-base-uncase` to be used) 81 | :param max_length: maximum number of tokens of each sentence (default: 48) 82 | :param nbencodinglayers: number of encoding layers (taking the last layers to encode the sentences, default: 4) 83 | :param device: device the language model is stored (default: `cpu`) 84 | :type model: str 85 | :type tokenizer: str 86 | :type max_length: int 87 | :type device: str 88 | """ 89 | super(WrappedBERTEncoder, self).__init__( 90 | model=model, 91 | tokenizer=tokenizer, 92 | trainable=trainable, 93 | device=device 94 | ) 95 | self.max_length = max_length 96 | self.nbencodinglayers = nbencodinglayers 97 | 98 | def encode_sentences(self, sentences, numpy=False): 99 | """ Encode the sentences into numerical vectors, given by a list of strings. 100 | 101 | It can output either torch tensors or numpy arrays. 102 | 103 | :param sentences: list of strings to encode 104 | :param numpy: output a numpy array if `True`; otherwise, output a torch tensor. (Default: `False`) 105 | :return: encoded vectors for the sentences 106 | :type sentences: list 107 | :type numpy: bool 108 | :rtype: numpy.array or torch.Tensor 109 | """ 110 | input_ids = [] 111 | tokenized_texts = [] 112 | 113 | for sentence in sentences: 114 | marked_text = '[CLS]' + sentence + '[SEP]' 115 | 116 | encoded_dict = self.tokenizer.encode_plus( 117 | sentence, 118 | add_special_tokens=True, 119 | truncation=True, 120 | max_length=self.max_length, 121 | padding='max_length', 122 | return_tensors='pt' 123 | ) 124 | 125 | tokenized_texts.append(self.tokenizer.tokenize(marked_text)) 126 | input_ids.append(encoded_dict['input_ids']) 127 | 128 | input_ids = torch.cat(input_ids, dim=0) 129 | segments_id = torch.LongTensor(np.array(input_ids > 0)) 130 | input_ids = input_ids.to(self.device) 131 | segments_id = segments_id.to(self.device) 132 | 133 | if self.trainable: 134 | output = self.model(input_ids, segments_id) 135 | sentences_embeddings = output[1] 136 | hidden_state = output[2] 137 | else: 138 | with torch.no_grad(): 139 | output = self.model(input_ids, segments_id) 140 | sentences_embeddings = output[1] 141 | hidden_state = output[2] 142 | 143 | alllayers_token_embeddings = torch.stack(hidden_state, dim=0) 144 | alllayers_token_embeddings = alllayers_token_embeddings.permute(1, 2, 0, 3) # swap dimensions to [sentence, tokens, hidden layers, features] 145 | processed_embeddings = alllayers_token_embeddings[:, :, (self.number_hidden_layers+1-self.nbencodinglayers):, :] 146 | 147 | token_embeddings = torch.reshape(processed_embeddings, (len(sentences), self.max_length, -1)) 148 | 149 | if numpy: 150 | sentences_embeddings = sentences_embeddings.detach().numpy() 151 | token_embeddings = token_embeddings.detach().numpy() 152 | 153 | return sentences_embeddings, token_embeddings, tokenized_texts 154 | -------------------------------------------------------------------------------- /shorttext/utils/wordembed.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import gensim 4 | from gensim.models import KeyedVectors 5 | from gensim.models.keyedvectors import KeyedVectors 6 | from gensim.models.poincare import PoincareModel, PoincareKeyedVectors 7 | import requests 8 | 9 | from .textpreprocessing import tokenize 10 | 11 | 12 | def load_word2vec_model(path, binary=True): 13 | """ Load a pre-trained 
Word2Vec model. 14 | 15 | :param path: path of the file of the pre-trained Word2Vec model 16 | :param binary: whether the file is in binary format (Default: True) 17 | :return: a pre-trained Word2Vec model 18 | :type path: str 19 | :type binary: bool 20 | :rtype: gensim.models.keyedvectors.KeyedVectors 21 | """ 22 | return KeyedVectors.load_word2vec_format(path, binary=binary) 23 | 24 | 25 | def load_fasttext_model(path, encoding='utf-8'): 26 | """ Load a pre-trained FastText model. 27 | 28 | :param path: path of the file of the pre-trained FastText model 29 | :return: a pre-trained FastText model 30 | :type path: str 31 | :rtype: gensim.models.keyedvectors.FastTextKeyedVectors 32 | """ 33 | return gensim.models.fasttext.load_facebook_vectors(path, encoding=encoding) 34 | 35 | 36 | def load_poincare_model(path, word2vec_format=True, binary=False): 37 | """ Load a Poincare embedding model. 38 | 39 | :param path: path of the file of the pre-trained Poincare embedding model 40 | :param word2vec_format: whether to load from word2vec format (default: True) 41 | :param binary: binary format (default: False) 42 | :return: a pre-trained Poincare embedding model 43 | :type path: str 44 | :type word2vec_format: bool 45 | :type binary: bool 46 | :rtype: gensim.models.poincare.PoincareKeyedVectors 47 | """ 48 | if word2vec_format: 49 | return PoincareKeyedVectors.load_word2vec_format(path, binary=binary) 50 | else: 51 | return PoincareModel.load(path).kv 52 | 53 | 54 | def shorttext_to_avgvec(shorttext, wvmodel): 55 | """ Convert the short text into an averaged embedded vector representation. 56 | 57 | Given a short sentence, it converts all the tokens into embedded vectors according to 58 | the given word-embedding model, sums 59 | them up, and normalize the resulting vector. It returns the resulting vector 60 | that represents this short sentence. 61 | 62 | :param shorttext: a short sentence 63 | :param wvmodel: word-embedding model 64 | :return: an embedded vector that represents the short sentence 65 | :type shorttext: str 66 | :type wvmodel: gensim.models.keyedvectors.KeyedVectors 67 | :rtype: numpy.ndarray 68 | """ 69 | vec = np.sum( 70 | [ 71 | wvmodel[token] 72 | if token in wvmodel 73 | else np.array([1.]*wvmodel.vector_size) / np.sqrt(wvmodel.vector_size) 74 | for token in tokenize(shorttext) 75 | ], 76 | axis=0 77 | ) 78 | 79 | # normalize 80 | norm = np.linalg.norm(vec) 81 | if norm != 0: 82 | vec /= norm 83 | 84 | return vec 85 | 86 | 87 | class RESTfulKeyedVectors(KeyedVectors): 88 | """ RESTfulKeyedVectors, for connecting to the API of the preloaded word-embedding vectors loaded 89 | by `WordEmbedAPI`. 90 | 91 | This class inherits from :class:`gensim.models.keyedvectors.KeyedVectors`. 92 | 93 | """ 94 | def __init__(self, url, port='5000'): 95 | """ Initialize the class. 
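# A minimal client sketch, assuming a WordEmbedAPI service is already running at
# http://localhost:5000 with a word-embedding model preloaded; the queried words
# are invented for illustration.
wv_client = RESTfulKeyedVectors('http://localhost', port='5000')
wv_client.similarity('apple', 'orange')   # POSTs to the /similarity endpoint
wv_client.get_vector('apple')             # numpy array, or KeyError if the token is unknown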
96 | 97 | :param url: URL of the API, usually `http://localhost` 98 | :param port: Port number 99 | :type url: str 100 | :type port: str 101 | """ 102 | self.url = url 103 | self.port = port 104 | 105 | def closer_than(self, entity1, entity2): 106 | """ 107 | 108 | :param entity1: word 1 109 | :param entity2: word 2 110 | :type entity1: str 111 | :type entity2: str 112 | :return: list of words 113 | :rtype: list 114 | """ 115 | r = requests.post(self.url + ':' + self.port + '/closerthan', 116 | json={'entity1': entity1, 'entity2': entity2}) 117 | return r.json() 118 | 119 | def distance(self, entity1, entity2): 120 | """ 121 | 122 | :param entity1: word 1 123 | :param entity2: word 2 124 | :type entity1: str 125 | :type entity2: str 126 | :return: distance between two words 127 | :rtype: float 128 | """ 129 | r = requests.post(self.url + ':' + self.port + '/distance', 130 | json={'entity1': entity1, 'entity2': entity2}) 131 | return r.json()['distance'] 132 | 133 | def distances(self, entity1, other_entities=()): 134 | """ 135 | 136 | :param entity1: word 137 | :param other_entities: list of words 138 | :type entity1: str 139 | :type other_entities: list 140 | :return: list of distances between `entity1` and each word in `other_entities` 141 | :rtype: list 142 | """ 143 | r = requests.post(self.url + ':' + self.port + '/distances', 144 | json={'entity1': entity1, 'other_entities': other_entities}) 145 | return np.array(r.json()['distances'], dtype=np.float32) 146 | 147 | def get_vector(self, entity): 148 | """ 149 | 150 | :param entity: word 151 | :type: str 152 | :return: word vectors of the given word 153 | :rtype: numpy.ndarray 154 | """ 155 | r = requests.post(self.url + ':' + self.port + '/get_vector', json={'token': entity}) 156 | returned_dict = r.json() 157 | if 'vector' in returned_dict: 158 | return np.array(returned_dict['vector']) 159 | else: 160 | raise KeyError('The token {} does not exist in the model.'.format(entity)) 161 | 162 | def most_similar(self, **kwargs): 163 | """ 164 | 165 | :param kwargs: 166 | :return: 167 | """ 168 | r = requests.post(self.url + ':' + self.port + '/most_similar', json=kwargs) 169 | return [tuple(pair) for pair in r.json()] 170 | 171 | def most_similar_to_given(self, entity1, entities_list): 172 | """ 173 | 174 | :param entity1: word 175 | :param entities_list: list of words 176 | :type entity1: str 177 | :type entities_list: list 178 | :return: list of similarities between the given word and each word in `entities_list` 179 | :rtype: list 180 | """ 181 | r = requests.post(self.url + ':' + self.port + '/most_similar_to_given', 182 | json={'entity1': entity1, 'entities_list': entities_list}) 183 | return r.json()['token'] 184 | 185 | def rank(self, entity1, entity2): 186 | """ 187 | 188 | :param entity1: word 1 189 | :param entity2: word 2 190 | :type entity1: str 191 | :type entity2: str 192 | :return: rank 193 | :rtype: int 194 | """ 195 | r = requests.post(self.url + ':' + self.port + '/rank', 196 | json={'entity1': entity1, 'entity2': entity2}) 197 | return r.json()['rank'] 198 | 199 | def save(self, fname_or_handle, **kwargs): 200 | """ 201 | 202 | :param fname_or_handle: 203 | :param kwargs: 204 | :return: 205 | """ 206 | raise IOError('The class RESTfulKeyedVectors do not persist models to a file.') 207 | 208 | def similarity(self, entity1, entity2): 209 | """ 210 | 211 | :param entity1: word 1 212 | :param entity2: word 2 213 | :return: similarity between two words 214 | :type entity1: str 215 | :type entity2: str 216 | :rtype: float 217 
| """ 218 | r = requests.post(self.url + ':' + self.port + '/similarity', 219 | json={'entity1': entity1, 'entity2': entity2}) 220 | return r.json()['similarity'] 221 | 222 | # reference: https://radimrehurek.com/gensim/models/keyedvectors.html 223 | -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | This package has automated unit-tests for shorttext. 3 | """ 4 | -------------------------------------------------------------------------------- /test/test_charonehot.py: -------------------------------------------------------------------------------- 1 | 2 | import unittest 3 | from urllib.request import urlopen 4 | 5 | import shorttext 6 | 7 | 8 | class TestCharOneHot(unittest.TestCase): 9 | def test_BigTxt(self): 10 | chartovec_encoder = shorttext.generators.initSentenceToCharVecEncoder( 11 | urlopen('http://norvig.com/big.txt'), 12 | encoding='utf-8' 13 | ) 14 | self.assertEqual(93, len(chartovec_encoder.dictionary)) 15 | self.assertEqual('\n', chartovec_encoder.signalchar) 16 | 17 | 18 | if __name__ == '__main__': 19 | unittest.main() 20 | -------------------------------------------------------------------------------- /test/test_dtm.py: -------------------------------------------------------------------------------- 1 | 2 | import unittest 3 | import re 4 | 5 | import pandas as pd 6 | import shorttext 7 | from shorttext.utils import stemword, tokenize 8 | 9 | 10 | class TestDTM(unittest.TestCase): 11 | def test_inaugural(self): 12 | # preparing data 13 | usprez = shorttext.data.inaugural() 14 | docids = sorted(usprez.keys()) 15 | usprez = [' '.join(usprez[docid]) for docid in docids] 16 | usprezdf = pd.DataFrame({'yrprez': docids, 'speech': usprez}) 17 | usprezdf = usprezdf[['yrprez', 'speech']] 18 | 19 | # preprocesser defined 20 | pipeline = [lambda s: re.sub('[^\w\s]', '', s), 21 | lambda s: re.sub('[\d]', '', s), 22 | lambda s: s.lower(), 23 | lambda s: ' '.join([stemword(token) for token in tokenize(s)]) 24 | ] 25 | txtpreprocessor = shorttext.utils.text_preprocessor(pipeline) 26 | 27 | # corpus making 28 | docids = list(usprezdf['yrprez']) 29 | corpus = [txtpreprocessor(speech).split(' ') for speech in usprezdf['speech']] 30 | 31 | # making DTM 32 | dtm = shorttext.utils.DocumentTermMatrix(corpus, docids=docids, tfidf=True) 33 | 34 | # check results 35 | self.assertEqual(len(dtm.dictionary), 5256) 36 | self.assertAlmostEqual(dtm.get_token_occurences(stemword('change'))['2009-Obama'], 0.0138, 37 | places=3) 38 | numdocs, numtokens = dtm.dtm.shape 39 | self.assertEqual(numdocs, 56) 40 | self.assertEqual(numtokens, 5256) 41 | self.assertAlmostEqual(dtm.get_total_termfreq('government'), 0.27865372986738407, 42 | places=3) 43 | 44 | 45 | if __name__ == '__main__': 46 | unittest.main() 47 | -------------------------------------------------------------------------------- /test/test_fuzzylogic.py: -------------------------------------------------------------------------------- 1 | 2 | import unittest 3 | 4 | import shorttext 5 | 6 | 7 | class TestFuzzyLogic(unittest.TestCase): 8 | def test_similarity(self): 9 | self.assertEqual(shorttext.metrics.dynprog.dldist.damerau_levenshtein('debug', 'deubg'), 1) 10 | self.assertEqual(shorttext.metrics.dynprog.dldist.damerau_levenshtein('intrdependence', 'interdpeendencae'), 3) 11 | self.assertEqual(shorttext.metrics.dynprog.lcp.longest_common_prefix('debug', 'debuag'), 4) 12 | 13 | def 
test_transposition(self): 14 | self.assertEqual(shorttext.metrics.dynprog.dldist.damerau_levenshtein('independent', 'indeepndent'), 1) 15 | self.assertEqual(shorttext.metrics.dynprog.dldist.damerau_levenshtein('providence', 'porvidecne'), 2) 16 | 17 | def test_insertion(self): 18 | self.assertEqual(shorttext.metrics.dynprog.dldist.damerau_levenshtein('algorithm', 'algorithms'), 1) 19 | self.assertEqual(shorttext.metrics.dynprog.dldist.damerau_levenshtein('algorithm', 'algoarithmm'), 2) 20 | 21 | def test_deletion(self): 22 | self.assertEqual(shorttext.metrics.dynprog.dldist.damerau_levenshtein('algorithm', 'algoithm'), 1) 23 | self.assertEqual(shorttext.metrics.dynprog.dldist.damerau_levenshtein('algorithm', 'algorith'), 1) 24 | self.assertEqual(shorttext.metrics.dynprog.dldist.damerau_levenshtein('algorithm', 'algrihm'), 2) 25 | 26 | def test_correct(self): 27 | self.assertEqual(shorttext.metrics.dynprog.dldist.damerau_levenshtein('python', 'python'), 0) 28 | self.assertEqual(shorttext.metrics.dynprog.dldist.damerau_levenshtein('sosad', 'sosad'), 0) 29 | 30 | def test_jaccard(self): 31 | self.assertAlmostEqual(shorttext.metrics.dynprog.jaccard.similarity('diver', 'driver'), 5./6.) 32 | 33 | if __name__ == '__main__': 34 | unittest.main() -------------------------------------------------------------------------------- /test/test_norvigspell.py: -------------------------------------------------------------------------------- 1 | 2 | import unittest 3 | from urllib.request import urlopen 4 | 5 | import shorttext 6 | 7 | 8 | class TestSpellCheck(unittest.TestCase): 9 | def setUp(self): 10 | self.text = urlopen('http://norvig.com/big.txt').read() 11 | self.text = self.text.decode('utf-8') 12 | 13 | def test_norvig(self): 14 | speller = shorttext.spell.NorvigSpellCorrector() 15 | speller.train(self.text) 16 | self.assertEqual(speller.correct('apple'), 'apple') 17 | self.assertEqual(speller.correct('appl'), 'apply') 18 | 19 | 20 | if __name__ == '__main__': 21 | unittest.main() 22 | -------------------------------------------------------------------------------- /test/test_sakaguchispell.py: -------------------------------------------------------------------------------- 1 | 2 | import unittest 3 | import os 4 | 5 | from shorttext.spell.sakaguchi import SCRNNSpellCorrector 6 | from shorttext.smartload import smartload_compact_model 7 | 8 | 9 | class TestSCRNN(unittest.TestCase): 10 | def generalproc(self, operation, typo='langudge', recommendation='language'): 11 | corrector = SCRNNSpellCorrector(operation) 12 | corrector.train('I am a nerd . 
Natural language processing is sosad .') 13 | corrector.save_compact_model('./sosad_'+operation+'_sakaguchi.bin') 14 | 15 | corrector2 = smartload_compact_model('./sosad_'+operation+'_sakaguchi.bin', None) 16 | self.assertEqual(corrector.correct(typo), corrector2.correct(typo)) 17 | 18 | print('typo: '+typo+' recommendation: '+corrector.correct(typo)+' ('+recommendation+')') 19 | 20 | os.remove('./sosad_'+operation+'_sakaguchi.bin') 21 | 22 | def test_NOISE_INSERT(self): 23 | self.generalproc('NOISE-INSERT') 24 | 25 | def test_NOISE_DELETE(self): 26 | self.generalproc('NOISE-DELETE') 27 | 28 | def test_NOISE_REPLACE(self): 29 | self.generalproc('NOISE-REPLACE', typo='procsesing', recommendation='processing') 30 | 31 | def test_JUMBLE_WHOLE(self): 32 | self.generalproc('JUMBLE-WHOLE') 33 | 34 | def test_JUMBLE_BEG(self): 35 | self.generalproc('JUMBLE-BEG') 36 | 37 | def test_JUMBLE_END(self): 38 | self.generalproc('JUMBLE-END') 39 | 40 | def test_JUMBLE_INT(self): 41 | self.generalproc('JUMBLE-INT') 42 | 43 | 44 | if __name__ == '__main__': 45 | unittest.main() 46 | 47 | -------------------------------------------------------------------------------- /test/test_stacking.py: -------------------------------------------------------------------------------- 1 | 2 | import unittest 3 | import os 4 | 5 | import shorttext 6 | from shorttext.stack import LogisticStackedGeneralization 7 | from shorttext.smartload import smartload_compact_model 8 | from sklearn.svm import SVC 9 | 10 | 11 | class TestStacking(unittest.TestCase): 12 | def setUp(self): 13 | self.nihdict = shorttext.data.nihreports(sample_size=None) 14 | 15 | def tearDown(self): 16 | for filepath in os.listdir('.'): 17 | if filepath.endswith('.bin'): 18 | os.remove(os.path.join('.', filepath)) 19 | 20 | def training_stacking(self): 21 | # loading NIH Reports 22 | nihdict = {'NCCAM': self.nihdict['NCCAM'], 'NCATS': self.nihdict['NCATS']} 23 | 24 | # maxent 25 | maxent_classifier = shorttext.classifiers.MaxEntClassifier() 26 | maxent_classifier.train(nihdict, nb_epochs=100) 27 | maxent_classifier.save_compact_model('./bio_maxent.bin') 28 | 29 | # SVM + LDA 30 | topicmodeler = shorttext.generators.LDAModeler() 31 | topicmodeler.train(nihdict, 8) 32 | topicdisclassifier = shorttext.classifiers.TopicVectorCosineDistanceClassifier(topicmodeler) 33 | topicmodeler.save_compact_model('./bio_lda.bin') 34 | svm_classifier = shorttext.classifiers.TopicVectorSkLearnClassifier(topicmodeler, SVC()) 35 | svm_classifier.train(nihdict) 36 | svm_classifier.save_compact_model('./bio_svm.bin') 37 | 38 | # logistic 39 | stacked_classifier = LogisticStackedGeneralization({'maxent': maxent_classifier, 40 | 'svm': svm_classifier, 41 | 'topiccosine': topicdisclassifier}) 42 | stacked_classifier.train(nihdict) 43 | stacked_classifier.save_compact_model('./bio_logistics.bin') 44 | 45 | return maxent_classifier, topicmodeler, svm_classifier, stacked_classifier 46 | 47 | def comparedict(self, dict1, dict2): 48 | self.assertTrue(len(dict1)==len(dict2)) 49 | print(dict1, dict2) 50 | for classlabel in dict1: 51 | self.assertTrue(classlabel in dict2) 52 | self.assertAlmostEqual(dict1[classlabel], dict2[classlabel], places=4) 53 | 54 | def testStudies(self): 55 | # train 56 | maxent_classifier, topicmodeler, svm_classifier, stacked_classifier = self.training_stacking() 57 | topicdisclassifier = shorttext.classifiers.TopicVectorCosineDistanceClassifier(topicmodeler) 58 | 59 | # smartload 60 | maxent_classifier2 = smartload_compact_model('./bio_maxent.bin', None) 61 | 
topicmodeler2 = smartload_compact_model('./bio_lda.bin', None) 62 | topicdisclassifier2 = shorttext.classifiers.TopicVectorCosineDistanceClassifier(topicmodeler2) 63 | svm_classifier2 = smartload_compact_model('./bio_svm.bin', None) 64 | stacked_classifier2 = LogisticStackedGeneralization({'maxent': maxent_classifier2, 65 | 'svm': svm_classifier2, 66 | 'topiccosine': topicdisclassifier2}) 67 | stacked_classifier2.load_compact_model('./bio_logistics.bin') 68 | 69 | # compare 70 | terms = ['stem cell', 'grant', 'system biology'] 71 | for term in terms: 72 | print(term) 73 | print('maximum entropy') 74 | self.comparedict(maxent_classifier.score(term), maxent_classifier2.score(term)) 75 | print('LDA') 76 | self.comparedict(topicdisclassifier.score(term), topicdisclassifier2.score(term)) 77 | print('SVM') 78 | self.comparedict(svm_classifier.score(term), svm_classifier2.score(term)) 79 | print('combined') 80 | self.comparedict(stacked_classifier.score(term), stacked_classifier2.score(term)) 81 | 82 | def testSVM(self): 83 | # loading NIH Reports 84 | nihdict = {'NCCAM': self.nihdict['NCCAM'], 'NCATS': self.nihdict['NCATS']} 85 | 86 | # svm 87 | topicmodeler = shorttext.generators.LDAModeler() 88 | topicmodeler.train(nihdict, 16) 89 | svm_classifier = shorttext.classifiers.TopicVectorSkLearnClassifier(topicmodeler, SVC()) 90 | svm_classifier.train(nihdict) 91 | print('before saving...') 92 | print('--'.join(svm_classifier.classlabels)) 93 | print('--'.join(svm_classifier.topicmodeler.classlabels)) 94 | svm_classifier.save_compact_model('./bio_svm2.bin') 95 | print('after saving...') 96 | print('--'.join(svm_classifier.classlabels)) 97 | print('--'.join(svm_classifier.topicmodeler.classlabels)) 98 | 99 | # load 100 | svm_classifier2 = smartload_compact_model('./bio_svm2.bin', None) 101 | print('second classifier...') 102 | print(','.join(svm_classifier2.classlabels)) 103 | print(','.join(svm_classifier2.topicmodeler.classlabels)) 104 | 105 | # compare 106 | terms = ['stem cell', 'grant', 'system biology'] 107 | for term in terms: 108 | print(term) 109 | topicvec = svm_classifier.getvector(term) 110 | topicvec2 = svm_classifier2.getvector(term) 111 | print(topicvec) 112 | print(topicvec2) 113 | for idx, classlabel in enumerate(svm_classifier.classlabels): 114 | print(str(idx)+' '+classlabel) 115 | print(svm_classifier.classifier.score([topicvec], [idx])) 116 | for idx, classlabel in enumerate(svm_classifier2.classlabels): 117 | print(str(idx) + ' ' + classlabel) 118 | print(svm_classifier2.classifier.score([topicvec2], [idx])) 119 | print({classlabel: svm_classifier.classifier.score([topicvec], [idx]) 120 | for idx, classlabel in enumerate(svm_classifier.classlabels)}) 121 | print({classlabel: svm_classifier2.classifier.score([topicvec], [idx]) 122 | for idx, classlabel in enumerate(svm_classifier2.classlabels)}) 123 | 124 | for term in terms: 125 | print(term) 126 | self.comparedict(svm_classifier.score(term), svm_classifier2.score(term)) 127 | 128 | 129 | if __name__ == '__main__': 130 | unittest.main() 131 | 132 | -------------------------------------------------------------------------------- /test/test_textpreprocessing.py: -------------------------------------------------------------------------------- 1 | 2 | import unittest 3 | 4 | import shorttext 5 | 6 | class TestTextPreprocessing(unittest.TestCase): 7 | def testStandardPipeline(self): 8 | preprocessor = shorttext.utils.standard_text_preprocessor_1() 9 | self.assertEqual(preprocessor('I love you.'), 'love') 10 | 
self.assertEqual(preprocessor('Natural language processing and text mining on fire.'), 'natur languag process text mine fire') 11 | self.assertEqual(preprocessor('I do not think.'), 'think') 12 | 13 | def testStandPipelineDifferentStopwords(self): 14 | preprocessor = shorttext.utils.standard_text_preprocessor_2() 15 | self.assertEqual(preprocessor('I love you.'), 'love') 16 | self.assertEqual(preprocessor('Natural language processing and text mining on fire.'), 'natur languag process text mine fire') 17 | self.assertEqual(preprocessor('I do not think.'), 'not think') 18 | 19 | 20 | if __name__ == '__main__': 21 | unittest.main() -------------------------------------------------------------------------------- /test/test_var_nn_embedded_vec_classifier.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | import unittest 4 | import urllib 5 | 6 | import shorttext 7 | 8 | 9 | class TestVarNNEmbeddedVecClassifier(unittest.TestCase): 10 | def setUp(self): 11 | print("Downloading word-embedding model....") 12 | link = "https://shorttext-data-northernvirginia.s3.amazonaws.com/trainingdata/test_w2v_model.bin" 13 | filename = "test_w2v_model.bin" 14 | if not os.path.isfile("test_w2v_model.bin"): 15 | urllib.request.urlretrieve(link, filename) 16 | self.w2v_model = shorttext.utils.load_word2vec_model(filename, binary=True) # load word2vec model 17 | self.trainclass_dict = shorttext.data.subjectkeywords() # load training data 18 | 19 | def tearDown(self): 20 | print("Removing word-embedding model") 21 | if os.path.isfile("test_w2v_model.bin"): 22 | os.remove('test_w2v_model.bin') 23 | 24 | def comparedict(self, dict1, dict2): 25 | self.assertTrue(len(dict1)==len(dict2)) 26 | print(dict1, dict2) 27 | for classlabel in dict1: 28 | self.assertTrue(classlabel in dict2) 29 | self.assertAlmostEqual(dict1[classlabel], dict2[classlabel], places=4) 30 | 31 | def testCNNWordEmbedWithoutGensim(self): 32 | print("Testing CNN...") 33 | # create keras model using `CNNWordEmbed` class 34 | print("\tKeras model") 35 | keras_model = shorttext.classifiers.frameworks.CNNWordEmbed(wvmodel=self.w2v_model, 36 | nb_labels=len(self.trainclass_dict.keys())) 37 | 38 | # create and train classifier using keras model constructed above 39 | print("\tTraining") 40 | main_classifier = shorttext.classifiers.VarNNEmbeddedVecClassifier(self.w2v_model) 41 | main_classifier.train(self.trainclass_dict, keras_model, nb_epoch=2) 42 | 43 | # compute classification score 44 | print("\tTesting") 45 | score_vals = main_classifier.score('artificial intelligence') 46 | self.assertAlmostEqual(score_vals['mathematics'] + score_vals['physics'] + score_vals['theology'], 1.0, 1) 47 | 48 | def testDoubleCNNWordEmbedWithoutGensim(self): 49 | print("Testing DoubleCNN...") 50 | # create keras model using `DoubleCNNWordEmbed` class 51 | print("\tKeras model") 52 | keras_model = shorttext.classifiers.frameworks.DoubleCNNWordEmbed(wvmodel=self.w2v_model, 53 | nb_labels=len(self.trainclass_dict.keys())) 54 | 55 | # create and train classifier using keras model constructed above 56 | print("\tTraining") 57 | main_classifier = shorttext.classifiers.VarNNEmbeddedVecClassifier(self.w2v_model) 58 | main_classifier.train(self.trainclass_dict, keras_model, nb_epoch=2) 59 | 60 | # compute classification score 61 | print("\tTesting") 62 | score_vals = main_classifier.score('artificial intelligence') 63 | self.assertAlmostEqual(score_vals['mathematics'] + score_vals['physics'] + score_vals['theology'], 1.0, 1) 64 | 
65 | def testCLSTMWordEmbedWithoutGensim(self): 66 | print("Testing CLSTM...") 67 | # create keras model using `CLSTMWordEmbed` class 68 | print("\tKeras model") 69 | keras_model = shorttext.classifiers.frameworks.CLSTMWordEmbed(wvmodel=self.w2v_model, 70 | nb_labels=len(self.trainclass_dict.keys())) 71 | 72 | # create and train classifier using keras model constructed above 73 | print("\tTraining") 74 | main_classifier = shorttext.classifiers.VarNNEmbeddedVecClassifier(self.w2v_model) 75 | main_classifier.train(self.trainclass_dict, keras_model, nb_epoch=2) 76 | 77 | # compute classification score 78 | print("\tTesting") 79 | score_vals = main_classifier.score('artificial intelligence') 80 | self.assertAlmostEqual(score_vals['mathematics'] + score_vals['physics'] + score_vals['theology'], 1.0, 1) 81 | 82 | def testAASumEmbed(self): 83 | print("Testing SumEmbed") 84 | classifier = shorttext.classifiers.SumEmbeddedVecClassifier(self.w2v_model) 85 | classdict = shorttext.data.subjectkeywords() 86 | classifier.train(classdict) 87 | 88 | # compute 89 | self.comparedict(classifier.score('linear algebra'), 90 | {'mathematics': 0.9044698253778962, 91 | 'physics': 0.7586816549044926, 92 | 'theology': 0.1817602793151848}) 93 | self.comparedict(classifier.score('learning'), 94 | {'mathematics': 0.9037142562255835, 95 | 'physics': 0.7588376500004107, 96 | 'theology': 0.18039468994239538}) 97 | self.comparedict(classifier.score('eschatology'), 98 | {'mathematics': 0.3658578123294476, 99 | 'physics': 0.5996711864493821, 100 | 'theology': 0.9694560847986978}) 101 | 102 | 103 | if __name__ == '__main__': 104 | unittest.main() 105 | -------------------------------------------------------------------------------- /test/test_wmd.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | import urllib 4 | 5 | from shorttext.metrics.wasserstein import word_mover_distance 6 | from shorttext.utils import load_word2vec_model 7 | 8 | 9 | class TestWMD(unittest.TestCase): 10 | def setUp(self): 11 | print("Downloading word-embedding model....") 12 | link = "https://shorttext-data-northernvirginia.s3.amazonaws.com/trainingdata/test_w2v_model.bin" 13 | filename = "test_w2v_model.bin" 14 | if not os.path.isfile("test_w2v_model.bin"): 15 | urllib.request.urlretrieve(link, filename) 16 | self.w2v_model = load_word2vec_model(filename, binary=True) # load word2vec model 17 | 18 | def tearDown(self): 19 | print("Removing word-embedding model") 20 | if os.path.isfile("test_w2v_model.bin"): 21 | os.remove('test_w2v_model.bin') 22 | 23 | def calculate_wmd(self, tokens1, tokens2, answer): 24 | wdistance = word_mover_distance(tokens1, tokens2, self.w2v_model) 25 | self.assertAlmostEqual(wdistance, answer, delta=1e-3) 26 | 27 | def test_metrics(self): 28 | tokens1 = ['president', 'speaks'] 29 | tokens2 = ['president', 'talks'] 30 | known_answer = 0.19936788082122803 31 | self.calculate_wmd(tokens1, tokens2, known_answer) 32 | 33 | tokens1 = ['fan', 'book'] 34 | tokens2 = ['apple', 'orange'] 35 | known_answer = 1.8019972145557404 36 | self.calculate_wmd(tokens1, tokens2, known_answer) 37 | 38 | 39 | if __name__ == '__main__': 40 | unittest.main() 41 | --------------------------------------------------------------------------------