├── .github └── workflows │ ├── runtests.yml │ └── stale.yml ├── .gitignore ├── .readthedocs.yaml ├── AUTHORS.md ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.rst ├── conftest.py ├── coverage.svg ├── doc ├── Makefile └── source │ ├── api.rst │ ├── bow.ipynb │ ├── conf.py │ ├── data │ ├── corpus_example │ │ ├── sample1.txt │ │ ├── sample2.txt │ │ └── sample3.txt │ ├── news_articles_100.pickle │ ├── news_articles_100.xlsx │ └── tm_wordclouds │ │ └── .gitignore │ ├── development.rst │ ├── getting_started.ipynb │ ├── index.rst │ ├── install.rst │ ├── intro.rst │ ├── license_note.rst │ ├── preprocessing.ipynb │ ├── text_corpora.ipynb │ ├── topic_modeling.ipynb │ └── version_history.rst ├── examples ├── README.md ├── __init__.py ├── _benchmarktools.py ├── benchmark_en_newsarticles.py ├── bundestag18_tfidf.py ├── data │ ├── ap.pickle │ ├── bt18_full.zip │ ├── bt18_sample_1000.pickle │ ├── gensim_evaluation_plot.png │ └── nips.pickle ├── gensim_evaluation.py ├── minimal_tfidf.py ├── topicmod_ap_nips_eval.py └── topicmod_lda.py ├── requirements.txt ├── requirements_doc.txt ├── scripts ├── fulldata │ ├── .gitignore │ └── README.md ├── nips_data.py ├── prepare_corpora.R └── tmp │ └── .gitignore ├── setup.py ├── tests ├── __init__.py ├── _testtextdata.py ├── _testtools.py ├── data │ ├── .gitignore │ ├── 100NewsArticles.csv │ ├── 100NewsArticles.xlsx │ ├── 3ExampleDocs.xlsx │ ├── bt18_speeches_sample.csv │ ├── gutenberg │ │ ├── kafka_verwandlung.txt │ │ └── werther │ │ │ ├── goethe_werther1.txt │ │ │ └── goethe_werther2.txt │ ├── tiny_model_reuters_5_topics.pickle │ └── zipdata.zip ├── test_bow.py ├── test_corpus.py ├── test_corpusimport.py ├── test_tokenseq.py ├── test_topicmod__eval_tools.py ├── test_topicmod_evaluate.py ├── test_topicmod_model_io.py ├── test_topicmod_model_stats.py ├── test_topicmod_visualize.py └── test_utils.py ├── tmtoolkit ├── __init__.py ├── __main__.py ├── bow │ ├── __init__.py │ ├── bow_stats.py │ └── dtm.py ├── corpus │ ├── __init__.py │ ├── 
_common.py │ ├── _corpus.py │ ├── _corpusfuncs.py │ ├── _document.py │ ├── _nltk_extras.py │ └── visualize.py ├── data │ ├── de │ │ └── parlspeech-v2-sample-bundestag.zip │ ├── en │ │ ├── News100.zip │ │ ├── NewsArticles.zip │ │ └── parlspeech-v2-sample-houseofcommons.zip │ ├── es │ │ └── parlspeech-v2-sample-congreso.zip │ └── nl │ │ └── parlspeech-v2-sample-tweedekamer.zip ├── tokenseq.py ├── topicmod │ ├── __init__.py │ ├── _common.py │ ├── _eval_tools.py │ ├── evaluate.py │ ├── model_io.py │ ├── model_stats.py │ ├── parallel.py │ ├── tm_gensim.py │ ├── tm_lda.py │ ├── tm_sklearn.py │ └── visualize.py ├── types.py └── utils.py └── tox.ini /.github/workflows/runtests.yml: -------------------------------------------------------------------------------- 1 | # GitHub actions workflow for testing tmtoolkit 2 | # Runs tests on Ubuntu, MacOS and Windows with Python versions 3.8, 3.9 and 3.10 each, which means 9 jobs are spawned. 3 | # Tests are run using tox (https://tox.wiki/). 4 | # 5 | # author: Markus Konrad 6 | 7 | name: run tests 8 | 9 | on: 10 | push: 11 | branches: 12 | - master 13 | - develop 14 | - 'release*' 15 | 16 | jobs: 17 | build: 18 | runs-on: ${{ matrix.os }} 19 | strategy: 20 | matrix: 21 | os: [ubuntu-latest, macos-latest, windows-latest] 22 | python-version: ["3.8", "3.9", "3.10"] 23 | testsuite: ["minimal", "full"] 24 | steps: 25 | - uses: actions/checkout@v2 26 | - name: set up python ${{ matrix.python-version }} 27 | uses: actions/setup-python@v2 28 | with: 29 | python-version: ${{ matrix.python-version }} 30 | cache: 'pip' 31 | - name: install system dependencies (linux) 32 | if: runner.os == 'Linux' 33 | # only managed to install system dependencies on Linux runners 34 | run: | 35 | sudo apt update 36 | sudo apt install libgmp-dev libmpfr-dev libmpc-dev 37 | - name: install python dependencies 38 | run: | 39 | python -m pip install --upgrade pip 40 | pip install tox 41 | - name: run tox (linux) 42 | # since system dependencies could only be 
installed on Linux runners, we run the "full" suite only on Linux ... 43 | if: runner.os == 'Linux' 44 | run: tox -e py-${{ matrix.testsuite }} -- --hypothesis-profile=ci 45 | - name: run tox (macos or windows - minimal) 46 | if: runner.os != 'Linux' && matrix.testsuite == 'minimal' 47 | run: tox -e py-minimal -- --hypothesis-profile=ci 48 | - name: run tox (macos or windows - recommendedextra) 49 | # ... on all other OS we run the "recommendedextra" suite instead of the "full" suite 50 | if: runner.os != 'Linux' && matrix.testsuite == 'full' 51 | run: tox -e py-recommendedextra -- --hypothesis-profile=ci 52 | -------------------------------------------------------------------------------- /.github/workflows/stale.yml: -------------------------------------------------------------------------------- 1 | name: Close inactive issues 2 | on: 3 | schedule: 4 | - cron: "23 3 * * *" 5 | 6 | jobs: 7 | close-issues: 8 | runs-on: ubuntu-latest 9 | permissions: 10 | issues: write 11 | pull-requests: write 12 | steps: 13 | - uses: actions/stale@v3 14 | with: 15 | days-before-issue-stale: 30 16 | days-before-issue-close: 14 17 | stale-issue-label: "stale" 18 | stale-issue-message: "This issue is stale because it has been open for 30 days with no activity." 19 | close-issue-message: "This issue was closed because it has been inactive for 14 days since being marked as stale." 
20 | days-before-pr-stale: -1 21 | days-before-pr-close: -1 22 | repo-token: ${{ secrets.GITHUB_TOKEN }} 23 | 24 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .cache/ 2 | .idea/ 3 | **/__pycache__ 4 | *.pyc 5 | .hypothesis 6 | build/ 7 | dist/ 8 | *.egg-info/ 9 | .~lock.* 10 | examples/data/*.pickle 11 | !examples/data/ap.pickle 12 | !examples/data/nips.pickle 13 | !examples/data/bt18_sample_1000.pickle 14 | **/.ipynb_checkpoints/ 15 | .pytest_cache/ 16 | .covreport/ 17 | .tox/ 18 | .Rhistory 19 | doc/source/data/corpus_norm.pickle 20 | .coverage 21 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Build documentation in the doc/ directory with Sphinx 9 | sphinx: 10 | configuration: doc/source/conf.py 11 | 12 | # Set the version of Python and other tools you might need 13 | build: 14 | os: ubuntu-20.04 15 | tools: 16 | python: "3.9" 17 | 18 | # Optionally set the version of Python and requirements required to build your docs 19 | python: 20 | install: 21 | - requirements: requirements_doc.txt 22 | -------------------------------------------------------------------------------- /AUTHORS.md: -------------------------------------------------------------------------------- 1 | # Authors 2 | 3 | ## Maintainer / main developer 4 | 5 | [Markus Konrad](https://github.com/internaut) @ [WZB](https://github.com/WZBSocialScienceCenter/) 6 | 7 | ## Contributors 8 | 9 | Sorted by date of first contribution: 10 | 11 | * [Matt Cooper](https://github.com/mcooper) 12 | * [Dominik Domhoff](https://github.com/ddomhoff) 13 | * [Christof 
Kälin](https://github.com/christofkaelin) 14 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 
175 | 176 | END OF TERMS AND CONDITIONS 177 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include AUTHORS.md 2 | include conftest.py 3 | include LICENSE 4 | include README.rst 5 | include requirements.txt 6 | include requirements_doc.txt 7 | graft doc/source 8 | prune doc/source/.ipynb_* 9 | graft tmtoolkit/data 10 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | run_tests: 2 | PYTHONPATH=. pytest -l tests/ 3 | 4 | cov_tests: 5 | PYTHONPATH=. pytest --cov-report html:.covreport --cov=tmtoolkit tests/ 6 | coverage-badge -f -o coverage.svg 7 | #rm .coverage* 8 | 9 | sdist: 10 | python setup.py sdist 11 | 12 | wheel: 13 | python setup.py bdist_wheel 14 | 15 | readme: 16 | cat doc/source/intro.rst > README.rst 17 | echo >> README.rst 18 | echo >> README.rst 19 | cat doc/source/install.rst >> README.rst 20 | echo >> README.rst 21 | echo >> README.rst 22 | cat doc/source/license_note.rst >> README.rst 23 | 24 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | **This repository is archived. Further development of tmtoolkit has moved to https://github.com/internaut/tmtoolkit.** 2 | 3 | ------------ 4 | 5 | 6 | tmtoolkit: Text mining and topic modeling toolkit 7 | ================================================= 8 | 9 | *tmtoolkit* is a set of tools for text mining and topic modeling with Python developed especially for use in the 10 | social sciences, in journalism or related disciplines. 
It aims for easy installation, extensive documentation 11 | and a clear programming interface while offering good performance on large datasets by means of vectorized 12 | operations (via NumPy) and parallel computation (using Python's *multiprocessing* module and the 13 | `loky `_ package). The basis of tmtoolkit's text mining capabilities is built around 14 | `SpaCy `_, which offers `many language models `_. 15 | 16 | The documentation for tmtoolkit is available on `tmtoolkit.readthedocs.org `_ and 17 | the GitHub code repository is on 18 | `github.com/WZBSocialScienceCenter/tmtoolkit `_. 19 | 20 | **Upgrade note:** 21 | 22 | Since Feb 8 2022, the newest version 0.11.0 of tmtoolkit is available on PyPI. This version features a new API 23 | for text processing and mining which is incompatible with prior versions. It's advisable to first read the 24 | first three chapters of the `tutorial `_ 25 | to get used to the new API. You should also re-install tmtoolkit in a new virtual environment or completely 26 | remove the old version prior to upgrading. See the 27 | `installation instructions `_. 28 | 29 | Requirements and installation 30 | ----------------------------- 31 | 32 | **tmtoolkit works with Python 3.8 or newer (tested up to Python 3.10).** 33 | 34 | The tmtoolkit package is highly modular and tries to install as few dependencies as possible. For requirements and 35 | installation procedures, please have a look at the 36 | `installation section in the documentation `_. For short, 37 | the recommended way of installing tmtoolkit is to create and activate a 38 | `Python Virtual Environment ("venv") `_ and then install tmtoolkit with 39 | a recommended set of dependencies and a list of language models via the following: 40 | 41 | .. 
code-block:: text 42 | 43 | pip install -U "tmtoolkit[recommended]" 44 | # add or remove language codes in the list for installing the models that you need; 45 | # don't use spaces in the list of languages 46 | python -m tmtoolkit setup en,de 47 | 48 | Again, you should have a look at the detailed 49 | `installation instructions `_ in order to install additional 50 | packages that enable more features such as topic modeling. 51 | 52 | Features 53 | -------- 54 | 55 | Text preprocessing 56 | ^^^^^^^^^^^^^^^^^^ 57 | 58 | The tmtoolkit package offers several text preprocessing and text mining methods, including: 59 | 60 | - `tokenization, sentence segmentation, part-of-speech (POS) tagging, named-entity recognition (NER) `_ (via SpaCy) 61 | - `lemmatization and token normalization `_ 62 | - extensive `pattern matching capabilities `_ 63 | (exact matching, regular expressions or "glob" patterns) to be used in many 64 | methods of the package, e.g. for filtering on token or document level, or for 65 | `keywords-in-context (KWIC) `_ 66 | - adding and managing 67 | `custom document and token attributes `_ 68 | - accessing text corpora along with their 69 | `document and token attributes as dataframes `_ 70 | - calculating and `visualizing corpus summary statistics `_ 71 | - finding out and joining `collocations `_ 72 | - `splitting and sampling corpora `_ 73 | - generating `n-grams `_ 74 | - generating `sparse document-term matrices `_ 75 | 76 | Wherever possible and useful, these methods can operate in parallel to speed up computations with large datasets. 77 | 78 | Topic modeling 79 | ^^^^^^^^^^^^^^ 80 | 81 | * `model computation in parallel `_ for different corpora 82 | and/or parameter sets 83 | * support for `lda `_, 84 | `scikit-learn `_ 85 | and `gensim `_ topic modeling backends 86 | * `evaluation of topic models `_ (e.g. in order to find an optimal number 87 | of topics for a given dataset) using several implemented metrics: 88 | 89 | * model coherence (`Mimno et al. 
2011 `_) or with 90 | `metrics implemented in Gensim `_) 91 | * KL divergence method (`Arun et al. 2010 `_) 92 | * probability of held-out documents (`Wallach et al. 2009 `_) 93 | * pair-wise cosine distance method (`Cao Juan et al. 2009 `_) 94 | * harmonic mean method (`Griffiths, Steyvers 2004 `_) 95 | * the loglikelihood or perplexity methods natively implemented in lda, sklearn or gensim 96 | 97 | * `plotting of evaluation results `_ 98 | * `common statistics for topic models `_ such as 99 | word saliency and distinctiveness (`Chuang et al. 2012 `_), topic-word 100 | relevance (`Sievert and Shirley 2014 `_) 101 | * `finding / filtering topics with pattern matching `_ 102 | * `export estimated document-topic and topic-word distributions to Excel 103 | `_ 104 | * `visualize topic-word distributions and document-topic distributions `_ 105 | as word clouds or heatmaps 106 | * model coherence (`Mimno et al. 2011 `_) for individual topics 107 | * integrate `PyLDAVis `_ to visualize results 108 | 109 | Other features 110 | ^^^^^^^^^^^^^^ 111 | 112 | - loading and cleaning of raw text from 113 | `text files, tabular files (CSV or Excel), ZIP files or folders `_ 114 | - `splitting and joining documents `_ 115 | - `common statistics and transformations for document-term matrices `_ like word cooccurrence and *tf-idf* 116 | 117 | Limits 118 | ------ 119 | 120 | * all languages are supported, for which `SpaCy language models `_ are available 121 | * all data must reside in memory, i.e. no streaming of large data from the hard disk (which for example 122 | `Gensim `_ supports) 123 | 124 | 125 | Contribute 126 | ---------- 127 | 128 | If you'd like to contribute, please read the `developer documentation `_ first. 129 | 130 | 131 | License 132 | ------- 133 | 134 | Code licensed under `Apache License 2.0 `_. 135 | See `LICENSE `_ file. 136 | 137 | .. 
|pypi| image:: https://badge.fury.io/py/tmtoolkit.svg 138 | :target: https://badge.fury.io/py/tmtoolkit 139 | :alt: PyPI Version 140 | 141 | .. |pypi_downloads| image:: https://img.shields.io/pypi/dm/tmtoolkit 142 | :target: https://pypi.org/project/tmtoolkit/ 143 | :alt: Downloads from PyPI 144 | 145 | .. |runtests| image:: https://github.com/WZBSocialScienceCenter/tmtoolkit/actions/workflows/runtests.yml/badge.svg 146 | :target: https://github.com/WZBSocialScienceCenter/tmtoolkit/actions/workflows/runtests.yml 147 | :alt: GitHub Actions CI Build Status 148 | 149 | .. |coverage| image:: https://raw.githubusercontent.com/WZBSocialScienceCenter/tmtoolkit/master/coverage.svg?sanitize=true 150 | :target: https://github.com/WZBSocialScienceCenter/tmtoolkit/tree/master/tests 151 | :alt: Coverage status 152 | 153 | .. |rtd| image:: https://readthedocs.org/projects/tmtoolkit/badge/?version=latest 154 | :target: https://tmtoolkit.readthedocs.io/en/latest/?badge=latest 155 | :alt: Documentation Status 156 | 157 | .. |zenodo| image:: https://zenodo.org/badge/109812180.svg 158 | :target: https://zenodo.org/badge/latestdoi/109812180 159 | :alt: Citable Zenodo DOI 160 | -------------------------------------------------------------------------------- /conftest.py: -------------------------------------------------------------------------------- 1 | """ 2 | Configuration for tests with pytest 3 | 4 | .. codeauthor:: Markus Konrad 5 | """ 6 | 7 | from hypothesis import settings, HealthCheck 8 | 9 | 10 | # set default timeout deadline 11 | settings.register_profile('default', deadline=5000) 12 | 13 | # profile for CI runs on GitHub machines, which may be slow from time to time so we disable the "too slow" HealthCheck 14 | # and set the timeout deadline very high (60 sec.) 
15 | settings.register_profile('ci', suppress_health_check=(HealthCheck.too_slow, ), deadline=60000) 16 | 17 | # load default settings profile 18 | settings.load_profile('default') 19 | -------------------------------------------------------------------------------- /coverage.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | coverage 17 | coverage 18 | 83% 19 | 83% 20 | 21 | 22 | -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | notebooks: 16 | jupyter nbconvert --to notebook --execute --inplace --ExecutePreprocessor.timeout=600 --PlainTextFormatter.max_seq_length=20 source/*.ipynb 17 | 18 | .PHONY: help Makefile 19 | 20 | # Catch-all target: route all unknown targets to Sphinx using the new 21 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 22 | %: Makefile 23 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 24 | -------------------------------------------------------------------------------- /doc/source/api.rst: -------------------------------------------------------------------------------- 1 | .. _api: 2 | 3 | API 4 | === 5 | 6 | tmtoolkit.bow 7 | ------------- 8 | 9 | tmtoolkit.bow.bow_stats 10 | ^^^^^^^^^^^^^^^^^^^^^^^ 11 | 12 | .. automodule:: tmtoolkit.bow.bow_stats 13 | :members: 14 | 15 | tmtoolkit.bow.dtm 16 | ^^^^^^^^^^^^^^^^^ 17 | 18 | .. 
automodule:: tmtoolkit.bow.dtm 19 | :members: 20 | 21 | 22 | tmtoolkit.corpus 23 | ---------------- 24 | 25 | Corpus class and corpus functions 26 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 27 | 28 | .. automodule:: tmtoolkit.corpus 29 | :members: 30 | :imported-members: 31 | :exclude-members: find_spec, strip_tags, numbertoken_to_magnitude, simplify_unicode_chars, visualize 32 | 33 | Functions to visualize corpus summary statistics 34 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 35 | 36 | .. automodule:: tmtoolkit.corpus.visualize 37 | :members: 38 | 39 | 40 | tmtoolkit.tokenseq 41 | ------------------ 42 | 43 | .. automodule:: tmtoolkit.tokenseq 44 | :members: 45 | 46 | 47 | tmtoolkit.topicmod 48 | ------------------ 49 | 50 | .. automodule:: tmtoolkit.topicmod 51 | :members: 52 | 53 | Evaluation metrics for Topic Modeling 54 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 55 | 56 | .. automodule:: tmtoolkit.topicmod.evaluate 57 | :members: 58 | 59 | 60 | Printing, importing and exporting topic model results 61 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 62 | 63 | .. automodule:: tmtoolkit.topicmod.model_io 64 | :members: 65 | 66 | 67 | Statistics for topic models and BoW matrices 68 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 69 | 70 | .. automodule:: tmtoolkit.topicmod.model_stats 71 | :members: 72 | 73 | 74 | Parallel model fitting and evaluation with lda 75 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 76 | 77 | .. automodule:: tmtoolkit.topicmod.tm_lda 78 | :members: AVAILABLE_METRICS, DEFAULT_METRICS, compute_models_parallel, evaluate_topic_models 79 | 80 | 81 | Parallel model fitting and evaluation with scikit-learn 82 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 83 | 84 | .. 
automodule:: tmtoolkit.topicmod.tm_sklearn 85 | :members: AVAILABLE_METRICS, DEFAULT_METRICS, compute_models_parallel, evaluate_topic_models 86 | 87 | 88 | Parallel model fitting and evaluation with Gensim 89 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 90 | 91 | .. automodule:: tmtoolkit.topicmod.tm_gensim 92 | :members: AVAILABLE_METRICS, DEFAULT_METRICS, compute_models_parallel, evaluate_topic_models 93 | 94 | 95 | Visualize topic models and topic model evaluation results 96 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 97 | 98 | Wordclouds from topic models 99 | """""""""""""""""""""""""""" 100 | 101 | .. autodata:: tmtoolkit.topicmod.visualize.DEFAULT_WORDCLOUD_KWARGS 102 | .. autofunction:: tmtoolkit.topicmod.visualize.generate_wordclouds_for_topic_words 103 | .. autofunction:: tmtoolkit.topicmod.visualize.generate_wordclouds_for_document_topics 104 | .. autofunction:: tmtoolkit.topicmod.visualize.generate_wordcloud_from_probabilities_and_words 105 | .. autofunction:: tmtoolkit.topicmod.visualize.generate_wordcloud_from_weights 106 | .. autofunction:: tmtoolkit.topicmod.visualize.write_wordclouds_to_folder 107 | .. autofunction:: tmtoolkit.topicmod.visualize.generate_wordclouds_from_distribution 108 | 109 | Plot heatmaps for topic models 110 | """""""""""""""""""""""""""""" 111 | 112 | .. autofunction:: tmtoolkit.topicmod.visualize.plot_doc_topic_heatmap 113 | .. autofunction:: tmtoolkit.topicmod.visualize.plot_topic_word_heatmap 114 | .. autofunction:: tmtoolkit.topicmod.visualize.plot_heatmap 115 | 116 | Plot probability distribution rankings for topic models 117 | """"""""""""""""""""""""""""""""""""""""""""""""""""""" 118 | 119 | .. autofunction:: tmtoolkit.topicmod.visualize.plot_topic_word_ranked_prob 120 | .. autofunction:: tmtoolkit.topicmod.visualize.plot_doc_topic_ranked_prob 121 | .. 
autofunction:: tmtoolkit.topicmod.visualize.plot_prob_distrib_ranked_prob 122 | 123 | Plot topic model evaluation results 124 | """"""""""""""""""""""""""""""""""" 125 | 126 | .. autofunction:: tmtoolkit.topicmod.visualize.plot_eval_results 127 | 128 | Other functions 129 | """"""""""""""" 130 | 131 | .. autofunction:: tmtoolkit.topicmod.visualize.parameters_for_ldavis 132 | 133 | 134 | Base classes for parallel model fitting and evaluation 135 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 136 | 137 | .. automodule:: tmtoolkit.topicmod.parallel 138 | :members: 139 | 140 | 141 | tmtoolkit.utils 142 | --------------- 143 | 144 | .. automodule:: tmtoolkit.utils 145 | :members: 146 | -------------------------------------------------------------------------------- /doc/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 
12 | # 13 | 14 | import os 15 | import sys 16 | from datetime import date 17 | 18 | import sphinx_rtd_theme 19 | 20 | sys.path.insert(0, os.path.abspath('../..')) 21 | 22 | 23 | # -- Project information ----------------------------------------------------- 24 | 25 | project = 'tmtoolkit' 26 | copyright = f'{date.today().year}, Markus Konrad' 27 | author = 'Markus Konrad' 28 | 29 | 30 | # -- General configuration --------------------------------------------------- 31 | 32 | # Add any Sphinx extension module names here, as strings. They can be 33 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 34 | # ones. 35 | extensions = [ 36 | 'nbsphinx', 37 | 'sphinx.ext.autodoc', 38 | 'sphinx_rtd_theme' 39 | ] 40 | 41 | # Add any paths that contain templates here, relative to this directory. 42 | templates_path = ['_templates'] 43 | 44 | # List of patterns, relative to source directory, that match files and 45 | # directories to ignore when looking for source files. 46 | # This pattern also affects html_static_path and html_extra_path. 47 | 48 | exclude_patterns = ['**.ipynb_checkpoints'] 49 | 50 | 51 | # If true, '()' will be appended to :func: etc. cross-reference text. 52 | add_function_parentheses = False 53 | 54 | # If true, the current module name will be prepended to all description 55 | # unit titles (such as .. function::). 56 | add_module_names = True 57 | 58 | # type hints 59 | autodoc_typehints = 'description' 60 | autodoc_typehints_format = 'short' 61 | 62 | # The name of the Pygments (syntax highlighting) style to use. 63 | pygments_style = 'sphinx' 64 | 65 | # -- Options for HTML output ------------------------------------------------- 66 | 67 | # The theme to use for HTML and HTML Help pages. See the documentation for 68 | # a list of builtin themes. 
69 | # 70 | 71 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] 72 | html_theme = "sphinx_rtd_theme" 73 | # html_static_path = ['static'] 74 | 75 | # Output file base name for HTML help builder. 76 | htmlhelp_basename = '%sdoc' % project 77 | 78 | # Never skip __init__ 79 | 80 | def skip(app, what, name, obj, would_skip, options): 81 | if name == "__init__": 82 | return False 83 | return would_skip 84 | 85 | def setup(app): 86 | app.connect("autodoc-skip-member", skip) 87 | -------------------------------------------------------------------------------- /doc/source/data/corpus_example/sample1.txt: -------------------------------------------------------------------------------- 1 | This is the first example file. ☺ We showcase NER by just randomly listing famous people like Missy Elliott or George Harrison. 2 | -------------------------------------------------------------------------------- /doc/source/data/corpus_example/sample2.txt: -------------------------------------------------------------------------------- 1 | Here comes the second example (with HTML tags & entities). 2 | 3 | This one contains three lines of plain text which means two paragraphs. -------------------------------------------------------------------------------- /doc/source/data/corpus_example/sample3.txt: -------------------------------------------------------------------------------- 1 | And here we go with the third and final example file. 2 | Another line of text. 3 | 4 | §2. 5 | This is the second paragraph. 6 | 7 | The third and final paragraph. 
-------------------------------------------------------------------------------- /doc/source/data/news_articles_100.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WZBSocialScienceCenter/tmtoolkit/02990865ee896625d5cf540bf2b0dbc159bedf38/doc/source/data/news_articles_100.pickle -------------------------------------------------------------------------------- /doc/source/data/news_articles_100.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WZBSocialScienceCenter/tmtoolkit/02990865ee896625d5cf540bf2b0dbc159bedf38/doc/source/data/news_articles_100.xlsx -------------------------------------------------------------------------------- /doc/source/data/tm_wordclouds/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore everything in this directory 2 | * 3 | # Except this file 4 | !.gitignore 5 | 6 | -------------------------------------------------------------------------------- /doc/source/development.rst: -------------------------------------------------------------------------------- 1 | .. _development: 2 | 3 | Development 4 | =========== 5 | 6 | This part of the documentation serves as developer documentation, i.e. a help for those who want to contribute to the development of the package. 7 | 8 | 9 | Project overview 10 | ---------------- 11 | 12 | This project aims to provide a Python package that allows text processing, text mining and topic modeling with 13 | 14 | - easy installation, 15 | - extensive documentation, 16 | - clear functional programming interface, 17 | - good performance on large datasets. 18 | 19 | All computations need to be performed in memory. Streaming data from disk is not supported so far. 
20 | 21 | The package is written in Python and uses other packages for key tasks: 22 | 23 | - `SpaCy `_ is used for the text processing and text mining tasks 24 | - `lda `_, `gensim `_ or `scikit-learn `_ are used for computing topic models 25 | 26 | The project's packages are published to the `Python Package Index PyPI `_. 27 | 28 | The package's dependencies are only installed on demand. There's a setup routine that provides an interface for easy installation of SpaCy's language models. 29 | 30 | Text processing and normalization is often used to construct a Bag-of-Words (BoW) model which in turn is the input for topic models. 31 | 32 | 33 | Contributing to tmtoolkit 34 | ------------------------- 35 | 36 | If you want to contribute to tmtoolkit, you can create code or documentation patches (updates) and submit them as `pull requests `_ on GitHub. The first thing to do for this is to fork the `GitHub repository `_ and to clone it on your local machine. It's best to create a separate branch for your updates next. You should then set up your local machine for development as follows: 37 | 38 | - create a `Python virtual environment `_ – make sure that the Python version you're using for this is supported by tmtoolkit 39 | - update pip via ``pip install -U pip`` 40 | - if you're planning to contribute to the code or to the tutorials in the documentation: 41 | 42 | - install *all* dependencies via ``pip install -r requirements.txt`` 43 | - run the tmtoolkit setup routine via ``python -m tmtoolkit setup all`` to install the required language models 44 | - check that everything works by running all tests via ``pytest tests/`` 45 | 46 | - if you're *only* planning to contribute to the documentation (without the tutorials which are Jupyter Notebooks): 47 | 48 | - install dependencies for documentation via ``pip install -r requirements_doc.txt`` 49 | 50 | You can then start working on the code or documentation. 
Make sure to run the tests and/or create new tests when you provide code updates in your pull request. You should also read this developer documentation completely before diving into the code. 51 | 52 | 53 | Folder structure 54 | ---------------- 55 | 56 | The project's root folder contains files for documentation generation (``.readthedocs.yaml``), testing (``conftest.py``, ``coverage.svg``, ``tox.ini``) as well as project management and package building (``Makefile``, ``MANIFEST.in``, ``setup.py``). The subfolders include: 57 | 58 | - ``.github/workflows``: provides Continuous Integration (CI) configuration for *GitHub Actions*, 59 | - ``doc``: documentation source and built documentation files, 60 | - ``examples``: example scripts and data to show some of the features (most features are better explained in the tutorial which is part of the documentation), 61 | - ``scripts``: scripts used for preparing datasets that come along with the package, 62 | - ``tests``: test suite, 63 | - ``tmtoolkit``: package source code. 64 | 65 | 66 | Packaging and dependency management 67 | ----------------------------------- 68 | 69 | This package uses `setuptools `_ for packaging. All package metadata and dependencies are defined in ``setup.py``. Since tmtoolkit allows installing dependencies on demand, there are several installation options defined in ``setup.py``. For development, the most important are: 70 | 71 | - ``[dev]``: installs packages for development and packaging 72 | - ``[test]``: installs packages for testing tmtoolkit 73 | - ``[doc]``: installs packages for generating the documentation 74 | - ``[all]``: installs all required and optional packages – recommended for development 75 | 76 | The ``requirements.txt`` and ``requirements_doc.txt`` files simply point to the ``[all]`` and ``[doc]`` installation options.
77 | 78 | The ``Makefile`` in the root folder contains targets for generating a Python *Wheel* package (``make wheel``) and a Python source distribution package (``make sdist``). 79 | 80 | 81 | Built-in datasets 82 | ----------------- 83 | 84 | All built-in datasets reside in ``tmtoolkit/data/LANGUAGE_CODE/``, where ``LANGUAGE_CODE`` is an ISO language code. For the `ParlSpeech V2 `_ datasets, the samples are generated via the R script ``scripts/prepare_corpora.R``. The `News Articles `_ dataset is used without further processing. 85 | 86 | 87 | Automated testing 88 | ----------------- 89 | 90 | The tmtoolkit package relies on the following packages for testing: 91 | 92 | - `pytest `_ as testing framework, 93 | - `hypothesis `_ for property-based testing, 94 | - `coverage `_ for measuring test coverage of the code, 95 | - `tox `_ for checking packaging and running tests in different virtual environments. 96 | 97 | All tests are implemented in the ``tests`` directory and prefixed by ``test_``. The ``conftest.py`` file contains project-wide test configuration. The ``tox.ini`` file contains configuration for setting up the virtual environments for tox. For each release, tmtoolkit aims to support the last three major Python release versions, e.g. 3.8, 3.9 and 3.10, and all of these are tested with tox along with different dependency configurations from *minimal* to *full*. To use different versions of Python on the same system, it's recommended to use the `deadsnakes repository `_ on Ubuntu or Debian Linux. 98 | 99 | The ``Makefile`` in the root folder contains a target for generating coverage reports and the coverage badge (``make cov_tests``). 100 | 101 | 102 | Documentation 103 | ------------- 104 | 105 | The `Sphinx `_ package is used for documentation. All objects exposed by the API are documented in the Sphinx format. All other parts of the documentation reside in ``doc/source``. The configuration for Sphinx lies in ``doc/source/conf.py``.
The `nbsphinx `_ package is used for generating the tutorial from Jupyter Notebooks which are also located in ``doc/source``. 106 | 107 | The ``Makefile`` in the ``doc`` folder has several targets for generating the documentation. These are: 108 | 109 | - ``make notebooks`` – run all notebooks to generate their outputs; these are stored in-place 110 | - ``make clean`` – remove everything under ``doc/build`` 111 | - ``make html`` – generate the HTML documentation from the documentation source 112 | 113 | The generated documentation then resides under ``doc/build``. 114 | 115 | The documentation is published at `tmtoolkit.readthedocs.io `_. For this, new commits to the master branch of the GitHub project or new tags are automatically built by `readthedocs.org `_. The ``.readthedocs.yaml`` file in the root folder sets up the build process for readthedocs.org. 116 | 117 | 118 | Continuous integration 119 | ---------------------- 120 | 121 | Continuous integration routines are defined via `GitHub Actions (GA) `_. For tmtoolkit, this so far only means automatic testing for new commits and releases on different machine configurations. 122 | 123 | The GA setup for the tests is done in ``.github/workflows/runtests.yml``. There are "minimal" and "full" test suites for Ubuntu, MacOS and Windows with Python versions 3.8, 3.9 and 3.10 each, which means 18 jobs are spawned. Again, tox is used for running the tests on the machines. 124 | 125 | 126 | Release management 127 | ------------------ 128 | 129 | Publishing a new release for tmtoolkit involves several steps, listed below. You may consider creating a `pre-release `_ for PyPI first before publishing a final release. 130 | 131 | 1.
Preparation: 132 | 133 | - create a new branch for the release version X.Y.Z as ``releaseX.Y.Z`` 134 | - check if there are new minimum version requirements for dependencies or generally new dependencies to be added in ``setup.py`` 135 | - check if the compatible Python versions should be updated in ``setup.py`` 136 | - set the new version in ``setup.py`` and ``tmtoolkit/__init__.py`` 137 | 138 | 2. Documentation updates: 139 | 140 | - check and possibly update the tutorials – do all code examples still work and are all important features covered? 141 | - update documentation 142 | - update README 143 | - update changelog (``doc/source/version_history.rst``) 144 | 145 | 3. Testing: 146 | 147 | - run examples and check if they work 148 | - run tests locally via tox 149 | - push to GitHub repository ``develop`` or ``release*`` branch to run tests via GitHub Actions 150 | 151 | 4. Publish package to PyPI: 152 | 153 | - build source distribution via ``make sdist`` 154 | - build wheel via ``make wheel`` 155 | - check both via ``twine check dist/...`` 156 | - if checks passed, upload both to PyPI via ``twine upload dist/...`` 157 | 158 | 5. Finalization 159 | 160 | - make a new tag for the new version via ``git tag -a vX.Y.Z -m "version X.Y.Z"`` 161 | - push the new tag to the GitHub repository 162 | - create a new release from the tag in the GitHub repository 163 | - merge the development or release branch with the master branch and push the master branch to the GitHub repository 164 | - log in to `readthedocs.org `_, go to the project page, activate the current version, let it build the documentation 165 | - verify documentation on `tmtoolkit.readthedocs.io `_ 166 | 167 | If you notice a (major) mistake in a release *after* publication, you have several options like yanking the release on PyPI, publishing a post-release or updating the build number of the wheel. See `this blog post `_ for more information about these options. 
168 | 169 | 170 | API style 171 | --------- 172 | 173 | The tmtoolkit package provides a *functional API*. This is quite different from object-oriented APIs that are found in many other Python packages, where a programmer mainly uses classes and their methods that are exposed by an API. The tmtoolkit API on the other hand mainly exposes data structures and functions that operate on these data structures. In tmtoolkit, Python classes are usually used to implement more complex data structures such as documents or document corpora, but these classes don't provide (public) methods. Rather, they are used as function arguments, for example as in the large set of *corpus functions* that operate on text corpora as explained below. 174 | 175 | 176 | Implementation details 177 | ---------------------- 178 | 179 | Top-level module and setup routine 180 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 181 | 182 | The ``__main__.py`` file provides a command-line interface for the package. Its only purpose is to allow easy installation of SpaCy language models via the :ref:`setup routine `. The ``tokenseq`` module provides functions that operate on single (string) tokens or sequences of tokens. These functions are used mainly internally in the ``corpus`` module, but are also exposed by the API to be used by a package user. The ``utils.py`` module provides helper functions used internally throughout the package, but also to be possibly used by a package user. 183 | 184 | ``bow`` module 185 | ^^^^^^^^^^^^^^ 186 | 187 | This module provides functions for generating document-term-matrices (DTMs), which are central to the BoW concept, and some common statistics used for these matrices. 188 | 189 | ``corpus`` module 190 | ^^^^^^^^^^^^^^^^^ 191 | 192 | This is the central module for text processing and text mining. 193 | 194 | At the core of this module, there is the :class:`~tmtoolkit.corpus.Corpus` class implemented in ``corpus/_corpus.py``. It takes documents with raw text as input (i.e.
a dict mapping *document labels* to text strings) and applies a SpaCy NLP pipeline to it. After that, the corpus consists of :class:`~tmtoolkit.corpus.Document` (implemented in ``corpus/_document.py``) objects which contain the textual data in tokenized form, i.e. as a sequence of *tokens* (roughly translated as "words" but other text contents such as numbers and punctuation also form separate tokens). Each token comes along with several *token attributes* which were estimated using the NLP pipeline. Examples for token attributes include the Part-of-Speech tag or the lemma. 195 | 196 | The :class:`~tmtoolkit.corpus.Document` class stores the tokens and their "standard" attributes in a *token matrix*. This matrix is of shape *(N, M)* for *N* tokens and with *M* attributes. There are at least 2 or 3 attributes: ``whitespace`` (boolean – is there a whitespace after the token?), ``token`` (the actual token, i.e. "word" type) and optionally ``sent_start`` (only given when sentence information is parsed in the NLP pipeline). 197 | 198 | The token matrix is a *uint64* matrix as it stores all information as *64 bit hash values*. Compared to sequences of strings, this reduces memory usage and allows faster computations and data modifications. E.g., when you transform a token (lets say "Hello" to "hello"), you only do one transformation, calculate one new hash value and replace each occurrence of the old hash with the new hash. The hashes are calculated with SpaCy's `hash_string `_ function. For fast conversion between token/attribute hashes and strings, the mappings are stored in a *bidirectional dictionary* using the `bidict `_ package. Each column, i.e. each attribute, in the token matrix has a separate bidict in the ``bimaps`` dictionary that is shared between a corpus and each Document object. Using bidict proved to be *much* faster than using SpaCy's built in `Vocab / StringStore `_. 
199 | 200 | Besides "standard" token attributes that come from the SpaCy NLP pipeline, a user may also add custom token attributes. These are stored in each document's :attr:`~tmtoolkit.corpus.Document.custom_token_attrs` dictionary that maps an attribute name to a NumPy array. These arrays are of arbitrary type and don't use the hashing approach. Besides token attributes, there are also *document attributes*. These are attributes attached to each document, for example the *document label* (unique document identifier). Custom document attributes can be added, e.g. to record the publication year of a document. Document attributes can also be of any type and are not hashed. 201 | 202 | The :class:`~tmtoolkit.corpus.Corpus` class implements a data structure for text corpora with named documents. All these documents are stored in the corpus as :class:`~tmtoolkit.corpus.Document` objects. *Corpus functions* allow operating on Corpus objects. They are implemented in ``corpus/_corpusfuncs.py``. All corpus functions that transform/modify a corpus have an ``inplace`` argument, by default set to ``True``. If ``inplace`` is set to ``True``, the corpus will be directly modified in-place, i.e. modifying the input corpus. If ``inplace`` is set to ``False``, a copy of the input corpus is created and all modifications are applied to this copy. The original input corpus is not altered in that case. The ``corpus_func_inplace_opt`` decorator is used to mark corpus functions with the in-place option. 203 | 204 | The :class:`~tmtoolkit.corpus.Corpus` class provides parallel processing capabilities for processing large data amounts. This can be controlled with the ``max_workers`` argument. Parallel processing is then enabled at two stages: First, it is simply enabled for the SpaCy NLP pipeline by setting up the pipeline accordingly. Second, a *reusable process pool executor* is created by the means of `loky `_.
This process pool is then used in corpus functions whenever parallel execution is beneficial over serial execution. The ``parallelexec`` decorator is used to mark (inner) functions for parallel execution. 205 | 206 | 207 | ``topicmod`` module 208 | ^^^^^^^^^^^^^^^^^^^ 209 | 210 | This is the central module for computing, evaluating and analyzing topic models. 211 | 212 | In ``topicmod/evaluate.py`` there are mainly several evaluation metrics for topic models implemented. Topic models can be computed and evaluated in parallel, the base code for that is in ``topicmod/parallel.py``. Three modules use the base classes from ``topicmod/parallel.py`` to implement interfaces to popular topic modeling packages: 213 | 214 | - ``topicmod/tm_gensim.py`` for `gensim `_ 215 | - ``topicmod/tm_lda.py`` for `lda `_ 216 | - ``topicmod/tm_sklearn.py`` for `scikit-learn `_ 217 | -------------------------------------------------------------------------------- /doc/source/index.rst: -------------------------------------------------------------------------------- 1 | .. tmtoolkit documentation master file, created by 2 | sphinx-quickstart on Tue Aug 27 11:30:06 2019. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | .. include:: intro.rst 7 | 8 | .. include:: license_note.rst 9 | 10 | .. toctree:: 11 | :maxdepth: 4 12 | :caption: Contents: 13 | 14 | install 15 | getting_started 16 | text_corpora 17 | preprocessing 18 | bow 19 | topic_modeling 20 | api 21 | development 22 | version_history 23 | 24 | Indices and tables 25 | ================== 26 | 27 | * :ref:`genindex` 28 | * :ref:`modindex` 29 | * :ref:`search` 30 | -------------------------------------------------------------------------------- /doc/source/install.rst: -------------------------------------------------------------------------------- 1 | .. 
_install: 2 | 3 | Installation 4 | ============ 5 | 6 | Requirements 7 | ------------ 8 | 9 | **tmtoolkit works with Python 3.8 or newer (tested up to Python 3.10).** 10 | 11 | Requirements are automatically installed via *pip* as described below. Additional packages can also be installed 12 | via *pip* for certain use cases (see :ref:`optional_packages`). 13 | 14 | 15 | Installation instructions 16 | ------------------------- 17 | 18 | The package *tmtoolkit* is available on `PyPI `_ and can be installed via 19 | Python package manager *pip*. It is highly recommended to install tmtoolkit and its dependencies in a separate 20 | `Python Virtual Environment ("venv") `_ and upgrade to the latest 21 | *pip* version (you may also choose to install 22 | `virtualenvwrapper `_, which makes managing venvs a lot 23 | easier). 24 | 25 | Creating and activating a venv *without* virtualenvwrapper: 26 | 27 | .. code-block:: text 28 | 29 | python3 -m venv myenv 30 | 31 | # activating the environment (on Windows type "myenv\Scripts\activate.bat") 32 | source myenv/bin/activate 33 | 34 | Alternatively, creating and activating a venv *with* virtualenvwrapper: 35 | 36 | .. code-block:: text 37 | 38 | mkvirtualenv myenv 39 | 40 | # activating the environment 41 | workon myenv 42 | 43 | Upgrading pip (*only* do this when you've activated your venv): 44 | 45 | .. code-block:: text 46 | 47 | pip install -U pip 48 | 49 | The tmtoolkit package is highly modular and tries to install as few software dependencies as possible. So in order to 50 | install tmtoolkit, you can first choose if you want a minimal installation or install a recommended set of 51 | packages that enable most features. For the recommended installation, you can type **one of the following**, depending 52 | on the preferred package for topic modeling: 53 | 54 | .. 
code-block:: text 55 | 56 | # recommended installation without topic modeling 57 | pip install -U "tmtoolkit[recommended]" 58 | 59 | # recommended installation with "lda" for topic modeling 60 | pip install -U "tmtoolkit[recommended,lda]" 61 | 62 | # recommended installation with "scikit-learn" for topic modeling 63 | pip install -U "tmtoolkit[recommended,sklearn]" 64 | 65 | # recommended installation with "gensim" for topic modeling 66 | pip install -U "tmtoolkit[recommended,gensim]" 67 | 68 | # you may also select several topic modeling packages 69 | pip install -U "tmtoolkit[recommended,lda,sklearn,gensim]" 70 | 71 | The **minimal** installation will only install a base set of dependencies and will only enable the modules for BoW 72 | statistics, token sequence operations, topic modeling and utility functions. You can install it as follows: 73 | 74 | .. code-block:: text 75 | 76 | # alternative installation if you only want to install a minimum set of dependencies 77 | pip install -U tmtoolkit 78 | 79 | .. note:: The tmtoolkit package is about 7MB big, because it contains some example corpora. 80 | 81 | .. _setup: 82 | 83 | **After that, you should initially run tmtoolkit's setup routine.** This makes sure that all required data files are 84 | present and downloads them if necessary. You should specify a list of languages for which language models should be 85 | downloaded and installed. The list of available language models corresponds with the models provided by 86 | `SpaCy `_ (except for "multi-language"). You need to specify the two-letter ISO 87 | language code for the language models that you want to install. **Don't use spaces in the list of languages.** 88 | E.g. in order to install models for English and German: 89 | 90 | .. code-block:: text 91 | 92 | python -m tmtoolkit setup en,de 93 | 94 | To install *all* available language models, you can run: 95 | 96 | .. code-block:: text 97 | 98 | python -m tmtoolkit setup all 99 | 100 | .. 
_optional_packages: 101 | 102 | Optional packages 103 | ----------------- 104 | 105 | For additional features, you can install further packages using the following installation options: 106 | 107 | - ``pip install -U tmtoolkit[textproc_extra]`` for Unicode normalization and simplification and for stemming with *nltk* 108 | - ``pip install -U tmtoolkit[wordclouds]`` for generating word clouds 109 | - ``pip install -U tmtoolkit[lda]`` for topic modeling with LDA 110 | - ``pip install -U tmtoolkit[sklearn]`` for topic modeling with scikit-learn 111 | - ``pip install -U tmtoolkit[gensim]`` for topic modeling and additional evaluation metrics with Gensim 112 | - ``pip install -U tmtoolkit[topic_modeling_eval_extra]`` for topic modeling evaluation metrics ``griffiths_2004`` and 113 | ``held_out_documents_wallach09`` (see further information below) 114 | 115 | For LDA evaluation metrics ``griffiths_2004`` and ``held_out_documents_wallach09`` it is necessary to install 116 | `gmpy2 `_ for multiple-precision arithmetic. This in turn requires installing some C 117 | header libraries for GMP, MPFR and MPC. On Debian/Ubuntu systems this is done with: 118 | 119 | .. code-block:: text 120 | 121 | sudo apt install libgmp-dev libmpfr-dev libmpc-dev 122 | -------------------------------------------------------------------------------- /doc/source/intro.rst: -------------------------------------------------------------------------------- 1 | tmtoolkit: Text mining and topic modeling toolkit 2 | ================================================= 3 | 4 | |pypi| |pypi_downloads| |rtd| |runtests| |coverage| |zenodo| 5 | 6 | *tmtoolkit* is a set of tools for text mining and topic modeling with Python developed especially for the use in the 7 | social sciences, in journalism or related disciplines. 
It aims for easy installation, extensive documentation 8 | and a clear programming interface while offering good performance on large datasets by the means of vectorized 9 | operations (via NumPy) and parallel computation (using Python's *multiprocessing* module and the 10 | `loky `_ package). The basis of tmtoolkit's text mining capabilities is built around 11 | `SpaCy `_, which offers `many language models `_. Currently, 12 | the following languages are supported for text mining: 13 | 14 | - Catalan 15 | - Chinese 16 | - Danish 17 | - Dutch 18 | - English 19 | - French 20 | - German 21 | - Greek 22 | - Italian 23 | - Japanese 24 | - Lithuanian 25 | - Macedonian 26 | - Norwegian Bokmål 27 | - Polish 28 | - Portuguese 29 | - Romanian 30 | - Russian 31 | - Spanish 32 | 33 | The documentation for tmtoolkit is available on `tmtoolkit.readthedocs.org `_ and 34 | the GitHub code repository is on 35 | `github.com/WZBSocialScienceCenter/tmtoolkit `_. 36 | 37 | Features 38 | -------- 39 | 40 | Text preprocessing and text mining 41 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 42 | 43 | The tmtoolkit package offers several text preprocessing and text mining methods, including: 44 | 45 | - `tokenization, sentence segmentation, part-of-speech (POS) tagging, named-entity recognition (NER) `_ (via SpaCy) 46 | - `lemmatization and token normalization `_ 47 | - extensive `pattern matching capabilities `_ 48 | (exact matching, regular expressions or "glob" patterns) to be used in many 49 | methods of the package, e.g.
for filtering on token or document level, or for 50 | `keywords-in-context (KWIC) `_ 51 | - adding and managing 52 | `custom document and token attributes `_ 53 | - accessing text corpora along with their 54 | `document and token attributes as dataframes `_ 55 | - calculating and `visualizing corpus summary statistics `_ 56 | - finding out and joining `collocations `_ 57 | - `splitting and sampling corpora `_ 58 | - generating `n-grams `_ 59 | - generating `sparse document-term matrices `_ 60 | 61 | Wherever possible and useful, these methods can operate in parallel to speed up computations with large datasets. 62 | 63 | Topic modeling 64 | ^^^^^^^^^^^^^^ 65 | 66 | - `model computation in parallel `_ for different corpora 67 | and/or parameter sets 68 | - support for `lda `_, 69 | `scikit-learn `_ 70 | and `gensim `_ topic modeling backends 71 | - `evaluation of topic models `_ (e.g. in order to find an optimal number 72 | of topics for a given dataset) using several implemented metrics: 73 | 74 | - model coherence (`Mimno et al. 2011 `_) or with 75 | `metrics implemented in Gensim `_ 76 | - KL divergence method (`Arun et al. 2010 `_) 77 | - probability of held-out documents (`Wallach et al. 2009 `_) 78 | - pair-wise cosine distance method (`Cao Juan et al. 2009 `_) 79 | - harmonic mean method (`Griffiths, Steyvers 2004 `_) 80 | - the loglikelihood or perplexity methods natively implemented in lda, sklearn or gensim 81 | 82 | - `plotting of evaluation results `_ 83 | - `common statistics for topic models `_ such as 84 | word saliency and distinctiveness (`Chuang et al. 2012 `_), topic-word 85 | relevance (`Sievert and Shirley 2014 `_) 86 | - `finding / filtering topics with pattern matching `_ 87 | - `export estimated document-topic and topic-word distributions to Excel 88 | `_ 89 | - `visualize topic-word distributions and document-topic distributions `_ 90 | as word clouds or heatmaps 91 | - model coherence (`Mimno et al.
2011 `_) for individual topics 92 | - integrate `PyLDAVis `_ to visualize results 93 | 94 | 95 | Other features 96 | ^^^^^^^^^^^^^^ 97 | 98 | - loading and cleaning of raw text from 99 | `text files, tabular files (CSV or Excel), ZIP files or folders `_ 100 | - `splitting and joining documents `_ 101 | - `common statistics and transformations for document-term matrices `_ like word cooccurrence and *tf-idf* 102 | 103 | 104 | Limits 105 | ------ 106 | 107 | - only languages are supported, for which `SpaCy language models `_ are available 108 | - all data must reside in memory, i.e. no streaming of large data from the hard disk (which for example 109 | `Gensim `_ supports) 110 | 111 | 112 | Built-in datasets 113 | ----------------- 114 | 115 | Currently tmtoolkit comes with the following built-in datasets which can be loaded via 116 | :meth:`~tmtoolkit.corpus.Corpus.from_builtin_corpus`: 117 | 118 | - *"en-NewsArticles"*: `News Articles `_ 119 | *(Dai, Tianru, 2017, "News Articles", https://doi.org/10.7910/DVN/GMFCTR, Harvard Dataverse, V1)* 120 | - random samples from `ParlSpeech V2 `_ 121 | *(Rauh, Christian; Schwalbach, Jan, 2020, "The ParlSpeech V2 data set: Full-text corpora of 6.3 million parliamentary speeches in the key legislative chambers of nine representative democracies", https://doi.org/10.7910/DVN/L4OAKN, Harvard Dataverse)* for different languages: 122 | 123 | - *"de-parlspeech-v2-sample-bundestag"* 124 | - *"en-parlspeech-v2-sample-houseofcommons"* 125 | - *"es-parlspeech-v2-sample-congreso"* 126 | - *"nl-parlspeech-v2-sample-tweedekamer"* 127 | 128 | 129 | About this documentation 130 | ------------------------ 131 | 132 | This documentation guides you in several chapters from installing tmtoolkit to its specific use cases and shows some 133 | examples with built-in corpora and other datasets. All "hands on" chapters from 134 | `Getting started `_ to `Topic modeling `_ are generated from 135 | `Jupyter Notebooks `_. 
If you want to follow along using these notebooks, you can download them 136 | from the `GitHub repository `_. 137 | 138 | There are also a few other examples as plain Python scripts available in the 139 | `examples folder `_ of the GitHub repository. 140 | 141 | 142 | .. |pypi| image:: https://badge.fury.io/py/tmtoolkit.svg 143 | :target: https://badge.fury.io/py/tmtoolkit 144 | :alt: PyPI Version 145 | 146 | .. |pypi_downloads| image:: https://img.shields.io/pypi/dm/tmtoolkit 147 | :target: https://pypi.org/project/tmtoolkit/ 148 | :alt: Downloads from PyPI 149 | 150 | .. |runtests| image:: https://github.com/WZBSocialScienceCenter/tmtoolkit/actions/workflows/runtests.yml/badge.svg 151 | :target: https://github.com/WZBSocialScienceCenter/tmtoolkit/actions/workflows/runtests.yml 152 | :alt: GitHub Actions CI Build Status 153 | 154 | .. |coverage| image:: https://raw.githubusercontent.com/WZBSocialScienceCenter/tmtoolkit/master/coverage.svg?sanitize=true 155 | :target: https://github.com/WZBSocialScienceCenter/tmtoolkit/tree/master/tests 156 | :alt: Coverage status 157 | 158 | .. |rtd| image:: https://readthedocs.org/projects/tmtoolkit/badge/?version=latest 159 | :target: https://tmtoolkit.readthedocs.io/en/latest/?badge=latest 160 | :alt: Documentation Status 161 | 162 | .. |zenodo| image:: https://zenodo.org/badge/109812180.svg 163 | :target: https://zenodo.org/badge/latestdoi/109812180 164 | :alt: Citable Zenodo DOI 165 | -------------------------------------------------------------------------------- /doc/source/license_note.rst: -------------------------------------------------------------------------------- 1 | License 2 | ======= 3 | 4 | Code licensed under `Apache License 2.0 `_. 5 | See `LICENSE `_ file. 6 | 7 | -------------------------------------------------------------------------------- /doc/source/version_history.rst: -------------------------------------------------------------------------------- 1 | .. 
_changes: 2 | 3 | Version history 4 | =============== 5 | 6 | 0.11.2 - 2022-03-11 7 | ------------------- 8 | 9 | - updated `Arun et al. 2010 `_ topic model evaluation metric to better follow the paper's instructions instead of the implementation adapted from the *ldatuning* package (see `this discussion `_ – many thanks to `@hkimber `_) 10 | - updated `Mimno et al. 2011 `_ topic model evaluation metric's default argument values to be the same as used in the original paper; added an optional argument to include word probabilities into the calculations 11 | - added an example with topic model evaluation for the `AP and NIPS datasets `_ 12 | - added a `developer documentation `_ 13 | 14 | 0.11.1 - 2022-02-10 15 | ------------------- 16 | 17 | - show better error messages when dependencies for optional module ``corpus`` are not met 18 | - fix a SciPy deprecation warning 19 | 20 | 0.11.0 - 2022-02-08 21 | ------------------- 22 | 23 | This release brings several major API changes to the text loading, text preprocessing and text mining parts of 24 | tmtoolkit. All these features are now in a single sub-module, ``corpus``. This module contains a ``Corpus`` class which 25 | holds ``Document`` objects. All text processing and text mining operations can be performed on ``Corpus`` objects. These 26 | operations are implemented as a functional API in the ``corpus`` sub-module. 27 | 28 | It is advisable to re-install tmtoolkit in a new virtual environment following the 29 | :ref:`installation instructions `. Make sure to run ``python -m tmtoolkit setup ``, where 30 | ```` is a list of language codes like ``en,fr``. 
31 | 32 | Further changes include: 33 | 34 | - added new functions for identifying and joining token collocations 35 | - added new functions for visualizing corpus summary statistics 36 | - added new function ``find_documents`` 37 | - added new text normalization functions ``normalize_unicode``, ``simplify_unicode``, ``numbers_to_magnitudes`` 38 | - added support for sentences 39 | - added support for using all SpaCy token attributes 40 | - added common ``select`` argument for many text processing/mining functions to operate only on a subset of documents 41 | - added common ``as_table`` argument for many text processing/mining functions to operate to convert the result to a 42 | (sorted) dataframe 43 | - added common ``proportions`` argument for many text processing/mining functions to convert resulting frequencies to 44 | proportions or log proportions 45 | - added common ``inplace`` argument for many text processing/mining functions to either transform a corpus in-place or 46 | return a transformed copy 47 | - added 6 new languages now supported by SpaCy (Catalan, Danish, Macedonian, Polish, Romanian, Russian) 48 | - added new function ``corpus_join_documents`` for joining documents 49 | - added option for calculating log probabilities or proportions 50 | - fixed log probability calculations for higher precision in BoW statistics and topic model evaluation functions 51 | - dependencies for text processing and text mining are now optional 52 | - added function for easier logging: ``enable_logging`` 53 | - moved all functions that operate on string or numeric sequences to ``tokenseq`` sub-module 54 | - all glob patterns now use ``EXACT`` flag 55 | - added type annotations for ``corpus``, ``tokenseq`` and ``utils`` sub-modules 56 | - updated dependencies (only SpaCy 3.2 or higher is now supported) 57 | - updated minimum Python requirements (Python 3.8 or higher) 58 | - removed datatable support 59 | 60 | 61 | 0.10.0 - 2020-08-03 62 | ------------------- 63 | 64 | 
This release marks a switch from NLTK to `SpaCy `_ for text preprocessing tasks. With this change, 65 | much more languages are supported (see `this list `_). It is advisable to re-install tmtoolkit 66 | in a new virtual environment following the :ref:`installation instructions `. Make sure to run 67 | ``python -m tmtoolkit setup ``, where ```` is a list of language codes like ``en,fr``. 68 | 69 | Further changes: 70 | 71 | * added support for word and document vectors via SpaCy 72 | * added built-in datasets available via ``Corpus`` class 73 | * added ``ldamodel_top_word_topics`` and ``ldamodel_top_topic_docs`` functions 74 | * added new filter functions and options for ``TMPreproc`` 75 | * made stemming function optional (only available when NLTK is installed) 76 | * run DTM generation in parallel 77 | * updated dependencies 78 | * restructured tests 79 | 80 | 81 | 0.9.0 - 2019-12-20 82 | ------------------ 83 | 84 | * added usage and API documentation 85 | * added support for Arun 2010 metric in `tm_gensim` (thx to @mcooper) 86 | * added support for `datatable package `_ 87 | * added functional API for text preprocessing 88 | * added KWIC in text preprocessing 89 | * added post-installation setup routine to download necessary data files 90 | * added built-in corpora 91 | * added `sorted_terms` and `sorted_terms_data_table` to `bow_stats` 92 | * added `glue_tokens` function 93 | * retain sparse matrices in several `bow_stats` functions such as tfidf 94 | * corpus module: loading of CSV and ZIP files, added several other new methods 95 | * faster `get_dtm` (now works in parallel) 96 | * `filter_tokens` / `filter_documents` accept multiple patterns at once 97 | * lots of (partly **breaking**) changes and speed improvements in `TMPreproc` 98 | * fixed error with `ignore_case` being ignored in `token_match` for regex and glob 99 | * integrate tox 100 | * use Numpy extras for hypothesis tests 101 | * compatibility with Python 3.6, 3.7 and 3.8 102 | 103 | 104 | 0.8.0 - 
2019-02-05 105 | ------------------ 106 | 107 | * faster package and sub-module import 108 | * remove support for Python 2.7 (now only Python 3.5 and higher is supported) 109 | * use `germalemma package `_ 110 | * use importlib instead of deprecated imp 111 | * fix problem with not installing all required packages 112 | 113 | 114 | 0.7.3 - 2018-09-17 (last release to support Python 2.7) 115 | ------------------------------------------------------- 116 | 117 | * new options in `corpus` module for converting Windows linebreaks to Unix linebreaks 118 | 119 | 0.7.2 - 2018-07-23 120 | ------------------ 121 | 122 | * new option for `exclude_topics`: `return_new_topic_mapping` 123 | * fixed `issue #7 `_ (results entry about model gets overwritten) 124 | 125 | 0.7.1 - 2018-06-18 126 | ------------------ 127 | 128 | * fix stupid missing import 129 | 130 | 0.7.0 - 2018-06-18 131 | ------------------ 132 | 133 | * added sub-package `bow` with functions for DTM creation and statistics 134 | * fixed problems with evaluation and parallel calculation of gensim models (#5) 135 | * added Gensim evaluation example 136 | 137 | 0.6.3 - 2018-06-01 138 | ------------------ 139 | 140 | * made `get_vocab_and_terms` more memory-efficient 141 | * updated requirements (fixes #6) 142 | 143 | 0.6.2 - 2018-04-27 144 | ------------------ 145 | 146 | * added new function `exclude_topics` to `model_stats` 147 | 148 | 0.6.1 - 2018-04-27 149 | ------------------ 150 | 151 | * better figure title placement, grouped subplots and other improvements in `plot_eval_results` 152 | * bugfix in `model_stats` due to missing unicode literals 153 | 154 | 0.6.0 - 2018-04-25 155 | ------------------ 156 | 157 | * **API restructured: (uninstall package first when upgrading!)** 158 | * sub-package `lda_utils` is now called `topicmod` 159 | * no more `common` module in `topicmod` -> divided into `evaluate` (including evaluation metrics from former `eval_metrics`), `model_io`, `model_stats`, and `parallel` 160 | * 
added coherence metrics `PR #2 `_ 161 | * implemented modified coherence metric according to Mimno et al. 2011 as `metric_coherence_mimno_2011` 162 | * added wrapper function for coherence model provided by Gensim as `metric_coherence_gensim` 163 | * added evaluation metric with probability of held-out documents in cross-validation (see `metric_held_out_documents_wallach09`) 164 | * added new example for topic model coherence 165 | * updated examples 166 | 167 | 0.5.0 - 2018-02-13 168 | ------------------ 169 | 170 | * add `doc_paths` field to `Corpus` 171 | * change `plot_eval_results` to show individual metrics' results as subplots – **function signature changed!** 172 | 173 | 0.4.2 - 2018-02-06 174 | ------------------ 175 | 176 | * made greedy partitioning much more efficient (i.e. faster work distribution) 177 | * added package information variables 178 | * added this CHANGES document :) 179 | 180 | 0.4.1 - 2018-01-24 181 | ------------------ 182 | 183 | * fixed bug in `lda_utils.common.ldamodel_full_doc_topics` 184 | * added `topic_labels` for doc-topic heatmap 185 | * minor documentation fixes 186 | 187 | 0.4.0 - 2018-01-18 188 | ------------------ 189 | 190 | * improved parameter checks for `TMPreproc.filter_for_pos` 191 | * improved tests for `TMPreproc.filter_for_pos` 192 | * fixed broken test in Python 2.x 193 | * added `generate_topic_labels_from_top_words` 194 | * speed up in `top_n_from_distribution` 195 | * added relevance score calculation (Sievert et al 2014) 196 | * added functions to get most/least distinctive words 197 | * added saliency calculation 198 | * allow to define axis labels and plot title in `plot_eval_results` 199 | 200 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Examples 2 | 3 | This folder contains very few examples for *tmtoolkit*. 
from datetime import datetime


# timestamps recorded via add_timing() and their matching labels;
# both lists are kept in lockstep (index i of one corresponds to index i of the other)
timings = []
timing_labels = []


def add_timing(label):
    """Record the current timestamp under *label* for later reporting via :func:`print_timings`."""
    timings.append(datetime.today())
    timing_labels.append(label)


def print_timings():
    """Print the elapsed seconds between each pair of consecutive recorded timings, plus the total."""
    print('timings:')

    recorded = list(zip(timings, timing_labels))
    total = 0

    # walk consecutive (timestamp, label) pairs; the delta is attributed to the later label
    for (prev_t, _), (t, label) in zip(recorded, recorded[1:]):
        delta = (t - prev_t).total_seconds()
        print('%s: %.2f sec' % (label, delta))
        total += delta

    print('total: %.2f sec' % total)
3 | 4 | This examples requires that you have installed tmtoolkit with the recommended set of packages and have installed an 5 | English language model for spaCy: 6 | 7 | pip install -U "tmtoolkit[recommended]" 8 | python -m tmtoolkit setup en 9 | 10 | For more information, see the installation instructions: https://tmtoolkit.readthedocs.io/en/latest/install.html 11 | 12 | To benchmark whole script with `time` from command line run: 13 | 14 | PYTHONPATH=.. /usr/bin/time -v python benchmark_en_newsarticles.py [NUMBER OF WORKERS] 15 | """ 16 | 17 | import sys 18 | import logging 19 | 20 | from tmtoolkit.corpus import Corpus, doc_tokens, vocabulary, dtm, lemmatize, to_lowercase, filter_clean_tokens 21 | 22 | from examples._benchmarktools import add_timing, print_timings 23 | 24 | logging.basicConfig(level=logging.INFO) 25 | tmtoolkit_log = logging.getLogger('tmtoolkit') 26 | tmtoolkit_log.setLevel(logging.INFO) 27 | tmtoolkit_log.propagate = True 28 | 29 | if len(sys.argv) > 1: 30 | max_workers = int(sys.argv[1]) 31 | else: 32 | max_workers = 1 33 | 34 | print(f'max workers: {max_workers}') 35 | 36 | #%% 37 | 38 | add_timing('start') 39 | 40 | docs = Corpus.from_builtin_corpus('en-NewsArticles', language='en', max_workers=max_workers) 41 | print(str(docs)) 42 | 43 | #%% 44 | 45 | add_timing('load and tokenize') 46 | 47 | toks = doc_tokens(docs) 48 | add_timing('doc_tokens') 49 | 50 | toks_w_attrs = doc_tokens(docs, with_attr=True) 51 | add_timing('doc_tokens with attributes') 52 | 53 | vocab = vocabulary(docs) 54 | add_timing('vocabulary') 55 | 56 | lemmatize(docs) 57 | add_timing('lemmatize') 58 | 59 | to_lowercase(docs) 60 | add_timing('to_lowercase') 61 | 62 | filter_clean_tokens(docs) 63 | add_timing('filter_clean_tokens') 64 | 65 | dtm_ = dtm(docs) 66 | add_timing('sparse_dtm') 67 | 68 | print_timings() 69 | -------------------------------------------------------------------------------- /examples/bundestag18_tfidf.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | Example script that loads and processes the proceedings of the 18th German Bundestag and generates a tf-idf matrix. 3 | The data is quite large, consisting of 15,733 documents with 14,355,341 tokens in total. This script shows how to 4 | handle large data efficiently by using the parallel processing power of tmtoolkit and sparse matrix calculations 5 | that use few memory. 6 | 7 | Note that it is highly advisable to run this script section by section (denoted with "#%%" or even line by line in an 8 | interactive Python interpreter in order to see the effects of each code block. 9 | 10 | The data for the debates comes from offenesparlament.de, see https://github.com/Datenschule/offenesparlament-data. 11 | 12 | This examples requires that you have installed tmtoolkit with the recommended set of packages and have installed a 13 | German language model for spaCy: 14 | 15 | pip install -U "tmtoolkit[recommended]" 16 | python -m tmtoolkit setup de 17 | 18 | For more information, see the installation instructions: https://tmtoolkit.readthedocs.io/en/latest/install.html 19 | 20 | Markus Konrad 21 | June 2019 / Feb. 
2022 22 | """ 23 | 24 | import re 25 | import pickle 26 | import string 27 | import random 28 | from pprint import pprint 29 | from zipfile import ZipFile 30 | 31 | from tmtoolkit import corpus as c 32 | from tmtoolkit.corpus import visualize as cvis 33 | from tmtoolkit.tokenseq import unique_chars 34 | from tmtoolkit.bow.bow_stats import tfidf, sorted_terms_table 35 | from tmtoolkit.utils import enable_logging, pickle_data, unpickle_file 36 | import matplotlib.pyplot as plt 37 | import numpy as np 38 | import pandas as pd 39 | 40 | pd.set_option('display.width', 140) 41 | pd.set_option('display.max_columns', 100) 42 | 43 | #%% Optional: set up output log for tmtoolkit 44 | 45 | enable_logging() 46 | 47 | #%% Load the data inside the zip file 48 | 49 | print('loading data from zip file') 50 | 51 | with ZipFile('data/bt18_full.zip') as bt18zip: 52 | # there is a pickled pandas data frame inside the zipfile 53 | # extract it and load it 54 | bt18pickle = bt18zip.read('bt18_speeches_merged.pickle') 55 | bt18_data = pickle.loads(bt18pickle) 56 | 57 | # we don't need this anymore, remove it to free memory 58 | del bt18pickle, bt18zip 59 | 60 | #%% Generate document labels 61 | 62 | # format of the document labels: _ 63 | bt18_data['doc_label'] = ['%s_%s' % (str(sitzung).zfill(3), str(seq).zfill(5)) 64 | for sitzung, seq in zip(bt18_data.sitzung, bt18_data.sequence)] 65 | 66 | print('loaded data frame with %d rows:' % bt18_data.shape[0]) 67 | print(bt18_data.head()) 68 | 69 | bt18_texts = dict(zip(bt18_data.doc_label, bt18_data.text)) 70 | del bt18_data 71 | 72 | 73 | #%% Prepare raw text data preprocessing 74 | 75 | # remove some special characters 76 | 77 | corpus_chars = unique_chars(bt18_texts.values()) 78 | print('special characters in text data:') 79 | pprint(sorted(corpus_chars - set(string.printable))) 80 | 81 | keepchars = set('óéıàŁŽńôíśžê̆耺ćÖÇ₂ãüÀijܚ슟ēûçÉöáåúèäßëğîǧҫČœřïñ§°') 82 | delchars = corpus_chars - set(string.printable) - keepchars 83 | print(f'will 
# translation table that deletes every character contained in `delchars`
delchars_table = str.maketrans('', '', ''.join(delchars))


def del_special_chars(t):
    """Return document text `t` with all unwanted special characters removed.

    We will pass this function as "raw_preproc" function.
    """
    return t.translate(delchars_table)


# some contractions have a stray space in between, like "EU -Hilfen" where it should be "EU-Hilfen";
# group 2 of the regular expression (RE) captures that stray white space
pttrn_contraction_ws = re.compile(r'(\w+)(\s+)(-\w+)')


def correct_contractions(t):
    """Remove the stray white space in contractions like "EU -Hilfen" in document text `t`.

    We will pass this function as "raw_preproc" function.
    """
    # keep groups 1 and 3 of each match, i.e. drop the captured white space (group 2)
    return pttrn_contraction_ws.sub(r'\1\3', t)


# correct hyphenation issues in the documents like "groß-zügig":
# join two lowercase word parts separated by a hyphen
pttrn_hyphenation = re.compile(r'([a-zäöüß])-([a-zäöüß])')


def correct_hyphenation(t):
    """Join wrongly hyphenated lowercase word parts like "groß-zügig" in document text `t`.

    We will pass this function as "raw_preproc" function.
    """
    return pttrn_hyphenation.sub(r'\1\2', t)
the vocabulary of the whole corpus 130 | print('vocabulary:') 131 | pprint(c.vocabulary(corpus)) 132 | 133 | print(f'\nvocabulary contains {c.vocabulary_size(corpus)} tokens') 134 | 135 | #%% Display a keywords-in-context (KWIC) table 136 | 137 | print('keywords-in-context (KWIC) table for keyword "Merkel":') 138 | print(c.kwic_table(corpus, 'Merkel')) 139 | 140 | #%% Text normalization 141 | 142 | # lemmatization 143 | c.lemmatize(corpus) 144 | 145 | # convert all tokens to lowercase and apply several "cleaning" methods 146 | print('applying further token normalization') 147 | c.to_lowercase(corpus) 148 | c.filter_clean_tokens(corpus) 149 | c.remove_tokens(corpus, r'^-.+', match_type='regex') 150 | 151 | print('vocabulary:') 152 | pprint(c.vocabulary(corpus)) 153 | 154 | print(f'\nvocabulary contains {c.vocabulary_size(corpus)} tokens') 155 | 156 | # there are still some stray tokens which should be removed: 157 | c.remove_tokens(corpus, ['+40', '+', '.plädieren']) 158 | 159 | #%% Let's have a look at the most frequent tokens 160 | 161 | print('retrieving document frequencies for all tokens in the vocabulary') 162 | c.vocabulary_counts(corpus, proportions=1, as_table='-freq').head(50) 163 | 164 | # the rank - count plot shows quite a deviation from Zipf's law, because we already applied some token normalization 165 | fig, ax = plt.subplots() 166 | cvis.plot_ranked_vocab_counts(fig, ax, corpus, zipf=True) 167 | plt.show() 168 | 169 | #%% Further token cleanup 170 | 171 | # we can remove tokens above a certain threshold of (relative or absolute) document frequency 172 | c.remove_common_tokens(corpus, df_threshold=0.8) 173 | 174 | # since we'll later use tf-idf, removing very common or very uncommon tokens may not even be necessary; however 175 | # it reduces the computation time and memory consumption of all downstream tasks 176 | 177 | #%% Document lengths (number of tokens per document) 178 | 179 | fig, ax = plt.subplots() 180 | cvis.plot_doc_lengths_hist(fig, ax, 
corpus) 181 | plt.show() 182 | 183 | 184 | #%% Let's have a look at very short documents 185 | 186 | docsizes = c.doc_lengths(corpus, as_table='length') 187 | 188 | # document labels of documents with lesser or equal 30 tokens 189 | doc_labels_short = docsizes.doc[docsizes.length <= 30] 190 | doc_labels_short_texts = c.doc_texts(corpus, select=doc_labels_short, collapse=' ') 191 | 192 | print(f'{len(doc_labels_short)} documents with lesser or equal 30 tokens:') 193 | for lbl, txt in doc_labels_short_texts.items(): 194 | print(lbl) 195 | pprint(txt) 196 | print('---') 197 | 198 | 199 | #%% Remove very short documents 200 | 201 | print('removing documents with lesser or equal 30 tokens') 202 | c.remove_documents_by_label(corpus, doc_labels_short.to_list()) 203 | 204 | 205 | #%% Another keywords-in-context (KWIC) table 206 | 207 | print('keywords-in-context (KWIC) table for keyword "merkel" with normalized tokens:') 208 | print(c.kwic_table(corpus, 'merkel')) 209 | 210 | #%% Create a document-term-matrix (DTM) 211 | 212 | # this creates a sparse DTM where the matrix rows correspond to the current document labels and the 213 | # matrix columns correspond to the current vocabulary 214 | # the calculations take several minutes, even when they're performed in parallel 215 | 216 | print('creating document-term-matrix (DTM)') 217 | dtm = c.dtm(corpus) 218 | 219 | print('matrix created:') 220 | print(repr(dtm)) 221 | 222 | doc_labels = np.array(c.doc_labels(corpus)) 223 | vocab = np.array(c.vocabulary(corpus)) 224 | 225 | 226 | #%% Saving / loading a DTM 227 | 228 | # again, you may store the DTM along with the document labels and vocabulary to disk to later load it again: 229 | 230 | # pickle_data((dtm, doc_labels, vocab), 'data/bt18_dtm.pickle') 231 | # dtm, doc_labels, vocab = unpickle_file('data/bt18_dtm.pickle') 232 | 233 | 234 | #%% Computing a tf-idf matrix 235 | 236 | # we can apply tf-idf to the DTM 237 | # the result will remain a sparse matrix, hence it doesn't 
allocate much memory 238 | 239 | print('computing a tf-idf matrix from the DTM') 240 | tfidf_mat = tfidf(dtm) 241 | print('matrix created:') 242 | print(repr(tfidf_mat)) 243 | 244 | #%% Investigating the top tokens of the tf-idf transformed matrix 245 | 246 | # this will create a data frame of the 20 most "informative" (tf-idf-wise) tokens per document 247 | top_tokens = sorted_terms_table(tfidf_mat, vocab, doc_labels, top_n=20) 248 | 249 | random_doc = random.choice(doc_labels) 250 | print(f'20 most "informative" (tf-idf high ranked) tokens in randomly chosen document "{random_doc}":') 251 | 252 | print(top_tokens[top_tokens.index.get_level_values(0) == random_doc]) 253 | -------------------------------------------------------------------------------- /examples/data/ap.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WZBSocialScienceCenter/tmtoolkit/02990865ee896625d5cf540bf2b0dbc159bedf38/examples/data/ap.pickle -------------------------------------------------------------------------------- /examples/data/bt18_full.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WZBSocialScienceCenter/tmtoolkit/02990865ee896625d5cf540bf2b0dbc159bedf38/examples/data/bt18_full.zip -------------------------------------------------------------------------------- /examples/data/bt18_sample_1000.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WZBSocialScienceCenter/tmtoolkit/02990865ee896625d5cf540bf2b0dbc159bedf38/examples/data/bt18_sample_1000.pickle -------------------------------------------------------------------------------- /examples/data/gensim_evaluation_plot.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/WZBSocialScienceCenter/tmtoolkit/02990865ee896625d5cf540bf2b0dbc159bedf38/examples/data/gensim_evaluation_plot.png -------------------------------------------------------------------------------- /examples/data/nips.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WZBSocialScienceCenter/tmtoolkit/02990865ee896625d5cf540bf2b0dbc159bedf38/examples/data/nips.pickle -------------------------------------------------------------------------------- /examples/gensim_evaluation.py: -------------------------------------------------------------------------------- 1 | """ 2 | An example for topic modeling evaluation with gensim. 3 | 4 | Please note that this is just an example for showing how to perform topic model evaluation with Gensim. The 5 | preprocessing of the data is just done quickly and probably not the best way for the given data. 6 | 7 | This examples requires that you have installed tmtoolkit with the recommended set of packages plus Gensim and have 8 | installed a German language model for spaCy: 9 | 10 | pip install -U "tmtoolkit[recommended,gensim]" 11 | python -m tmtoolkit setup de 12 | 13 | For more information, see the installation instructions: https://tmtoolkit.readthedocs.io/en/latest/install.html 14 | 15 | """ 16 | 17 | 18 | import matplotlib.pyplot as plt 19 | import gensim 20 | import pandas as pd 21 | 22 | from tmtoolkit import corpus as c 23 | from tmtoolkit.topicmod import tm_gensim 24 | from tmtoolkit.utils import pickle_data, enable_logging 25 | from tmtoolkit.topicmod.evaluate import results_by_parameter 26 | from tmtoolkit.topicmod.visualize import plot_eval_results 27 | 28 | #%% 29 | 30 | enable_logging() 31 | 32 | #%% loading data 33 | 34 | print('loading data...') 35 | bt18 = pd.read_pickle('data/bt18_sample_1000.pickle') 36 | print('loaded %d documents' % len(bt18)) 37 | doc_labels = ['%s_%s' % info for info in zip(bt18.sitzung, 
bt18.sequence)] 38 | 39 | #%% 40 | 41 | print('loading and tokenizing documents') 42 | # minimal pipeline 43 | bt18corp = c.Corpus(dict(zip(doc_labels, bt18.text)), language='de', load_features=[], max_workers=1.0) 44 | del bt18 45 | c.print_summary(bt18corp) 46 | 47 | print('preprocessing data...') 48 | 49 | c.stem(bt18corp) 50 | c.filter_clean_tokens(bt18corp) 51 | 52 | c.print_summary(bt18corp) 53 | 54 | #%% 55 | 56 | print('creating gensim corpus...') 57 | 58 | texts = list(c.doc_tokens(bt18corp).values()) 59 | gnsm_dict = gensim.corpora.Dictionary.from_documents(texts) 60 | gnsm_corpus = [gnsm_dict.doc2bow(text) for text in texts] 61 | 62 | del bt18corp 63 | 64 | #%% 65 | 66 | # evaluate topic models with different parameters 67 | const_params = dict(update_every=0, passes=10) 68 | ks = list(range(10, 140, 10)) + list(range(140, 200, 20)) 69 | varying_params = [dict(num_topics=k, alpha=1.0 / k) for k in ks] 70 | 71 | print(f'evaluating {len(varying_params)} topic models') 72 | eval_results = tm_gensim.evaluate_topic_models((gnsm_dict, gnsm_corpus), varying_params, const_params, 73 | coherence_gensim_texts=texts) # necessary for coherence C_V metric 74 | 75 | # save the results as pickle 76 | print('saving results') 77 | pickle_data(eval_results, 'data/gensim_evaluation_results.pickle') 78 | 79 | # plot the results 80 | print('plotting evaluation results') 81 | plt.style.use('ggplot') 82 | results_by_n_topics = results_by_parameter(eval_results, 'num_topics') 83 | plot_eval_results(results_by_n_topics, xaxislabel='num. topics k', 84 | title='Evaluation results', figsize=(8, 6)) 85 | plt.savefig('data/gensim_evaluation_plot.png') 86 | plt.show() 87 | -------------------------------------------------------------------------------- /examples/minimal_tfidf.py: -------------------------------------------------------------------------------- 1 | """ 2 | A minimal example to showcase a few features of tmtoolkit. 3 | 4 | Markus Konrad 5 | Feb. 
2022 6 | """ 7 | 8 | from tmtoolkit.corpus import Corpus, tokens_table, lemmatize, to_lowercase, dtm 9 | from tmtoolkit.bow.bow_stats import tfidf, sorted_terms_table 10 | 11 | 12 | # load built-in sample dataset and use 4 worker processes 13 | corp = Corpus.from_builtin_corpus('en-News100', max_workers=4) 14 | 15 | # investigate corpus as dataframe 16 | toktbl = tokens_table(corp) 17 | print(toktbl) 18 | 19 | # apply some text normalization 20 | lemmatize(corp) 21 | to_lowercase(corp) 22 | 23 | # build sparse document-token matrix (DTM) 24 | # document labels identify rows, vocabulary tokens identify columns 25 | mat, doc_labels, vocab = dtm(corp, return_doc_labels=True, return_vocab=True) 26 | 27 | # apply tf-idf transformation to DTM 28 | # operation is applied on sparse matrix and uses few memory 29 | tfidf_mat = tfidf(mat) 30 | 31 | # show top 5 tokens per document ranked by tf-idf 32 | top_tokens = sorted_terms_table(tfidf_mat, vocab, doc_labels, top_n=5) 33 | print(top_tokens) 34 | -------------------------------------------------------------------------------- /examples/topicmod_ap_nips_eval.py: -------------------------------------------------------------------------------- 1 | """ 2 | Topic model evaluation for AP and NIPS datasets (http://archive.ics.uci.edu/ml/datasets/Bag+of+Words). 3 | 4 | This examples requires that you have installed tmtoolkit with the "lda" package. 5 | 6 | pip install -U "tmtoolkit[lda]" 7 | 8 | For more information, see the installation instructions: https://tmtoolkit.readthedocs.io/en/latest/install.html 9 | 10 | .. 
codeauthor:: Markus Konrad 11 | """ 12 | 13 | import os 14 | import sys 15 | 16 | import numpy as np 17 | import matplotlib.pyplot as plt 18 | 19 | from tmtoolkit.utils import unpickle_file, enable_logging 20 | from tmtoolkit.topicmod.tm_lda import evaluate_topic_models, AVAILABLE_METRICS 21 | from tmtoolkit.topicmod.evaluate import results_by_parameter 22 | from tmtoolkit.topicmod.visualize import plot_eval_results 23 | 24 | 25 | #%% 26 | 27 | if len(sys.argv) != 5: 28 | print('req. args: dataset, number of workers, eta, alpha numerator') 29 | exit(1) 30 | 31 | dataset = sys.argv[1] 32 | n_workers = int(sys.argv[2]) 33 | eta = float(sys.argv[3]) 34 | alpha_numerator = float(sys.argv[4]) 35 | 36 | print(f'dataset: {dataset}, workers: {n_workers}, beta: {eta}, alpha numerator: {alpha_numerator}') 37 | 38 | dataset_short = os.path.basename(dataset)[:-7] 39 | 40 | #%% 41 | 42 | enable_logging() 43 | 44 | #%% 45 | 46 | print('loading data...') 47 | 48 | doc_labels, vocab, dtm = unpickle_file(dataset) 49 | doc_labels = np.asarray(doc_labels) 50 | vocab = np.asarray(vocab) 51 | 52 | #%% 53 | 54 | print('running evaluations...') 55 | 56 | const_params = { 57 | 'n_iter': 1500, 58 | 'eta': eta, 59 | 'random_state': 20220105 # to make results reproducible 60 | } 61 | 62 | var_params = [{'n_topics': k, 'alpha': alpha_numerator/k} 63 | for k in list(range(20, 201, 2))] 64 | 65 | metrics = ['arun_2010', 'cao_juan_2009', 'coherence_mimno_2011'] 66 | 67 | if 'griffiths_2004' in AVAILABLE_METRICS: 68 | metrics.append('griffiths_2004') 69 | 70 | eval_results = evaluate_topic_models(dtm, 71 | varying_parameters=var_params, 72 | constant_parameters=const_params, 73 | return_models=False, 74 | metric=metrics, 75 | n_max_processes=n_workers) 76 | 77 | #%% 78 | 79 | print('plotting evaluations...') 80 | 81 | eval_by_topics = results_by_parameter(eval_results, 'n_topics') 82 | plot_eval_results(eval_by_topics, 83 | title=f'Evaluation results for 
"""
An example for topic modeling with LDA with focus on the new plotting functions in `tmtoolkit.corpus.visualize` and
in `tmtoolkit.topicmod.visualize`.

This examples requires that you have installed tmtoolkit with the recommended set of packages plus "lda" and have
installed an English language model for spaCy:

    pip install -U "tmtoolkit[recommended,lda]"
    python -m tmtoolkit setup en

For more information, see the installation instructions: https://tmtoolkit.readthedocs.io/en/latest/install.html

.. codeauthor:: Markus Konrad
"""

import os.path

import matplotlib.pyplot as plt

from tmtoolkit.utils import enable_logging, pickle_data, unpickle_file
from tmtoolkit.corpus import Corpus, lemmatize, to_lowercase, remove_punctuation, remove_common_tokens, \
    remove_uncommon_tokens, filter_clean_tokens, print_summary, remove_documents_by_length, dtm, \
    corpus_retokenize, save_corpus_to_picklefile, load_corpus_from_picklefile
from tmtoolkit.corpus.visualize import plot_doc_lengths_hist, plot_doc_frequencies_hist, plot_vocab_counts_hist, \
    plot_ranked_vocab_counts, plot_num_sents_hist, plot_sent_lengths_hist, plot_num_sents_vs_sent_length, \
    plot_token_lengths_hist
from tmtoolkit.topicmod.tm_lda import evaluate_topic_models  # we're using lda for topic modeling
from tmtoolkit.topicmod.evaluate import results_by_parameter
from tmtoolkit.topicmod.model_io import print_ldamodel_topic_words
from tmtoolkit.topicmod.visualize import plot_eval_results, plot_topic_word_ranked_prob, plot_doc_topic_ranked_prob

#%%

enable_logging()

#%% loading the sample corpus (English news articles)

corp_picklefile = 'data/topicmod_lda_corpus.pickle'

if os.path.exists(corp_picklefile):
    docs = load_corpus_from_picklefile(corp_picklefile)
else:
    docs = Corpus.from_builtin_corpus('en-NewsArticles', max_workers=1.0)
    save_corpus_to_picklefile(docs, corp_picklefile)

print_summary(docs)


#%% plot some corpus summary statistics

# you can copy those and also do the plotting also after corpus transformations in the next cell
# this shows you nicely how the transformations change the distribution of words in the corpus

# (plot function, keyword arguments) pairs – each one gets its own figure
summary_plots = [
    (plot_doc_lengths_hist, {}),
    (plot_vocab_counts_hist, {}),
    (plot_ranked_vocab_counts, {'zipf': True}),
    (plot_doc_frequencies_hist, {}),
    (plot_num_sents_hist, {}),
    (plot_sent_lengths_hist, {}),
    (plot_num_sents_vs_sent_length, {}),
    (plot_token_lengths_hist, {}),
]

for plot_fn, plot_kwargs in summary_plots:
    fig, ax = plt.subplots()
    plot_fn(fig, ax, docs, **plot_kwargs)
    plt.show()

#%% apply preprocessing pipeline

corp_preproc_picklefile = 'data/topicmod_lda_corpus_preprocessed.pickle'

if os.path.exists(corp_preproc_picklefile):
    docs = load_corpus_from_picklefile(corp_preproc_picklefile)
else:
    remove_punctuation(docs)
    corpus_retokenize(docs)
    lemmatize(docs)
    to_lowercase(docs)
    filter_clean_tokens(docs, remove_numbers=True)
    remove_common_tokens(docs, df_threshold=0.90)
    remove_uncommon_tokens(docs, df_threshold=0.05)
    remove_documents_by_length(docs, '<', 30)

    save_corpus_to_picklefile(docs, corp_preproc_picklefile)

print_summary(docs)

#%% generating the document-term matrix

dtm_picklefile = 'data/topicmod_lda_dtm.pickle'

if os.path.exists(dtm_picklefile):
    bow_mat, doc_labels, vocab = unpickle_file(dtm_picklefile)
else:
    bow_mat, doc_labels, vocab = dtm(docs, return_doc_labels=True, return_vocab=True)
    pickle_data((bow_mat, doc_labels, vocab), dtm_picklefile)


#%% running the evaluation

eval_res_picklefile = 'data/topicmod_lda_eval_res.pickle'

# BUG FIX: this condition formerly checked `dtm_picklefile`, so whenever the DTM cache existed but the
# evaluation results cache did not, `unpickle_file` was called on a missing file and the script crashed
if os.path.exists(eval_res_picklefile):
    eval_results = unpickle_file(eval_res_picklefile)
else:
    const_params = {
        'n_iter': 1500,
        'eta': 0.3,
        'random_state': 20220105  # to make results reproducible
    }

    var_params = [{'n_topics': k, 'alpha': 10.0/k}
                  for k in list(range(20, 101, 20)) + [125, 150, 175, 200, 250, 300]]

    metrics = ['cao_juan_2009', 'arun_2010', 'coherence_mimno_2011', 'griffiths_2004']

    eval_results = evaluate_topic_models(bow_mat,
                                         varying_parameters=var_params,
                                         constant_parameters=const_params,
                                         return_models=True,
                                         metric=metrics)

    pickle_data(eval_results, eval_res_picklefile)

#%% plotting evaluation results

eval_by_topics = results_by_parameter(eval_results, 'n_topics')
plot_eval_results(eval_by_topics)

plt.show()

#%% selecting the model and printing the topics' most likely words

selected_model = dict(eval_by_topics)[200]['model']

print_ldamodel_topic_words(selected_model.topic_word_, vocab=vocab)

#%% investigating, how many "top words" sufficiently describe a topic

fig, ax = plt.subplots()
plot_topic_word_ranked_prob(fig, ax, selected_model.topic_word_, n=40, log_scale=False,
                            highlight=[4, 12, 32], alpha=0.025)

plt.show()

# -> about 5 to 10 words aggregate most of the probability per topic

#%% investigating, how many "top topics" sufficiently describe a document

fig, ax = plt.subplots()
plot_doc_topic_ranked_prob(fig, ax, selected_model.doc_topic_, n=40, log_scale=False, highlight=list(range(4)),
                           alpha=0.003)

plt.show()

# -> about 10 to 15 topics aggregate most of the probability per document
Corp_Bundestag_V2.rds 5 | Corp_Congreso_V2.rds 6 | Corp_HouseOfCommons_V2.rds 7 | Corp_TweedeKamer_V2.rds 8 | ``` 9 | 10 | Due to their size, they are not part of the repository. You can download the respective datasets from https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/L4OAKN. 11 | 12 | Furthermore, this folder contains the NIPS dataset (`vocab.nips.txt` and `docword.nips.txt`) which can be obtained from http://archive.ics.uci.edu/ml/datasets/Bag+of+Words. 13 | -------------------------------------------------------------------------------- /scripts/nips_data.py: -------------------------------------------------------------------------------- 1 | """ 2 | Convert NIPS data from http://archive.ics.uci.edu/ml/datasets/Bag+of+Words to sparse DTM format stored as pickle file. 3 | 4 | Markus Konrad 5 | """ 6 | 7 | import numpy as np 8 | from scipy.sparse import coo_matrix 9 | from tmtoolkit.utils import pickle_data 10 | 11 | 12 | #%% 13 | 14 | with open('fulldata/vocab.nips.txt') as f: 15 | vocab = np.array([l.strip() for l in f.readlines() if l.strip()]) 16 | 17 | #%% 18 | 19 | n_docs = None 20 | n_vocab = None 21 | n_nonzero = None 22 | entries = [] 23 | row_ind = [] 24 | col_ind = [] 25 | 26 | with open('fulldata/docword.nips.txt') as f: 27 | for i, l in enumerate(f): 28 | l = l.strip() 29 | 30 | if i < 3: 31 | n = int(l) 32 | if i == 0: 33 | n_docs = n 34 | elif i == 1: 35 | n_vocab = n 36 | elif i == 2: 37 | n_nonzero = n 38 | else: 39 | j, k, n = list(map(int, l.split())) 40 | entries.append(n) 41 | row_ind.append(j-1) # convert to zero-based index 42 | col_ind.append(k-1) # convert to zero-based index 43 | 44 | 45 | assert len(vocab) == n_vocab 46 | assert len(entries) == len(row_ind) == len(col_ind) == n_nonzero 47 | 48 | dtm = coo_matrix((entries, (row_ind, col_ind)), shape=(n_docs, n_vocab), dtype='int64') 49 | 50 | doc_labels = np.fromiter((f'doc{str(i+1).zfill(4)}' for i in range(n_docs)), dtype=' 5 | """ 6 | 7 | import os 8 | from 
codecs import open 9 | 10 | from setuptools import setup, find_packages 11 | 12 | __title__ = 'tmtoolkit' 13 | __version__ = '0.11.2' 14 | __author__ = 'Markus Konrad' 15 | __license__ = 'Apache License 2.0' 16 | 17 | 18 | GITHUB_URL = 'https://github.com/WZBSocialScienceCenter/tmtoolkit' 19 | 20 | DEPS_BASE = ['numpy>=1.22.0', 'scipy>=1.7.0', 'globre>=0.1.5', 21 | 'pandas>=1.4.0', 'xlrd>=2.0.0', 'openpyxl>=3.0.0', 22 | 'matplotlib>=3.5.0'] 23 | 24 | DEPS_EXTRA = { 25 | 'textproc': ['spacy>=3.2.0', 'bidict>=0.21.0', 'loky>=3.0.0'], 26 | 'textproc_extra': ['PyICU>=2.8', 'nltk>=3.6.0'], 27 | 'wordclouds': ['wordcloud>=1.8.0,<1.9', 'Pillow>=9.0.0'], 28 | 'lda': ['lda>=2.0'], 29 | 'sklearn': ['scikit-learn>=1.0.0'], 30 | 'gensim': ['gensim>=4.1.0'], 31 | 'topic_modeling_eval_extra': ['gmpy2>=2.1.0'], 32 | 'test': ['pytest>=7.0.0', 'hypothesis>=6.36.0'], 33 | 'doc': ['Sphinx>=4.4.0', 'sphinx-rtd-theme>=1.0.0', 'nbsphinx>=0.8.0'], 34 | 'dev': ['coverage>=6.3', 'coverage-badge>=1.1.0', 'pytest-cov>=3.0.0', 'twine>=3.8.0', 35 | 'ipython>=8.0.0', 'jupyter>=1.0.0', 'notebook>=6.4.0', 'tox>=3.24.0', 'setuptools>=60.7.0'], 36 | } 37 | 38 | # DEPS_EXTRA['minimal'] = DEPS_BASE # doesn't work with extras_require and pip currently 39 | # see https://github.com/pypa/setuptools/issues/1139 40 | 41 | DEPS_EXTRA['recommended'] = DEPS_EXTRA['textproc'] + DEPS_EXTRA['wordclouds'] 42 | DEPS_EXTRA['all'] = [] 43 | for k, deps in DEPS_EXTRA.items(): 44 | if k not in {'recommended', 'all'}: 45 | DEPS_EXTRA['all'].extend(deps) 46 | 47 | here = os.path.abspath(os.path.dirname(__file__)) 48 | 49 | # Get the long description from the README file 50 | with open(os.path.join(here, 'README.rst'), encoding='utf-8') as f: 51 | long_description = f.read() 52 | 53 | setup( 54 | name=__title__, 55 | version=__version__, 56 | description='Text Mining and Topic Modeling Toolkit', 57 | long_description=long_description, 58 | long_description_content_type='text/x-rst', 59 | url=GITHUB_URL, 60 | 
import string

from hypothesis import strategies as st
from hypothesis.extra.numpy import arrays, array_shapes


def strategy_2d_array(dtype, minval=0, maxval=None, **kwargs):
    """
    Strategy generating 2D arrays of `dtype` with element values between `minval` and `maxval`.

    The optional keyword arguments `min_side` and `max_side` constrain the admissible array shape; all remaining
    keyword arguments are forwarded to the elements strategy.
    """
    # pull the shape constraints out of kwargs so that only elements-strategy options remain;
    # dict.pop with a default replaces the former `if key in kwargs:` boilerplate
    min_side = kwargs.pop('min_side', 1)
    max_side = kwargs.pop('max_side', None)

    if dtype is int:
        elems = st.integers(minval, maxval, **kwargs)
    elif dtype is float:
        elems = st.floats(minval, maxval, **kwargs)
    elif dtype is str:
        elems = st.text(min_size=minval, max_size=maxval, **kwargs)
    else:
        raise ValueError('no elements strategy for dtype', dtype)

    return arrays(dtype, array_shapes(min_dims=2, max_dims=2, min_side=min_side, max_side=max_side), elements=elems)


def strategy_dtm():
    """Strategy generating a document-term matrix with token counts in [0, 10000]."""
    return strategy_2d_array(int, 0, 10000)


def strategy_dtm_small():
    """Strategy generating a small document-term matrix (2 to 6 rows/columns, counts in [0, 10])."""
    return strategy_2d_array(int, 0, 10, min_side=2, max_side=6)


def strategy_2d_prob_distribution():
    """Strategy generating a 2D float array with finite values in [0, 1]."""
    return strategy_2d_array(float, 0, 1, allow_nan=False, allow_infinity=False)


def strategy_tokens(*args, **kwargs):
    """Strategy generating a list of token strings; all arguments are passed to `st.text`."""
    return st.lists(st.text(*args, **kwargs))


def strategy_lists_of_tokens(*args, **kwargs):
    """Strategy generating several documents, each one a list of token strings."""
    return st.lists(st.lists(st.text(*args, **kwargs)))


def strategy_texts(*args, **kwargs):
    """Strategy generating a list of text strings; all arguments are passed to `st.text`."""
    return st.lists(st.text(*args, **kwargs))


def strategy_texts_printable():
    """Strategy generating a list of text strings built from printable characters."""
    return strategy_texts(string.printable)


def strategy_str_str_dict(keys_args, keys_kwargs, values_args, values_kwargs):
    """Strategy generating a string -> string dict; the arguments parametrize the key/value `st.text` strategies."""
    return st.dictionaries(st.text(*keys_args, **keys_kwargs), st.text(*values_args, **values_kwargs))


def strategy_str_str_dict_printable():
    """Strategy generating a string -> string dict built from printable characters."""
    return st.dictionaries(st.text(string.printable), st.text(string.printable))
https://raw.githubusercontent.com/WZBSocialScienceCenter/tmtoolkit/02990865ee896625d5cf540bf2b0dbc159bedf38/tests/data/100NewsArticles.xlsx -------------------------------------------------------------------------------- /tests/data/3ExampleDocs.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WZBSocialScienceCenter/tmtoolkit/02990865ee896625d5cf540bf2b0dbc159bedf38/tests/data/3ExampleDocs.xlsx -------------------------------------------------------------------------------- /tests/data/tiny_model_reuters_5_topics.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WZBSocialScienceCenter/tmtoolkit/02990865ee896625d5cf540bf2b0dbc159bedf38/tests/data/tiny_model_reuters_5_topics.pickle -------------------------------------------------------------------------------- /tests/data/zipdata.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WZBSocialScienceCenter/tmtoolkit/02990865ee896625d5cf540bf2b0dbc159bedf38/tests/data/zipdata.zip -------------------------------------------------------------------------------- /tests/test_corpusimport.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for importing optional tmtoolkit.corpus module. 3 | 4 | .. 
@given(
    dtm=strategy_dtm(),
    matrix_type=st.integers(min_value=0, max_value=1),
    n_folds=st.integers(min_value=0, max_value=20)
)
def test_split_dtm_for_cross_validation(dtm, matrix_type, n_folds):
    """Check fold count, shapes and sparsity of the splits produced by `split_dtm_for_cross_validation`."""
    if matrix_type == 1:
        dtm = coo_matrix(dtm)   # also exercise the sparse-matrix code path

    n_docs, n_vocab = dtm.shape

    # guard clause: an invalid number of folds must be rejected
    if not 2 <= n_folds <= n_docs:
        with pytest.raises(ValueError):
            next(split_dtm_for_cross_validation(dtm, n_folds))
        return

    folds_seen = 0
    for fold, train_dtm, test_dtm in split_dtm_for_cross_validation(dtm, n_folds):
        assert 0 <= fold < n_folds

        if matrix_type == 1:
            # sparse input must yield sparse train/test splits
            assert issparse(train_dtm)
            assert issparse(test_dtm)

        assert train_dtm.ndim == test_dtm.ndim == 2

        # each test split holds at most 1/n_folds of the documents and the two splits partition the corpus
        assert train_dtm.shape[0] >= test_dtm.shape[0]
        assert 0 < test_dtm.shape[0] <= n_docs // n_folds
        assert train_dtm.shape[0] + test_dtm.shape[0] == n_docs
        assert train_dtm.shape[1] == test_dtm.shape[1] == n_vocab

        folds_seen += 1

    assert folds_seen == n_folds
@given(
    topic_word=strategy_2d_prob_distribution(),
    top_n=st.integers(min_value=0, max_value=20)
)
def test_ldamodel_top_word_topics(topic_word, top_n):
    """Check shape, index and columns of the top-topics-per-word data frame."""
    topic_word = np.array(topic_word)

    vocab = np.array(['t%d' % i for i in range(topic_word.shape[1])])

    if top_n < 1 or top_n > topic_word.shape[0]:
        with pytest.raises(ValueError):
            model_io.ldamodel_top_word_topics(topic_word, vocab, top_n)
    else:
        top_word_topics = model_io.ldamodel_top_word_topics(topic_word, vocab, top_n)
        colnames = np.array([model_io.DEFAULT_RANK_NAME_FMT.format(i1=i + 1) for i in range(top_n)])

        assert top_word_topics.shape == (topic_word.shape[1], top_n) == (len(vocab), top_n)
        assert np.array_equal(top_word_topics.index.values, vocab)
        assert np.array_equal(top_word_topics.columns.values, colnames)


@given(
    doc_topic=strategy_2d_prob_distribution(),
    top_n=st.integers(min_value=0, max_value=20)
)
def test_ldamodel_top_doc_topics(doc_topic, top_n):
    """Check shape, index and columns of the top-topics-per-document data frame."""
    doc_topic = np.array(doc_topic)

    doc_labels = np.array(['doc%d' % i for i in range(doc_topic.shape[0])])

    if top_n < 1 or top_n > doc_topic.shape[1]:
        with pytest.raises(ValueError):
            # BUG FIX: this branch formerly called `ldamodel_top_topic_words` (copy-paste error),
            # so the error path never exercised the function under test
            model_io.ldamodel_top_doc_topics(doc_topic, doc_labels, top_n)
    else:
        top_doc_topics = model_io.ldamodel_top_doc_topics(doc_topic, doc_labels, top_n)
        colnames = np.array([model_io.DEFAULT_RANK_NAME_FMT.format(i1=i + 1) for i in range(top_n)])

        assert top_doc_topics.shape == (doc_topic.shape[0], top_n)
        assert np.array_equal(top_doc_topics.index.values, doc_labels)
        assert np.array_equal(top_doc_topics.columns.values, colnames)


@given(
    doc_topic=strategy_2d_prob_distribution(),
    top_n=st.integers(min_value=0, max_value=20)
)
def test_ldamodel_top_topic_docs(doc_topic, top_n):
    """Check shape, index and columns of the top-documents-per-topic data frame."""
    doc_topic = np.array(doc_topic)

    doc_labels = np.array(['doc%d' % i for i in range(doc_topic.shape[0])])

    if top_n < 1 or top_n > doc_topic.shape[0]:
        with pytest.raises(ValueError):
            model_io.ldamodel_top_topic_docs(doc_topic, doc_labels, top_n)
    else:
        top_topic_docs = model_io.ldamodel_top_topic_docs(doc_topic, doc_labels, top_n)
        colnames = np.array([model_io.DEFAULT_RANK_NAME_FMT.format(i1=i + 1) for i in range(top_n)])
        rownames = np.array([model_io.DEFAULT_TOPIC_NAME_FMT.format(i1=i + 1) for i in range(doc_topic.shape[1])])

        assert top_topic_docs.shape == (doc_topic.shape[1], top_n)
        assert np.array_equal(top_topic_docs.index.values, rownames)
        assert np.array_equal(top_topic_docs.columns.values, colnames)


@given(topic_word=strategy_2d_prob_distribution())
def test_ldamodel_full_topic_words(topic_word):
    """Check that the full topic-word data frame carries a topic label column plus one column per vocabulary token."""
    topic_word = np.array(topic_word)

    vocab = np.array(['t%d' % i for i in range(topic_word.shape[1])])

    df = model_io.ldamodel_full_topic_words(topic_word, vocab)
    assert isinstance(df, pd.DataFrame)

    rownames = np.array([model_io.DEFAULT_TOPIC_NAME_FMT.format(i1=i + 1) for i in range(topic_word.shape[0])])
    assert df.columns.tolist() == ['_topic'] + list(vocab)

    assert np.array_equal(df.iloc[:, 0].to_numpy(), rownames)
doc_labels = np.array(['doc%d' % i for i in range(doc_topic.shape[0])]) 147 | 148 | df = model_io.ldamodel_full_doc_topics(doc_topic, doc_labels) 149 | assert isinstance(df, pd.DataFrame) 150 | 151 | colnames = np.array([model_io.DEFAULT_TOPIC_NAME_FMT.format(i1=i + 1) for i in range(doc_topic.shape[1])]) 152 | assert df.columns.tolist() == ['_doc'] + list(colnames) 153 | 154 | assert np.array_equal(df.iloc[:, 0].to_numpy(), doc_labels) 155 | 156 | 157 | @given(n_docs=st.integers(min_value=0, max_value=10), 158 | n_topics=st.integers(min_value=0, max_value=10), 159 | size_vocab=st.integers(min_value=0, max_value=50), 160 | top_n_topics=st.integers(min_value=0, max_value=10), 161 | top_n_words=st.integers(min_value=0, max_value=50), 162 | create_dtm=st.booleans()) 163 | def test_save_ldamodel_summary_to_excel(n_docs, n_topics, size_vocab, top_n_topics, top_n_words, create_dtm): 164 | try: 165 | import openpyxl 166 | except ImportError: 167 | pytest.skip('openpyxl not installed') 168 | 169 | topic_word = np.random.uniform(size=n_topics * size_vocab).reshape((n_topics, size_vocab)) 170 | doc_topic = np.random.uniform(size=n_docs * n_topics).reshape((n_docs, n_topics)) 171 | doc_labels = np.array(['doc%d' % i for i in range(doc_topic.shape[0])]) 172 | vocab = np.array(['t%d' % i for i in range(topic_word.shape[1])]) 173 | _, excelfile = tempfile.mkstemp(suffix='.xlsx') 174 | 175 | if create_dtm: 176 | dtm = np.random.randint(0, 10, size=n_docs*size_vocab).reshape(n_docs, size_vocab) 177 | else: 178 | dtm = None 179 | 180 | if top_n_words < 1 or top_n_words > topic_word.shape[1] or top_n_topics < 1 or top_n_topics > topic_word.shape[0]\ 181 | or n_docs < 1: 182 | with pytest.raises(ValueError): 183 | model_io.save_ldamodel_summary_to_excel(excelfile, topic_word, doc_topic, doc_labels, vocab, 184 | top_n_topics=top_n_topics, top_n_words=top_n_words) 185 | else: 186 | excelsheets = model_io.save_ldamodel_summary_to_excel(excelfile, topic_word, doc_topic, doc_labels, 
vocab, 187 | top_n_topics=top_n_topics, top_n_words=top_n_words, 188 | dtm=dtm) 189 | assert isinstance(excelsheets, OrderedDict) 190 | 191 | sheetnames = ['top_doc_topics_vals', 'top_doc_topics_labels', 'top_doc_topics_labelled_vals', 192 | 'top_topic_word_vals', 'top_topic_word_labels', 'top_topic_words_labelled_vals'] 193 | 194 | if dtm is not None: 195 | sheetnames.append('marginal_topic_distrib') 196 | 197 | assert list(excelsheets.keys()) == sheetnames 198 | 199 | for sheetn in sheetnames: 200 | assert isinstance(excelsheets[sheetn], pd.DataFrame) 201 | 202 | -------------------------------------------------------------------------------- /tests/test_topicmod_visualize.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | from hypothesis import given, strategies as st, settings 5 | 6 | import numpy as np 7 | import matplotlib.pyplot as plt 8 | 9 | from ._testtools import strategy_2d_prob_distribution 10 | 11 | from tmtoolkit.utils import empty_chararray 12 | from tmtoolkit.topicmod import model_io, visualize 13 | 14 | 15 | def test_generate_wordclouds_for_topic_words(): 16 | try: 17 | import lda 18 | import PIL 19 | from wordcloud import WordCloud 20 | except ImportError: 21 | pytest.skip('at least one of lda, Pillow, wordcloud not installed') 22 | 23 | data = model_io.load_ldamodel_from_pickle(os.path.join('tests', 'data', 'tiny_model_reuters_5_topics.pickle')) 24 | model = data['model'] 25 | vocab = data['vocab'] 26 | 27 | phi = model.topic_word_ 28 | assert phi.shape == (5, len(vocab)) 29 | 30 | topic_word_clouds = visualize.generate_wordclouds_for_topic_words(phi, vocab, 10) 31 | assert len(topic_word_clouds) == 5 32 | assert set(topic_word_clouds.keys()) == set('topic_%d' % i for i in range(1, 6)) 33 | assert all(isinstance(wc, PIL.Image.Image) for wc in topic_word_clouds.values()) 34 | 35 | topic_word_clouds = visualize.generate_wordclouds_for_topic_words(phi, vocab, 10, 36 | 
def test_generate_wordclouds_for_document_topics():
    """One word cloud per document from the top topics, with optional document filtering and canvas sizing."""
    try:
        import lda
        import PIL
        from wordcloud import WordCloud
    except ImportError:
        pytest.skip('at least one of lda, Pillow, wordcloud not installed')

    data = model_io.load_ldamodel_from_pickle(os.path.join('tests', 'data', 'tiny_model_reuters_5_topics.pickle'))
    model = data['model']
    doc_labels = data['doc_labels']

    theta = model.doc_topic_
    assert theta.shape == (len(doc_labels), 5)

    # default call: one PIL image per document, keyed by document label
    clouds = visualize.generate_wordclouds_for_document_topics(theta, doc_labels, 3)
    assert len(clouds) == len(doc_labels)
    assert set(clouds.keys()) == set(doc_labels)
    assert all(isinstance(img, PIL.Image.Image) for img in clouds.values())

    # restrict to two documents and request raw WordCloud objects with a custom canvas size
    selected_docs = doc_labels[:2]
    assert len(selected_docs) == 2
    clouds = visualize.generate_wordclouds_for_document_topics(theta, doc_labels, 3,
                                                               which_documents=selected_docs,
                                                               return_images=False,
                                                               width=640, height=480)
    assert set(clouds.keys()) == set(selected_docs)
    assert all(isinstance(wc, WordCloud) for wc in clouds.values())
    assert all(wc.width == 640 and wc.height == 480 for wc in clouds.values())
@given(topic_word=strategy_2d_prob_distribution())
def test_plot_topic_word_heatmap(topic_word):
    """Plotting a topic-word heatmap must succeed for any non-empty 2D matrix and raise ValueError otherwise."""
    topic_word = np.array(topic_word)

    # a vocabulary label per column when the input is a proper matrix, otherwise an empty char. array
    if topic_word.ndim == 2:
        vocab = np.array(['t%d' % i for i in range(topic_word.shape[1])])
    else:
        vocab = empty_chararray()

    fig, ax = plt.subplots(figsize=(8, 6))

    input_ok = topic_word.ndim == 2 and 0 not in set(topic_word.shape)
    if input_ok:
        visualize.plot_topic_word_heatmap(fig, ax, topic_word, vocab)
    else:
        with pytest.raises(ValueError):
            visualize.plot_topic_word_heatmap(fig, ax, topic_word, vocab)

    plt.close(fig)
@pytest.mark.parametrize('level, fmt', [
    (logging.DEBUG, '%(levelname)s:%(name)s:%(message)s'),
    (logging.INFO, '%(levelname)s:%(name)s:%(message)s'),
    (logging.WARNING, '%(levelname)s:%(name)s:%(message)s'),
    (logging.INFO, ''),   # empty fmt -> enable_logging is called without a format, exercising its default format
])
def test_enable_disable_logging(caplog, level, fmt):
    """
    Check that `enable_logging`, `set_logging_level` and `disable_logging` control what the "tmtoolkit" logger
    emits: nothing before enabling, messages at or above `level` after enabling, everything at DEBUG after
    lowering the level, and nothing again after disabling.
    """
    tmtk_logger = logging.getLogger('tmtoolkit')
    tmtk_logger.setLevel(logging.WARNING)  # reset to default level

    # before enable_logging, sub-WARNING messages must not reach caplog
    tmtk_logger.debug('test line debug 1')
    tmtk_logger.info('test line info 1')
    assert caplog.text == ''

    # pytest caplog fixture uses an extra logging handler (which is already added to the logger)
    if fmt == '':
        enable_logging(level, logging_handler=caplog.handler, add_logging_handler=False)
    else:
        enable_logging(level, fmt, logging_handler=caplog.handler, add_logging_handler=False)

    # a DEBUG message appears only when the enabled level is DEBUG
    tmtk_logger.debug('test line debug 2')
    if level == logging.DEBUG:
        assert caplog.text.endswith('DEBUG:tmtoolkit:test line debug 2\n')
        if fmt == '':
            # the default format starts with an ISO date prefix
            assert caplog.text.startswith(date.today().isoformat())
    else:
        assert caplog.text == ''

    caplog.clear()

    # an INFO message appears for DEBUG and INFO levels
    tmtk_logger.info('test line info 2')
    if level <= logging.INFO:
        assert caplog.text.endswith('INFO:tmtoolkit:test line info 2\n')
        if fmt == '':
            assert caplog.text.startswith(date.today().isoformat())
    else:
        assert caplog.text == ''

    if level > logging.DEBUG:  # reduce logging level to DEBUG
        caplog.clear()
        set_logging_level(logging.DEBUG)
        # after lowering the level, DEBUG messages must come through
        tmtk_logger.debug('test line debug 3')
        assert caplog.text.endswith('DEBUG:tmtoolkit:test line debug 3\n')
        if fmt == '':
            assert caplog.text.startswith(date.today().isoformat())

    caplog.clear()
    disable_logging()

    # after disabling, nothing below WARNING is recorded anymore
    tmtk_logger.debug('test line debug 4')
    tmtk_logger.info('test line info 4')

    assert caplog.text == ''
@given(x=st.lists(st.integers()),
       as_numpy_array=st.booleans())
def test_as_chararray(x, as_numpy_array):
    """`as_chararray` must return a 1D numpy string array that mirrors the input element-wise."""
    original_values = x
    arg = np.array(x) if as_numpy_array else x

    res = as_chararray(arg)

    assert isinstance(res, np.ndarray)
    assert res.ndim == 1
    assert len(res) == len(arg)
    assert np.issubdtype(res.dtype, 'str')
    # conversion to string must preserve order and values
    assert res.tolist() == [str(v) for v in original_values]
@pytest.mark.parametrize('expected, funcs, initial_arg', [
    (None, [], 1),
    (1, [lambda x: x], 1),
    (1, [lambda x: -x, lambda x: -x], 1),
    (2.0, [lambda x: x**2, math.sqrt], 2),
    (8.0, [lambda x: x**2, math.sqrt, lambda x: x**3], 2),
])
def test_applychain(expected, funcs, initial_arg):
    """`applychain` pipes a value through a sequence of functions; an empty sequence raises ValueError."""
    if expected is None:
        # no functions given -> error expected
        with pytest.raises(ValueError):
            applychain(funcs, initial_arg)
        return

    res = applychain(funcs, initial_arg)
    # float results are compared approximately, everything else exactly
    if isinstance(expected, float):
        assert math.isclose(res, expected)
    else:
        assert res == expected
@given(dicts=st.lists(st.dictionaries(st.text(), st.integers())),
       sort_keys=st.booleans(),
       safe=st.booleans())
def test_merge_dicts(dicts, sort_keys, safe):
    """Merging dicts combines all items; in safe mode, overlapping keys must raise a ValueError."""
    seen_keys = set()
    overlap = False
    for d in dicts:
        current = set(d.keys())
        if not overlap and current & seen_keys:
            overlap = True
        seen_keys |= current

    if safe and overlap and len(dicts) > 1:
        with pytest.raises(ValueError, match=r'^merging these containers would overwrite already existing contents'):
            merge_dicts(dicts, sort_keys=sort_keys, safe=safe)
    else:
        res = merge_dicts(dicts, sort_keys=sort_keys, safe=safe)
        assert isinstance(res, dict)

        total_items = sum(len(d) for d in dicts)
        if overlap:
            assert len(res) <= total_items
        else:
            assert len(res) == total_items

        # NOTE(review): with safe=False and the same key mapped to *different* values in two dicts,
        # this loop asserts both values — presumably merge_dicts keeps one of them; verify intent.
        for d in dicts:
            for k, v in d.items():
                assert res[k] == v

        assert set(res.keys()) == seen_keys
        if sort_keys:
            assert list(res.keys()) == sorted(seen_keys)
def test_combine_sparse_matrices_columnwise():
    """
    Check `combine_sparse_matrices_columnwise`: stacking several sparse matrices whose columns are identified
    by labels. The result must align the columns by sorted label, fill missing columns with zeros, optionally
    sort the rows by the given row labels, and reject inconsistent inputs with ValueError.
    """
    # 2x3 matrix with column labels C, A, D
    m1 = coo_matrix(np.array([
        [1, 0, 3],
        [0, 2, 0],
    ]))

    cols1 = list('CAD')
    rows1 = [4, 0]  # row labels. can be integers!

    # 3x4 matrix with column labels D, B, C, A
    m2 = coo_matrix(np.array([
        [0, 0, 1, 2],
        [3, 4, 5, 6],
        [2, 1, 0, 0],
    ]))

    cols2 = list('DBCA')
    rows2 = [3, 1, 2]

    # 1x2 matrix with column labels B, D
    m3 = coo_matrix(np.array([
        [9, 8],
    ]))

    cols3 = list('BD')

    # 2x1 matrix with the single column label A
    m4 = coo_matrix(np.array([
        [9],
        [8]
    ]))

    cols4 = list('A')

    # empty matrix without columns -- must be accepted and contribute nothing
    m5 = coo_matrix((0, 0), dtype=int)

    cols5 = []

    # m1 stacked on m2, columns re-ordered to sorted labels A, B, C, D
    expected_1_2 = np.array([
        [0, 0, 1, 3],
        [2, 0, 0, 0],
        [2, 0, 1, 0],
        [6, 4, 5, 3],
        [0, 1, 0, 2],
    ])

    # all five matrices stacked; trailing comments give the source matrix of each extra row
    expected_1_5 = np.array([
        [0, 0, 1, 3],
        [2, 0, 0, 0],
        [2, 0, 1, 0],
        [6, 4, 5, 3],
        [0, 1, 0, 2],
        [0, 9, 0, 8],  # 3
        [9, 0, 0, 0],  # 4
        [8, 0, 0, 0],  # 4
    ])

    # m1 + m2 with rows additionally sorted by their row labels 0..4
    expected_1_2_rows_sorted = np.array([
        [2, 0, 0, 0],
        [6, 4, 5, 3],
        [0, 1, 0, 2],
        [2, 0, 1, 0],
        [0, 0, 1, 3],
    ])

    # error cases: no matrices at all
    with pytest.raises(ValueError):
        combine_sparse_matrices_columnwise([], [])

    # number of column label lists does not match number of matrices
    with pytest.raises(ValueError):
        combine_sparse_matrices_columnwise((m1, m2), (cols1, ))

    # a column label list with the wrong number of labels
    with pytest.raises(ValueError):
        combine_sparse_matrices_columnwise((m1, m2), (cols1, list('X')))

    # more column label lists than matrices
    with pytest.raises(ValueError):
        combine_sparse_matrices_columnwise((m2, ), (cols1, cols2))

    # row labels given but empty
    with pytest.raises(ValueError):
        combine_sparse_matrices_columnwise((m1, m2), (cols1, cols2), [])

    # a row label list with the wrong number of labels for its matrix
    with pytest.raises(ValueError):
        combine_sparse_matrices_columnwise((m1, m2), (cols1, cols2), (rows1, rows1))

    with pytest.raises(ValueError):
        combine_sparse_matrices_columnwise((m1, m2), (cols1, cols2), (rows1, [0, 0, 0, 0]))

    # matrices 1 and 2, no row re-ordering
    res, res_cols = combine_sparse_matrices_columnwise((m1, m2), (cols1, cols2))

    assert isspmatrix_csr(res)
    assert res.shape == (5, 4)
    assert np.all(res.A == expected_1_2)
    assert np.array_equal(res_cols, np.array(list('ABCD')))

    # matrices 1 and 2, re-order rows
    res, res_cols, res_rows = combine_sparse_matrices_columnwise((m1, m2), (cols1, cols2), (rows1, rows2))
    assert isspmatrix_csr(res)
    assert res.shape == (5, 4)
    assert np.all(res.A == expected_1_2_rows_sorted)
    assert np.array_equal(res_cols, np.array(list('ABCD')))
    assert np.array_equal(res_rows, np.arange(5))

    # matrices 1 to 5, no row re-ordering
    res, res_cols = combine_sparse_matrices_columnwise((m1, m2, m3, m4, m5), (cols1, cols2, cols3, cols4, cols5))

    assert isspmatrix_csr(res)
    assert np.all(res.A == expected_1_5)
    assert np.array_equal(res_cols, np.array(list('ABCD')))
set default level 18 | 19 | 20 | from . import bow, topicmod, tokenseq, types, utils 21 | 22 | if not any(find_spec(pkg) is None for pkg in ('spacy', 'bidict', 'loky')): 23 | from . import corpus 24 | -------------------------------------------------------------------------------- /tmtoolkit/__main__.py: -------------------------------------------------------------------------------- 1 | """ 2 | tmtoolkit – Text Mining and Topic Modeling Toolkit for Python 3 | 4 | CLI module 5 | 6 | Markus Konrad 7 | """ 8 | 9 | HELP_TEXT = """tmtoolkit installation setup 10 | 11 | Run 12 | 13 | python -m tmtoolkit setup 14 | 15 | to install all necessary language models for languages listed as 16 | comma-separated language ISO codes in . The list must 17 | be specified without spaces. Example: 18 | 19 | python -m tmtoolkit setup en,de,ru 20 | 21 | This will install language models for English, German and Russian. 22 | To install all available language models, run: 23 | 24 | python -m tmtoolkit setup all 25 | 26 | You can pass two additional arguments: 27 | 28 | --variants=... 
    def _setup(args):
        """
        Handle the `setup` CLI command: install spaCy language models for the language codes given in `args`.

        Recognized extra arguments: ``--variants=...`` (comma-separated model size variants, default ``sm,md``)
        and ``--no-update`` (skip models whose pip package is already installed). Exits the process with a
        non-zero code on invalid input or when calling pip fails.
        """
        from spacy.cli.download import download

        # scan for a "--variants=..." switch among the positional arguments
        variants_switch = '--variants='
        i_variants_arg = None
        for i, arg in enumerate(args):
            if arg.startswith(variants_switch):
                i_variants_arg = i
                break

        if i_variants_arg is not None:
            # remove the switch from the argument list and parse its comma-separated values
            vararg = args.pop(i_variants_arg)
            variants = vararg[len(variants_switch):].split(',')
        else:
            variants = ['sm', 'md']   # default model size variants

        # "--no-update" flag: only install models that are not installed yet
        try:
            args.remove('--no-update')
            no_update = True
        except ValueError:
            no_update = False

        if not args:
            print('error: you must pass a list of two-letter ISO 639-1 language codes to install the respective '
                  'language models or the string "all" to install all available language models', file=sys.stderr)
            exit(3)

        if args == ['all']:
            install_languages = list(DEFAULT_LANGUAGE_MODELS.keys())
        else:
            # language codes may be given as several arguments and/or comma-separated within one argument
            install_languages = []
            for arg in args:
                install_languages.extend([l for l in map(str.strip, arg.split(',')) if l])

        print('checking if required spaCy data packages are installed...')

        # query pip for the set of already installed packages (JSON output)
        try:
            piplist_str = subprocess.check_output([sys.executable, '-m', 'pip', 'list',
                                                   '--disable-pip-version-check',
                                                   '--format', 'json'])
        except subprocess.CalledProcessError as exc:
            print('error: calling pip failed with the following error message\n' + str(exc), file=sys.stderr)
            exit(4)

        piplist = json.loads(piplist_str)
        installed_pkgs = set(item['name'] for item in piplist)

        for modelvar in variants:
            # map language code -> pip package name of the model in this size variant
            model_pkgs = dict(zip(DEFAULT_LANGUAGE_MODELS.keys(),
                                  map(lambda x: x.replace('_', '-') + '-' + modelvar,
                                      DEFAULT_LANGUAGE_MODELS.values())))

            for lang in install_languages:
                if lang not in DEFAULT_LANGUAGE_MODELS.keys():
                    print(f'error: no language model for language code "{lang}"', file=sys.stderr)
                    exit(5)

                lang_model_pkg = model_pkgs[lang]

                if no_update and lang_model_pkg in installed_pkgs:
                    print(f'language model package "{lang_model_pkg}" for language code "{lang}" is already installed '
                          f'-- skipping')
                    continue

                # delegate the actual download/installation to spaCy
                lang_model = DEFAULT_LANGUAGE_MODELS[lang] + '_' + modelvar
                print(f'installing language model "{lang_model}" for language code "{lang}"...')
                download(lang_model)

        print('done.')
def create_sparse_dtm(vocab, docs, n_unique_tokens, vocab_is_sorted=False, dtype=None):
    """
    Create a sparse document-term matrix (DTM) in
    `COO sparse format <https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.coo_matrix.html>`_
    from a vocabulary array `vocab`, a list of tokenized documents `docs` and the number of unique tokens
    across all documents, `n_unique_tokens`.

    Rows of the DTM correspond to documents, columns to indices into `vocab`; hence ``DTM[j, k]`` is the
    term frequency of term ``vocab[k]`` in document ``j``.

    A note on performance: filling the three component arrays of a COO matrix directly is a fast way to build
    a DTM; `n_unique_tokens` lets us pre-allocate them exactly.

    .. seealso:: This is the "low level" function. For the straight-forward to use function see
                 :func:`tmtoolkit.corpus.dtm`, which also calculates `n_unique_tokens`.

    :param vocab: list or array of vocabulary used as column names; its length defines the number of columns
    :param docs: a list of tokenized documents
    :param n_unique_tokens: number of unique tokens across all documents, i.e. number of non-zero entries
    :param vocab_is_sorted: if True, assume that `vocab` is sorted when creating the token IDs
    :param dtype: data type of the resulting matrix; defaults to 32-bit integer
    :return: a sparse document-term matrix in COO sparse format
    """
    # permutation that sorts `vocab`; not needed when the caller guarantees sorted input
    sorter = None if vocab_is_sorted else np.argsort(vocab)

    dtype = dtype or 'int32'
    # pre-allocate the three COO component arrays; each non-zero matrix entry occupies exactly one slot
    values = np.empty(n_unique_tokens, dtype=dtype)     # term frequencies
    col_ind = np.empty(n_unique_tokens, dtype=dtype)    # column index per frequency
    row_ind = np.empty(n_unique_tokens, dtype=dtype)    # row (document) index per frequency

    offset = 0   # next free slot in the component arrays
    for row, terms in enumerate(docs):
        if len(terms) == 0:
            continue   # empty documents contribute no entries

        # map each term of this document to its index in `vocab`
        if sorter is None:
            vocab_indices = np.searchsorted(vocab, terms)
        else:
            vocab_indices = sorter[np.searchsorted(vocab, terms, sorter=sorter)]

        # per-document term frequencies for the unique vocabulary indices
        uniq, freqs = np.unique(vocab_indices, return_counts=True)
        next_offset = offset + len(uniq)

        values[offset:next_offset] = freqs
        col_ind[offset:next_offset] = uniq
        row_ind[offset:next_offset] = np.repeat(row, len(uniq))

        offset = next_offset

    assert offset == len(values)   # `n_unique_tokens` must have been exact

    return coo_matrix((values, (row_ind, col_ind)), shape=(len(docs), len(vocab)), dtype=dtype)
def dtm_to_dataframe(dtm, doc_labels, vocab):
    """
    Convert a (sparse) DTM to a pandas DataFrame using document labels `doc_labels` as row index and `vocab` as column
    names.

    :param dtm: (sparse) document-term-matrix of size NxM (N docs, M is vocab size) with raw terms counts
    :param doc_labels: document labels used as row index (row names); size must equal number of rows in `dtm`
    :param vocab: list or array of vocabulary used as column names; size must equal number of columns in `dtm`
    :return: pandas DataFrame
    :raises ValueError: if `dtm` is not 2D or its shape does not match `doc_labels` / `vocab`
    """
    if dtm.ndim != 2:
        raise ValueError('`dtm` must be a 2D array/matrix')

    if dtm.shape[0] != len(doc_labels):
        # fixed message: closing backtick was missing
        raise ValueError('number of rows must be equal to `len(doc_labels)`')

    if dtm.shape[1] != len(vocab):
        # fixed message: this condition checks the *columns* against the vocabulary size,
        # but the original message wrongly said "number of rows" and lacked the closing backtick
        raise ValueError('number of columns must be equal to `len(vocab)`')

    # densify sparse input; pd.DataFrame cannot consume a scipy sparse matrix directly here
    if not isinstance(dtm, np.ndarray):
        dtm = dtm.toarray()

    return pd.DataFrame(dtm, index=doc_labels, columns=vocab)
def gensim_corpus_to_dtm(corpus):
    """
    Convert a Gensim corpus object to a sparse DTM in COO format.

    .. seealso:: :func:`~tmtoolkit.bow.dtm.dtm_to_gensim_corpus` for the inverse function.

    :param corpus: Gensim corpus object
    :return: sparse DTM in COO format
    """
    import gensim
    from scipy.sparse import coo_matrix

    # Gensim yields a terms-by-documents CSC matrix; transpose it back to documents-by-terms
    terms_by_docs = gensim.matutils.corpus2csc(corpus)
    return coo_matrix(terms_by_docs.transpose())
161 | 162 | :param dtm: (sparse) document-term-matrix of size NxM (N docs, M is vocab size) with raw terms counts 163 | :param vocab: list or array of vocabulary 164 | :param as_gensim_dictionary: if True create Gensim :class:`~gensim.corpora.dictionary.Dictionary` from `vocab`, 165 | else create Python :func:`dict` 166 | :return: a 2-tuple with (Corpus object, Gensim :class:`~gensim.corpora.dictionary.Dictionary` or 167 | Python :func:`dict`) 168 | """ 169 | corpus = dtm_to_gensim_corpus(dtm) 170 | 171 | # vocabulary array has to be converted to dict with index -> word mapping 172 | id2word = dict(zip(range(len(vocab)), vocab)) 173 | 174 | if as_gensim_dictionary: 175 | import gensim 176 | return corpus, gensim.corpora.dictionary.Dictionary().from_corpus(corpus, id2word) 177 | else: 178 | return corpus, id2word 179 | -------------------------------------------------------------------------------- /tmtoolkit/corpus/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for processing text as token sequences in labelled documents. A set of documents is represented as *corpus* 3 | using the :class:`Corpus` class. This sub-package also provides functions that work with a :class:`Corpus` object. 4 | 5 | Text parsing and processing relies on the `SpaCy library `_ which must be installed when using this 6 | sub-package. 7 | 8 | .. codeauthor:: Markus Konrad 9 | """ 10 | 11 | from importlib.util import find_spec 12 | 13 | for pkg in ('spacy', 'bidict', 'loky'): 14 | if find_spec(pkg) is None: 15 | raise RuntimeError(f'the required package "{pkg}" for text processing is not installed; did you install ' 16 | f'tmtoolkit with "recommended" or "textproc" option? 
see ' 17 | f'https://tmtoolkit.readthedocs.io/en/latest/install.html for further information') 18 | 19 | from ..tokenseq import strip_tags, numbertoken_to_magnitude, simplify_unicode_chars 20 | 21 | from ._common import DEFAULT_LANGUAGE_MODELS, LANGUAGE_LABELS, simplified_pos 22 | from ._document import Document, document_token_attr, document_from_attrs 23 | from ._corpus import Corpus 24 | 25 | from ._corpusfuncs import ( 26 | doc_tokens, set_token_attr, set_document_attr, vocabulary, dtm, doc_texts, doc_labels, doc_lengths, 27 | corpus_num_tokens, vocabulary_size, tokens_table, print_summary, vocabulary_counts, 28 | doc_frequencies, doc_vectors, token_vectors, ngrams, to_lowercase, to_uppercase, remove_chars, 29 | serialize_corpus, deserialize_corpus, save_corpus_to_picklefile, load_corpus_from_picklefile, 30 | load_corpus_from_tokens, load_corpus_from_tokens_table, spacydocs, 31 | lemmatize, remove_punctuation, normalize_unicode, simplify_unicode, doc_token_lengths, filter_clean_tokens, 32 | corpus_ngramify, filter_tokens_by_mask, remove_tokens_by_mask, filter_tokens, remove_tokens, 33 | filter_documents, remove_documents, filter_documents_by_mask, remove_documents_by_mask, 34 | filter_documents_by_docattr, remove_documents_by_docattr, kwic, kwic_table, transform_tokens, 35 | corpus_summary, corpus_num_chars, filter_tokens_with_kwic, filter_documents_by_label, 36 | remove_documents_by_label, filter_for_pos, filter_tokens_by_doc_frequency, remove_common_tokens, 37 | remove_uncommon_tokens, filter_documents_by_length, remove_documents_by_length, 38 | join_collocations_by_patterns, join_collocations_by_statistic, corpus_tokens_flattened, corpus_collocations, 39 | remove_token_attr, remove_document_attr, builtin_corpora_info, corpus_add_files, corpus_add_folder, 40 | corpus_add_tabular, corpus_add_zip, corpus_sample, corpus_split_by_token, doc_num_sents, doc_sent_lengths, 41 | numbers_to_magnitudes, corpus_split_by_paragraph, doc_labels_sample, corpus_retokenize, 
corpus_unique_chars, 42 | corpus_join_documents, find_documents 43 | ) 44 | 45 | if find_spec('nltk') is not None: # when NLTK is installed 46 | from ._nltk_extras import stem 47 | 48 | from . import visualize 49 | -------------------------------------------------------------------------------- /tmtoolkit/corpus/_common.py: -------------------------------------------------------------------------------- 1 | """ 2 | Internal module with common functions and constants for text processing in the :mod:`tmtoolkit.corpus` module. 3 | 4 | .. codeauthor:: Markus Konrad 5 | """ 6 | 7 | import os 8 | from typing import Tuple, Dict 9 | 10 | MODULE_PATH = os.path.dirname(os.path.abspath(__file__)) 11 | DATAPATH = os.path.normpath(os.path.join(MODULE_PATH, '..', 'data')) 12 | 13 | #: Default SpaCy language models used for a given two-letter ISO 639-1 language code. 14 | #: These model names will be appended with model size suffix like "_sm", "_md" or "_lg". 15 | DEFAULT_LANGUAGE_MODELS = { 16 | 'en': 'en_core_web', 17 | 'de': 'de_core_news', 18 | 'fr': 'fr_core_news', 19 | 'es': 'es_core_news', 20 | 'pt': 'pt_core_news', 21 | 'it': 'it_core_news', 22 | 'nl': 'nl_core_news', 23 | 'el': 'el_core_news', 24 | 'nb': 'nb_core_news', 25 | 'lt': 'lt_core_news', 26 | 'zh': 'zh_core_web', 27 | 'ja': 'ja_core_news', 28 | 'ca': 'ca_core_news', 29 | 'da': 'da_core_news', 30 | 'mk': 'mk_core_news', 31 | 'pl': 'pl_core_news', 32 | 'ro': 'ro_core_news', 33 | 'ru': 'ru_core_news', 34 | } 35 | 36 | #: Map two-letter ISO 639-1 language code to language name. 
37 | LANGUAGE_LABELS = { 38 | 'en': 'english', 39 | 'de': 'german', 40 | 'fr': 'french', 41 | 'es': 'spanish', 42 | 'pt': 'portuguese', 43 | 'it': 'italian', 44 | 'nl': 'dutch', 45 | 'el': 'greek', 46 | 'nb': 'norwegian-bokmal', 47 | 'lt': 'lithuanian', 48 | 'zh': 'chinese', 49 | 'ja': 'japanese', 50 | 'ca': 'catalan', 51 | 'da': 'danish', 52 | 'mk': 'macedonian', 53 | 'pl': 'polish', 54 | 'ro': 'romanian', 55 | 'ru': 'russian', 56 | } 57 | 58 | BOOLEAN_SPACY_TOKEN_ATTRS = ( 59 | 'is_alpha', 'is_ascii', 'is_digit', 'is_lower', 'is_upper', 'is_title', 60 | 'is_punct', 'is_left_punct', 'is_right_punct', 'is_space', 'is_bracket', 61 | 'is_quote', 'is_currency', 'is_stop', 'like_url', 'like_num', 'like_email', 62 | ) 63 | 64 | # SpaCy token attributes per pipeline component 65 | SPACY_TOKEN_ATTRS = { # type: Dict[str, Tuple[str]] 66 | '_default': BOOLEAN_SPACY_TOKEN_ATTRS + ('shape', 'sentiment', 'rank', 'cluster'), # always enabled 67 | 'tagger': ('tag', 'pos'), 68 | 'morphologizer': ('pos', ), 69 | 'parser': ('dep', ), 70 | 'lemmatizer': ('lemma', ), 71 | 'ner': ('ent_type', 'ent_iob'), 72 | } 73 | 74 | STD_TOKEN_ATTRS = {'is_punct', 'is_stop', 'like_num', 'tag', 'pos', 'lemma', 'ent_type'} 75 | 76 | # all token attributes that can be encoded in a uint64 matrix 77 | TOKENMAT_ATTRS = set([a for attrs in SPACY_TOKEN_ATTRS.values() for a in attrs]) \ 78 | | {'whitespace', 'token', 'sent_start'} 79 | 80 | 81 | def simplified_pos(pos: str, tagset: str = 'ud', default: str = '') -> str: 82 | """ 83 | Return a simplified POS tag for a full POS tag `pos` belonging to a tagset `tagset`. 84 | 85 | Does the following conversion by default: 86 | 87 | - all N... (noun) tags to 'N' 88 | - all V... (verb) tags to 'V' 89 | - all ADJ... (adjective) tags to 'ADJ' 90 | - all ADV... (adverb) tags to 'ADV' 91 | - all other to `default` 92 | 93 | Does the following conversion by with ``tagset=='penn'``: 94 | 95 | - all N... (noun) tags to 'N' 96 | - all V... 
(verb) tags to 'V' 97 | - all JJ... (adjective) tags to 'ADJ' 98 | - all RB... (adverb) tags to 'ADV' 99 | - all other to `default` 100 | 101 | Does the following conversion with ``tagset=='ud'``: 102 | 103 | - all NOUN and PROPN tags to 'N' 104 | - all VERB tags to 'V' 105 | - all ADJ tags to 'ADJ' 106 | - all ADV tags to 'ADV' 107 | - all other to `default` 108 | 109 | :param pos: a POS tag as string 110 | :param tagset: tagset used for `pos`; can be ``'wn'`` (WordNet), ``'penn'`` (Penn tagset) 111 | or ``'ud'`` (universal dependencies – default) 112 | :param default: default return value when tag could not be simplified 113 | :return: simplified tag string 114 | """ 115 | 116 | if pos and not isinstance(pos, str): 117 | raise ValueError('`pos` must be a string or None') 118 | 119 | if tagset == 'ud': 120 | if pos in ('NOUN', 'PROPN'): 121 | return 'N' 122 | elif pos == 'VERB': 123 | return 'V' 124 | elif pos in ('ADJ', 'ADV'): 125 | return pos 126 | else: 127 | return default 128 | elif tagset == 'penn': 129 | if pos.startswith('N') or pos.startswith('V'): 130 | return pos[0] 131 | elif pos.startswith('JJ'): 132 | return 'ADJ' 133 | elif pos.startswith('RB'): 134 | return 'ADV' 135 | else: 136 | return default 137 | elif tagset == 'wn': 138 | if pos.startswith('N') or pos.startswith('V'): 139 | return pos[0] 140 | elif pos.startswith('ADJ') or pos.startswith('ADV'): 141 | return pos[:3] 142 | else: 143 | return default 144 | else: 145 | raise ValueError('unknown tagset "%s"' % tagset) 146 | -------------------------------------------------------------------------------- /tmtoolkit/corpus/_nltk_extras.py: -------------------------------------------------------------------------------- 1 | """ 2 | Internal module with some additional functions that are only available when the `NLTK `_ package 3 | is installed. 4 | 5 | ..
codeauthor:: Markus Konrad 6 | """ 7 | from typing import Optional 8 | 9 | from ._corpus import Corpus 10 | from ._common import LANGUAGE_LABELS 11 | from ._corpusfuncs import transform_tokens 12 | 13 | 14 | def stem(docs: Corpus, /, language: Optional[str] = None, 15 | stemmer_instance: Optional[object] = None, inplace=True): 16 | """ 17 | Apply stemming to all tokens in `docs` using a stemmer `stemmer_instance`. 18 | 19 | .. note: This requires that the `NLTK `_ package is installed. 20 | 21 | :param docs: a Corpus object 22 | :param language: language in which `docs` is given; if None, will be detected from the ``language`` property of 23 | `docs`; note that this is not an ISO language code but a language 24 | label like "english" or "german" that NLTK accepts 25 | :param stemmer_instance: a stemmer instance; it must implement a method `stem` that accepts a single string; 26 | default is :class:`nltk.stem.SnowballStemmer` 27 | :param inplace: if True, modify Corpus object in place, otherwise return a modified copy 28 | :return: either original Corpus object `docs` or a modified copy of it 29 | """ 30 | 31 | from nltk.stem import SnowballStemmer 32 | 33 | if stemmer_instance is None: 34 | if language is None: 35 | language = LANGUAGE_LABELS[docs.language] 36 | stemmer_instance = SnowballStemmer(language) 37 | 38 | return transform_tokens(docs, stemmer_instance.stem, inplace=inplace) 39 | -------------------------------------------------------------------------------- /tmtoolkit/data/de/parlspeech-v2-sample-bundestag.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WZBSocialScienceCenter/tmtoolkit/02990865ee896625d5cf540bf2b0dbc159bedf38/tmtoolkit/data/de/parlspeech-v2-sample-bundestag.zip -------------------------------------------------------------------------------- /tmtoolkit/data/en/News100.zip: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/WZBSocialScienceCenter/tmtoolkit/02990865ee896625d5cf540bf2b0dbc159bedf38/tmtoolkit/data/en/News100.zip -------------------------------------------------------------------------------- /tmtoolkit/data/en/NewsArticles.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WZBSocialScienceCenter/tmtoolkit/02990865ee896625d5cf540bf2b0dbc159bedf38/tmtoolkit/data/en/NewsArticles.zip -------------------------------------------------------------------------------- /tmtoolkit/data/en/parlspeech-v2-sample-houseofcommons.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WZBSocialScienceCenter/tmtoolkit/02990865ee896625d5cf540bf2b0dbc159bedf38/tmtoolkit/data/en/parlspeech-v2-sample-houseofcommons.zip -------------------------------------------------------------------------------- /tmtoolkit/data/es/parlspeech-v2-sample-congreso.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WZBSocialScienceCenter/tmtoolkit/02990865ee896625d5cf540bf2b0dbc159bedf38/tmtoolkit/data/es/parlspeech-v2-sample-congreso.zip -------------------------------------------------------------------------------- /tmtoolkit/data/nl/parlspeech-v2-sample-tweedekamer.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WZBSocialScienceCenter/tmtoolkit/02990865ee896625d5cf540bf2b0dbc159bedf38/tmtoolkit/data/nl/parlspeech-v2-sample-tweedekamer.zip -------------------------------------------------------------------------------- /tmtoolkit/topicmod/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Topic modeling sub-package with modules for model evaluation, model I/O, model statistics, parallel computation and 3 | visualization. 
4 | 5 | Functions and classes in :mod:`~tmtoolkit.topicmod.tm_gensim`, :mod:`~tmtoolkit.topicmod.tm_lda` and 6 | :mod:`~tmtoolkit.topicmod.tm_sklearn` implement parallel model computation and evaluation using popular topic modeling 7 | packages. You need to install the respective packages (*lda*, *scikit-learn* or *gensim*) in order to use them. 8 | 9 | .. codeauthor:: Markus Konrad 10 | """ 11 | 12 | 13 | import importlib.util 14 | 15 | from . import evaluate, model_io, model_stats, parallel, visualize 16 | 17 | # conditional imports 18 | 19 | # lda package 20 | if importlib.util.find_spec('lda'): 21 | from . import tm_lda 22 | 23 | # sklearn package 24 | if importlib.util.find_spec('sklearn'): 25 | from . import tm_sklearn 26 | 27 | # gensim package 28 | if importlib.util.find_spec('gensim'): 29 | from . import tm_gensim 30 | -------------------------------------------------------------------------------- /tmtoolkit/topicmod/_common.py: -------------------------------------------------------------------------------- 1 | """ 2 | Common constants and functions for topic modeling sub-package. 3 | 4 | .. codeauthor:: Markus Konrad 5 | """ 6 | 7 | 8 | DEFAULT_TOPIC_NAME_FMT = 'topic_{i1}' 9 | DEFAULT_RANK_NAME_FMT = 'rank_{i1}' 10 | DEFAULT_VALUE_FORMAT = '{lbl} ({val:.4})' 11 | -------------------------------------------------------------------------------- /tmtoolkit/topicmod/_eval_tools.py: -------------------------------------------------------------------------------- 1 | """ 2 | Common utility functions for LDA model evaluation. 3 | 4 | .. codeauthor:: Markus Konrad 5 | """ 6 | 7 | import numpy as np 8 | from scipy.sparse import issparse 9 | 10 | 11 | def split_dtm_for_cross_validation(dtm, n_folds, shuffle_docs=True): 12 | """ 13 | Split a (sparse) document-term matrix `dtm` for n-fold cross validation with `n_folds` folds. 
14 | 15 | :param dtm: (sparse) document-term matrix 16 | :param n_folds: number of folds during cross validation 17 | :param shuffle_docs: shuffle documents (matrix rows) before splitting 18 | :return: a generator for `n_folds` folds, each yielding a 3-tuple with (fold index starting at zero, training DTM, 19 | test DTM) 20 | """ 21 | if issparse(dtm) and dtm.format != 'csr': 22 | dtm = dtm.tocsr() 23 | 24 | n_docs = dtm.shape[0] 25 | 26 | if n_folds < 2: 27 | raise ValueError('`n_folds` must be at least 2') 28 | 29 | if n_docs < n_folds: 30 | raise ValueError('not enough documents in `dtm` (must be >= `n_folds`)') 31 | 32 | rand_doc_ind = np.arange(n_docs) 33 | 34 | if shuffle_docs: 35 | np.random.shuffle(rand_doc_ind) 36 | 37 | n_per_fold = n_docs // n_folds 38 | assert n_per_fold > 0 39 | start_idx = 0 40 | for fold in range(n_folds): 41 | end_idx = start_idx + n_per_fold 42 | fold_doc_ind = rand_doc_ind[slice(start_idx, end_idx)] 43 | test_dtm = dtm[fold_doc_ind, :] 44 | 45 | if issparse(dtm): 46 | inv_fold_doc_ind = np.ones(n_docs, bool) 47 | inv_fold_doc_ind[fold_doc_ind] = 0 48 | train_dtm = dtm[inv_fold_doc_ind, :] 49 | else: 50 | train_dtm = np.delete(dtm, fold_doc_ind, axis=0) # can't be used with sparse matrices 51 | 52 | assert test_dtm.shape[0] + train_dtm.shape[0] == dtm.shape[0] 53 | 54 | yield fold, train_dtm, test_dtm 55 | 56 | start_idx = end_idx 57 | 58 | 59 | class FakedGensimDict: 60 | """ 61 | A class that resembles a Gensim :class:`~gensim.corpora.dictionary.Dictionary`. 
62 | """ 63 | def __init__(self, data): 64 | if not isinstance(data, dict): 65 | raise ValueError('`data` must be an instance of `dict`') 66 | 67 | self.id2token = data 68 | self.token2id = {v: k for k, v in data.items()} 69 | 70 | @staticmethod 71 | def from_vocab(vocab): 72 | return FakedGensimDict(dict(zip(range(len(vocab)), vocab))) 73 | 74 | def __iter__(self): 75 | """Iterate over all ids.""" 76 | return iter(self.keys()) 77 | 78 | def keys(self): 79 | """Get all stored ids.""" 80 | return self.id2token.keys() 81 | -------------------------------------------------------------------------------- /tmtoolkit/topicmod/tm_gensim.py: -------------------------------------------------------------------------------- 1 | """ 2 | Parallel model computation and evaluation using the `Gensim package `_. 3 | 4 | Available evaluation metrics for this module are listed in :data:`~tmtoolkit.topicmod.tm_gensim.AVAILABLE_METRICS`. 5 | See :mod:`tmtoolkit.topicmod.evaluate` for references and implementations of those evaluation metrics. 6 | """ 7 | 8 | import logging 9 | 10 | import numpy as np 11 | 12 | from tmtoolkit.topicmod.parallel import MultiprocModelsRunner, MultiprocModelsWorkerABC, MultiprocEvaluationRunner, \ 13 | MultiprocEvaluationWorkerABC 14 | from tmtoolkit.bow.dtm import dtm_to_gensim_corpus, gensim_corpus_to_dtm 15 | from .evaluate import metric_cao_juan_2009, metric_arun_2010, metric_coherence_mimno_2011, metric_coherence_gensim 16 | 17 | #: Available metrics for Gensim. 18 | AVAILABLE_METRICS = ( 19 | 'perplexity', 20 | 'cao_juan_2009', 21 | 'arun_2010', 22 | 'coherence_mimno_2011', 23 | 'coherence_gensim_u_mass', # same as coherence_mimno_2011 24 | 'coherence_gensim_c_v', 25 | 'coherence_gensim_c_uci', 26 | 'coherence_gensim_c_npmi', 27 | ) 28 | 29 | #: Metrics used by default. 
30 | DEFAULT_METRICS = ( 31 | 'perplexity', 32 | 'cao_juan_2009', 33 | 'coherence_mimno_2011', 34 | 'coherence_gensim_c_v' 35 | ) 36 | 37 | 38 | logger = logging.getLogger('tmtoolkit') 39 | 40 | 41 | #%% Specialized classes for parallel processing 42 | 43 | 44 | class MultiprocModelsWorkerGensim(MultiprocModelsWorkerABC): 45 | """ 46 | Specialized parallel model computations worker for Gensim. 47 | """ 48 | 49 | package_name = 'gensim' 50 | 51 | def fit_model(self, data, params, return_data=False): 52 | """ 53 | Fit model to `data` using gensim with parameter set `params`. 54 | """ 55 | from gensim.models.ldamodel import LdaModel 56 | 57 | dictionary = params.pop('dictionary', None) 58 | 59 | if hasattr(data, 'dtype') and hasattr(data, 'shape') and hasattr(data, 'transpose'): 60 | corpus = dtm_to_gensim_corpus(data) 61 | dtm = data 62 | else: 63 | if isinstance(data, tuple) and len(data) == 2: 64 | dictionary, corpus = data 65 | else: 66 | corpus = data 67 | dtm = gensim_corpus_to_dtm(corpus) 68 | 69 | model = LdaModel(corpus, id2word=dictionary, **params) 70 | 71 | if return_data: 72 | return model, (corpus, dtm) 73 | else: 74 | return model 75 | 76 | 77 | class MultiprocEvaluationWorkerGensim(MultiprocEvaluationWorkerABC, MultiprocModelsWorkerGensim): 78 | """ 79 | Specialized parallel model evaluations worker for Gensim. 80 | """ 81 | 82 | def fit_model(self, data, params, return_data=False): 83 | model, (corpus, dtm) = super(MultiprocEvaluationWorkerGensim, self).fit_model(data, params, return_data=True) 84 | 85 | results = {} 86 | if self.return_models: 87 | results['model'] = model 88 | 89 | for metric in self.eval_metric: 90 | if metric == 'cao_juan_2009': 91 | res = metric_cao_juan_2009(model.state.get_lambda()) 92 | elif metric == 'arun_2010': 93 | doc_topic_list = [] 94 | for doc_topic in model.get_document_topics(corpus): 95 | d = dict(doc_topic) 96 | # Gensim will not output near-zero prob. topics, hence the "d.get()": 97 | t = tuple(d.get(ind, 0.) 
for ind in range(model.num_topics)) 98 | doc_topic_list.append(t) 99 | 100 | doc_topic_distrib = np.array(doc_topic_list) 101 | assert doc_topic_distrib.shape == (dtm.shape[0], params['num_topics']) 102 | 103 | res = metric_arun_2010(model.state.get_lambda(), doc_topic_distrib, dtm.sum(axis=1)) 104 | elif metric == 'coherence_mimno_2011': 105 | topic_word = model.state.get_lambda() 106 | default_top_n = min(20, topic_word.shape[1]) 107 | res = metric_coherence_mimno_2011(topic_word, dtm, 108 | top_n=self.eval_metric_options.get( 109 | 'coherence_mimno_2011_top_n', default_top_n), 110 | eps=self.eval_metric_options.get('coherence_mimno_2011_eps', 1), 111 | include_prob=self.eval_metric_options.get( 112 | 'coherence_mimno_2011_include_prob', False), 113 | normalize=self.eval_metric_options.get( 114 | 'coherence_mimno_2011_normalize', False), 115 | return_mean=True) 116 | elif metric.startswith('coherence_gensim_'): 117 | coh_measure = metric[len('coherence_gensim_'):] 118 | topic_word = model.state.get_lambda() 119 | default_top_n = min(20, topic_word.shape[1]) 120 | metric_kwargs = { 121 | 'measure': coh_measure, 122 | 'gensim_model': model, 123 | 'gensim_corpus': corpus, 124 | 'return_mean': True, 125 | 'processes': 1, 126 | 'top_n': self.eval_metric_options.get('coherence_gensim_top_n', default_top_n), 127 | } 128 | 129 | if coh_measure != 'u_mass': 130 | if 'coherence_gensim_texts' not in self.eval_metric_options: 131 | raise ValueError('tokenized documents must be passed as `coherence_gensim_texts` for any other ' 132 | 'coherence measure than `u_mass`') 133 | metric_kwargs.update({ 134 | 'texts': self.eval_metric_options['coherence_gensim_texts'] 135 | }) 136 | 137 | metric_kwargs.update(self.eval_metric_options.get('coherence_gensim_kwargs', {})) 138 | 139 | res = metric_coherence_gensim(**metric_kwargs) 140 | elif metric == 'perplexity': 141 | res = _get_model_perplexity(model, corpus) 142 | else: 143 | raise ValueError('metric not available: "%s"' % metric) 
144 | 145 | logger.info('> evaluation result with metric "%s": %f' % (metric, res)) 146 | results[metric] = res 147 | 148 | return results 149 | 150 | 151 | #%% main API functions for parallel processing 152 | 153 | 154 | def compute_models_parallel(data, varying_parameters=None, constant_parameters=None, n_max_processes=None): 155 | """ 156 | Compute several topic models in parallel using the "gensim" package. Use a single or multiple document term matrices 157 | `data` and optionally a list of varying parameters `varying_parameters`. Pass parameters in `constant_parameters` 158 | dict to each model calculation. Use at maximum `n_max_processes` processors or use all available processors if None 159 | is passed. 160 | 161 | `data` can be either a Document-Term-Matrix (NumPy array/matrix, SciPy sparse matrix) or a dict with corpus ID -> 162 | Document-Term-Matrix mapping when calculating models for multiple corpora. 163 | 164 | If `data` is a dict of named matrices, this function will return a dict with document ID -> result list. Otherwise 165 | it will only return a result list. A result list always is a list containing tuples `(parameter_set, model)` where 166 | `parameter_set` is a dict of the used parameters. 
167 | 168 | :param data: either a (sparse) 2D array/matrix or a dict mapping dataset labels to such matrices 169 | :param varying_parameters: list of dicts with parameters; each parameter set will be used in a separate 170 | computation 171 | :param constant_parameters: dict with parameters that are the same for all parallel computations 172 | :param n_max_processes: maximum number of worker processes to spawn 173 | :return: if passed data is 2D array, returns a list with tuples (parameter set, results); if passed data is 174 | a dict of 2D arrays, returns dict with same keys as data and the respective results for each dataset 175 | """ 176 | mp_models = MultiprocModelsRunner(MultiprocModelsWorkerGensim, data, varying_parameters, constant_parameters, 177 | n_max_processes=n_max_processes) 178 | 179 | return mp_models.run() 180 | 181 | 182 | def evaluate_topic_models(data, varying_parameters, constant_parameters=None, n_max_processes=None, return_models=False, 183 | metric=None, **metric_kwargs): 184 | """ 185 | Compute several Topic Models in parallel using the "gensim" package. Calculate the models using a list of varying 186 | parameters `varying_parameters` on a single Document-Term-Matrix `data`. Pass parameters in `constant_parameters` 187 | dict to each model calculation. Use at maximum `n_max_processes` processors or use all available processors if None 188 | is passed. 189 | 190 | `data` must be a Document-Term-Matrix (NumPy array/matrix, SciPy sparse matrix). 191 | 192 | Will return a list of size `len(varying_parameters)` containing tuples `(parameter_set, eval_results)` where 193 | `parameter_set` is a dict of the used parameters and `eval_results` is a dict of metric names -> metric results: 194 | 195 | .. code-block:: text 196 | 197 | [(parameter_set_1, {'<metric_name>': result_1, ...}), 198 | ..., 199 | (parameter_set_n, {'<metric_name>': result_n, ...})]) 200 | 201 | .. seealso:: Results can be simplified using :func:`tmtoolkit.topicmod.evaluate.results_by_parameter`.
202 | 203 | :param data: a (sparse) 2D array/matrix 204 | :param varying_parameters: list of dicts with parameters; each parameter set will be used in a separate 205 | evaluation 206 | :param constant_parameters: dict with parameters that are the same for all parallel computations 207 | :param n_max_processes: maximum number of worker processes to spawn 208 | :param return_models: if True, also return the computed models in the evaluation results 209 | :param metric: string or list of strings; if given, use only this metric(s) for evaluation; must be subset of 210 | `available_metrics` 211 | :param metric_kwargs: dict of options for metric used metric(s) 212 | :return: list of evaluation results for each varying parameter set as described above 213 | """ 214 | mp_eval = MultiprocEvaluationRunner(MultiprocEvaluationWorkerGensim, AVAILABLE_METRICS, data, 215 | varying_parameters, constant_parameters, 216 | metric=metric or DEFAULT_METRICS, metric_options=metric_kwargs, 217 | n_max_processes=n_max_processes, return_models=return_models) 218 | 219 | return mp_eval.run() 220 | 221 | 222 | #%% Helper functions 223 | 224 | 225 | def _get_model_perplexity(model, eval_corpus): 226 | n_words = sum(cnt for document in eval_corpus for _, cnt in document) 227 | bound = model.bound(eval_corpus) 228 | perwordbound = bound / n_words 229 | 230 | return np.exp2(-perwordbound) 231 | -------------------------------------------------------------------------------- /tmtoolkit/topicmod/tm_lda.py: -------------------------------------------------------------------------------- 1 | """ 2 | Parallel model computation and evaluation using the `lda package `_. 3 | 4 | Available evaluation metrics for this module are listed in :data:`~tmtoolkit.topicmod.tm_lda.AVAILABLE_METRICS`. 5 | See :mod:`tmtoolkit.topicmod.evaluate` for references and implementations of those evaluation metrics. 
6 | """ 7 | 8 | import logging 9 | import importlib.util 10 | 11 | import numpy as np 12 | 13 | from ._eval_tools import split_dtm_for_cross_validation 14 | from tmtoolkit.topicmod.parallel import MultiprocModelsRunner, MultiprocModelsWorkerABC, MultiprocEvaluationRunner, \ 15 | MultiprocEvaluationWorkerABC 16 | from .evaluate import metric_griffiths_2004, metric_cao_juan_2009, metric_arun_2010, metric_coherence_mimno_2011, \ 17 | metric_coherence_gensim, metric_held_out_documents_wallach09 18 | 19 | if importlib.util.find_spec('gmpy2'): 20 | metrics_using_gmpy2 = ('griffiths_2004', 'held_out_documents_wallach09') 21 | else: # if gmpy2 is not available: do not use 'griffiths_2004' 22 | metrics_using_gmpy2 = () 23 | 24 | if importlib.util.find_spec('gensim'): 25 | metrics_using_gensim = ( 26 | 'coherence_gensim_u_mass', # same as coherence_mimno_2011 27 | 'coherence_gensim_c_v', 28 | 'coherence_gensim_c_uci', 29 | 'coherence_gensim_c_npmi' 30 | ) 31 | else: 32 | metrics_using_gensim = () 33 | 34 | 35 | #: Available metrics for lda (``"griffiths_2004"``, ``"held_out_documents_wallach09"`` are added when package gmpy2 36 | #: is installed, several ``"coherence_gensim_"`` metrics are added when package gensim is installed). 37 | AVAILABLE_METRICS = ( 38 | 'loglikelihood', # simply uses the last reported log likelihood as fallback 39 | 'cao_juan_2009', 40 | 'arun_2010', 41 | 'coherence_mimno_2011', 42 | ) + metrics_using_gmpy2 + metrics_using_gensim 43 | 44 | #: Metrics used by default. 45 | DEFAULT_METRICS = ( 46 | 'cao_juan_2009', 47 | 'coherence_mimno_2011' 48 | ) 49 | 50 | 51 | logger = logging.getLogger('tmtoolkit') 52 | 53 | 54 | #%% Specialized classes for parallel processing 55 | 56 | 57 | class MultiprocModelsWorkerLDA(MultiprocModelsWorkerABC): 58 | """ 59 | Specialized parallel model computations worker for lda. 
    """

    package_name = 'lda'

    def fit_model(self, data, params):
        """
        Fit a topic model with the "lda" package.

        :param data: document-term matrix to fit the model on
        :param params: dict of keyword arguments passed to the ``lda.LDA`` constructor
        :return: fitted ``lda.LDA`` instance
        """
        from lda import LDA
        lda_instance = LDA(**params)
        lda_instance.fit(data)

        return lda_instance


class MultiprocEvaluationWorkerLDA(MultiprocEvaluationWorkerABC, MultiprocModelsWorkerLDA):
    """
    Specialized parallel model evaluations worker for lda.
    """

    def fit_model(self, data, params):
        """
        Fit a model on `data` with parameters `params` and calculate every metric in ``self.eval_metric``.

        :param data: document-term matrix
        :param params: dict of model parameters passed to the "lda" package
        :return: dict mapping metric name to metric result; additionally contains key ``'model'`` with the
                 fitted model when ``self.return_models`` is True
        :raises ValueError: if a requested metric is unknown or its options are invalid
        """
        # fitting a model on the full data can be skipped when the *only* requested metric is
        # 'held_out_documents_wallach09' (that branch below fits its own model per CV fold)
        # and no fitted models have to be returned
        if list(self.eval_metric) != ['held_out_documents_wallach09'] or self.return_models:
            lda_instance = super(MultiprocEvaluationWorkerLDA, self).fit_model(data, params)
        else:
            lda_instance = None

        results = {}
        if self.return_models:
            results['model'] = lda_instance

        for metric in self.eval_metric:
            if metric == 'griffiths_2004':
                if 'griffiths_2004_burnin' in self.eval_metric_options:  # discard specific number of burnin iterations
                    burnin_iterations = self.eval_metric_options['griffiths_2004_burnin']
                    # the "lda" package samples the log likelihood only every `refresh` iterations,
                    # so convert burn-in iterations to a number of recorded samples
                    burnin_samples = burnin_iterations // lda_instance.refresh

                    if burnin_samples >= len(lda_instance.loglikelihoods_):
                        raise ValueError('`griffiths_2004_burnin` set too high (%d) – not enough samples to use. should be less than %d.'
                                         % (burnin_iterations, len(lda_instance.loglikelihoods_) * lda_instance.refresh))
                else:  # default: discard first 50% of the likelihood samples
                    burnin_samples = len(lda_instance.loglikelihoods_) // 2

                logliks = lda_instance.loglikelihoods_[burnin_samples:]
                if logliks:
                    res = metric_griffiths_2004(logliks)
                else:
                    raise ValueError('no log likelihood samples for calculation of `metric_griffiths_2004`')
            elif metric == 'cao_juan_2009':
                res = metric_cao_juan_2009(lda_instance.topic_word_)
            elif metric == 'arun_2010':
                res = metric_arun_2010(lda_instance.topic_word_, lda_instance.doc_topic_, data.sum(axis=1))
            elif metric == 'coherence_mimno_2011':
                # never request more top words per topic than the vocabulary holds
                default_top_n = min(20, lda_instance.topic_word_.shape[1])
                res = metric_coherence_mimno_2011(lda_instance.topic_word_, data,
                                                  top_n=self.eval_metric_options.get(
                                                      'coherence_mimno_2011_top_n', default_top_n),
                                                  eps=self.eval_metric_options.get('coherence_mimno_2011_eps', 1),
                                                  include_prob=self.eval_metric_options.get(
                                                      'coherence_mimno_2011_include_prob', False),
                                                  normalize=self.eval_metric_options.get(
                                                      'coherence_mimno_2011_normalize', False),
                                                  return_mean=True)
            elif metric.startswith('coherence_gensim_'):
                if 'coherence_gensim_vocab' not in self.eval_metric_options:
                    raise ValueError('corpus vocabulary must be passed as `coherence_gensim_vocab`')

                # metric name suffix selects the gensim coherence measure, e.g. 'c_v' or 'u_mass'
                coh_measure = metric[len('coherence_gensim_'):]
                default_top_n = min(20, lda_instance.topic_word_.shape[1])
                metric_kwargs = {
                    'measure': coh_measure,
                    'topic_word_distrib': lda_instance.topic_word_,
                    'dtm': data,
                    'vocab': self.eval_metric_options['coherence_gensim_vocab'],
                    'return_mean': True,
                    'processes': 1,
                    'top_n': self.eval_metric_options.get('coherence_gensim_top_n', default_top_n),
                }

                if coh_measure != 'u_mass':
                    # all sliding-window based measures need the raw tokenized texts
                    if 'coherence_gensim_texts' not in self.eval_metric_options:
                        raise ValueError('tokenized documents must be passed as `coherence_gensim_texts` for any other '
                                         'coherence measure than `u_mass`')
                    metric_kwargs.update({
                        'texts': self.eval_metric_options['coherence_gensim_texts']
                    })

                metric_kwargs.update(self.eval_metric_options.get('coherence_gensim_kwargs', {}))

                res = metric_coherence_gensim(**metric_kwargs)
            elif metric == 'held_out_documents_wallach09':
                n_folds = self.eval_metric_options.get('held_out_documents_wallach09_n_folds', 5)
                shuffle_docs = self.eval_metric_options.get('held_out_documents_wallach09_shuffle_docs', True)
                n_samples = self.eval_metric_options.get('held_out_documents_wallach09_n_samples', 10000)

                folds_results = []
                # TODO: parallelize this
                for fold, train, test in split_dtm_for_cross_validation(data, n_folds, shuffle_docs=shuffle_docs):
                    logger.info('> fold %d/%d of cross validation with %d held-out documents and %d training documents'
                                % (fold+1, n_folds, test.shape[0], train.shape[0]))

                    # fit a fresh model on the training fold; evaluate it on the held-out fold
                    model_train = super(MultiprocEvaluationWorkerLDA, self).fit_model(train, params)
                    theta_test = model_train.transform(test)

                    folds_results.append(metric_held_out_documents_wallach09(test, theta_test, model_train.topic_word_,
                                                                             model_train.alpha, n_samples=n_samples))

                logger.debug('> cross validation results with metric "%s": %s' % (metric, str(folds_results)))
                # final result is the mean across the CV folds
                res = np.mean(folds_results)
            elif metric == 'loglikelihood':
                # last recorded log likelihood sample of the fitted model
                res = lda_instance.loglikelihoods_[-1]
            else:
                raise ValueError('metric not available: "%s"' % metric)

            logger.info('> evaluation result with metric "%s": %f' % (metric, res))
            results[metric] = res

        return results


#%% main API functions for parallel processing


def compute_models_parallel(data, varying_parameters=None, constant_parameters=None, n_max_processes=None):
    """
    Compute several topic models in parallel using the "lda" package. Use a single or multiple document term matrices
    `data` and optionally a list of varying parameters `varying_parameters`. Pass parameters in `constant_parameters`
    dict to each model calculation. Use at maximum `n_max_processes` processors or use all available processors if None
    is passed.

    `data` can be either a Document-Term-Matrix (NumPy array/matrix, SciPy sparse matrix) or a dict with corpus ID ->
    Document-Term-Matrix mapping when calculating models for multiple corpora.

    If `data` is a dict of named matrices, this function will return a dict with corpus ID -> result list. Otherwise
    it will only return a result list. A result list always is a list containing tuples `(parameter_set, model)` where
    `parameter_set` is a dict of the used parameters.

    :param data: either a (sparse) 2D array/matrix or a dict mapping dataset labels to such matrices
    :param varying_parameters: list of dicts with parameters; each parameter set will be used in a separate
                               computation
    :param constant_parameters: dict with parameters that are the same for all parallel computations
    :param n_max_processes: maximum number of worker processes to spawn
    :return: if passed data is 2D array, returns a list with tuples (parameter set, results); if passed data is
             a dict of 2D arrays, returns dict with same keys as data and the respective results for each dataset
    """
    mp_models = MultiprocModelsRunner(MultiprocModelsWorkerLDA, data, varying_parameters, constant_parameters,
                                      n_max_processes=n_max_processes)

    return mp_models.run()


def evaluate_topic_models(data, varying_parameters, constant_parameters=None, n_max_processes=None, return_models=False,
                          metric=None, **metric_kwargs):
    """
    Compute several Topic Models in parallel using the "lda" package. Calculate the models using a list of varying
    parameters `varying_parameters` on a single Document-Term-Matrix `data`. Pass parameters in `constant_parameters`
    dict to each model calculation. Use at maximum `n_max_processes` processors or use all available processors if None
    is passed.

    `data` must be a Document-Term-Matrix (NumPy array/matrix, SciPy sparse matrix).

    Will return a list of size `len(varying_parameters)` containing tuples `(parameter_set, eval_results)` where
    `parameter_set` is a dict of the used parameters and `eval_results` is a dict of metric names -> metric results:

    .. code-block:: text

        [(parameter_set_1, {'<metric_name>': result_1, ...}),
         ...,
         (parameter_set_n, {'<metric_name>': result_n, ...})]

    .. seealso:: Results can be simplified using :func:`tmtoolkit.topicmod.evaluate.results_by_parameter`.

    :param data: a (sparse) 2D array/matrix
    :param varying_parameters: list of dicts with parameters; each parameter set will be used in a separate
                               evaluation
    :param constant_parameters: dict with parameters that are the same for all parallel computations
    :param n_max_processes: maximum number of worker processes to spawn
    :param return_models: if True, also return the computed models in the evaluation results
    :param metric: string or list of strings; if given, use only this metric(s) for evaluation; must be subset of
                   `available_metrics`
    :param metric_kwargs: dict of options for the used metric(s)
    :return: list of evaluation results for each varying parameter set as described above
    """
    mp_eval = MultiprocEvaluationRunner(MultiprocEvaluationWorkerLDA, AVAILABLE_METRICS, data,
                                        varying_parameters, constant_parameters,
                                        metric=metric or DEFAULT_METRICS, metric_options=metric_kwargs,
                                        n_max_processes=n_max_processes, return_models=return_models)

    return mp_eval.run()


# ------------------------------------------------------------------------------
# file boundary: /tmtoolkit/topicmod/tm_sklearn.py
# ------------------------------------------------------------------------------

"""
Parallel model computation and evaluation using the `scikit-learn package <https://scikit-learn.org/>`_.

Available evaluation metrics for this module are listed in :data:`~tmtoolkit.topicmod.tm_sklearn.AVAILABLE_METRICS`.
See :mod:`tmtoolkit.topicmod.evaluate` for references and implementations of those evaluation metrics.
"""

import logging
import importlib.util

import numpy as np
from scipy.sparse import issparse, csr_matrix

from ._eval_tools import split_dtm_for_cross_validation
from tmtoolkit.topicmod.parallel import MultiprocModelsRunner, MultiprocModelsWorkerABC, MultiprocEvaluationRunner, \
    MultiprocEvaluationWorkerABC
from .evaluate import metric_cao_juan_2009, metric_arun_2010, metric_coherence_mimno_2011, \
    metric_coherence_gensim, metric_held_out_documents_wallach09


if importlib.util.find_spec('gmpy2'):
    metrics_using_gmpy2 = ('held_out_documents_wallach09', )
else:  # if gmpy2 is not available: do not use 'held_out_documents_wallach09'
    metrics_using_gmpy2 = ()

if importlib.util.find_spec('gensim'):
    metrics_using_gensim = (
        'coherence_gensim_u_mass',  # same as coherence_mimno_2011
        'coherence_gensim_c_v',
        'coherence_gensim_c_uci',
        'coherence_gensim_c_npmi'
    )
else:
    metrics_using_gensim = ()

#: Available metrics for sklearn (``"held_out_documents_wallach09"`` is added when package gmpy2
#: is installed, several ``"coherence_gensim_"`` metrics are added when package gensim is installed).
# NOTE: the "coherence_gensim_*" metrics are appended via `metrics_using_gensim` only when
# gensim is actually installed; listing them unconditionally here as well (as before) duplicated
# them and advertised metrics that would fail without gensim.
AVAILABLE_METRICS = (
    'perplexity',
    'cao_juan_2009',
    'arun_2010',
    'coherence_mimno_2011',
) + metrics_using_gmpy2 + metrics_using_gensim

#: Metrics used by default.
DEFAULT_METRICS = (
    'perplexity',
    'cao_juan_2009',
    'coherence_mimno_2011'
)


#%% Specialized classes for parallel processing


logger = logging.getLogger('tmtoolkit')


class MultiprocModelsWorkerSklearn(MultiprocModelsWorkerABC):
    """
    Specialized parallel model computations worker for sklearn.
    """

    package_name = 'sklearn'

    def fit_model(self, data, params, return_data=False):
        """
        Fit an LDA model with the "sklearn" package.

        :param data: document-term matrix; converted to CSR sparse format if necessary
        :param params: dict of keyword arguments for ``sklearn.decomposition.LatentDirichletAllocation``
        :param return_data: if True, additionally return the (possibly converted) input data
        :return: fitted model instance, or tuple ``(model, data)`` if `return_data` is True
        """
        from sklearn.decomposition import LatentDirichletAllocation

        # sklearn's LDA implementation works with CSR sparse matrices
        if issparse(data):
            if data.format != 'csr':
                data = data.tocsr()
        else:
            data = csr_matrix(data)

        lda_instance = LatentDirichletAllocation(**params)
        lda_instance.fit(data)

        if return_data:
            return lda_instance, data
        else:
            return lda_instance


class MultiprocEvaluationWorkerSklearn(MultiprocEvaluationWorkerABC, MultiprocModelsWorkerSklearn):
    """
    Specialized parallel model evaluations worker for sklearn.
    """

    def fit_model(self, data, params, return_data=False):
        """
        Fit a model on `data` with parameters `params` and calculate every metric in ``self.eval_metric``.

        :param data: document-term matrix
        :param params: dict of model parameters passed to sklearn's LDA implementation
        :param return_data: unused here; kept for signature compatibility with the parent class
        :return: dict mapping metric name to metric result; additionally contains key ``'model'`` with the
                 fitted model when ``self.return_models`` is True
        :raises ValueError: if a requested metric is unknown or its options are invalid
        """
        lda_instance, data = super(MultiprocEvaluationWorkerSklearn, self).fit_model(data, params,
                                                                                     return_data=True)

        # sklearn's `components_` matrix is unnormalized; convert it to a proper
        # topic-word probability distribution for the metrics below
        topic_word_distrib = _get_normalized_topic_word_distrib(lda_instance)

        results = {}
        if self.return_models:
            results['model'] = lda_instance

        for metric in self.eval_metric:
            if metric == 'cao_juan_2009':
                res = metric_cao_juan_2009(topic_word_distrib)
            elif metric == 'arun_2010':
                res = metric_arun_2010(topic_word_distrib, lda_instance.transform(data), data.sum(axis=1))
            elif metric == 'coherence_mimno_2011':
                # never request more top words per topic than the vocabulary holds
                default_top_n = min(20, topic_word_distrib.shape[1])
                res = metric_coherence_mimno_2011(topic_word_distrib, data,
                                                  top_n=self.eval_metric_options.get(
                                                      'coherence_mimno_2011_top_n', default_top_n),
                                                  eps=self.eval_metric_options.get('coherence_mimno_2011_eps', 1),
                                                  include_prob=self.eval_metric_options.get(
                                                      'coherence_mimno_2011_include_prob', False),
                                                  normalize=self.eval_metric_options.get(
                                                      'coherence_mimno_2011_normalize', False),
                                                  return_mean=True)
            elif metric.startswith('coherence_gensim_'):
                if 'coherence_gensim_vocab' not in self.eval_metric_options:
                    raise ValueError('corpus vocabulary must be passed as `coherence_gensim_vocab`')

                # metric name suffix selects the gensim coherence measure, e.g. 'c_v' or 'u_mass'
                coh_measure = metric[len('coherence_gensim_'):]
                default_top_n = min(20, topic_word_distrib.shape[1])
                metric_kwargs = {
                    'measure': coh_measure,
                    'topic_word_distrib': topic_word_distrib,
                    'dtm': data,
                    'vocab': self.eval_metric_options['coherence_gensim_vocab'],
                    'return_mean': True,
                    'processes': 1,
                    'top_n': self.eval_metric_options.get('coherence_gensim_top_n', default_top_n),
                }

                if coh_measure != 'u_mass':
                    # all sliding-window based measures need the raw tokenized texts
                    if 'coherence_gensim_texts' not in self.eval_metric_options:
                        raise ValueError('tokenized documents must be passed as `coherence_gensim_texts` for any other '
                                         'coherence measure than `u_mass`')
                    metric_kwargs.update({
                        'texts': self.eval_metric_options['coherence_gensim_texts']
                    })

                metric_kwargs.update(self.eval_metric_options.get('coherence_gensim_kwargs', {}))

                res = metric_coherence_gensim(**metric_kwargs)
            elif metric == 'held_out_documents_wallach09':
                n_folds = self.eval_metric_options.get('held_out_documents_wallach09_n_folds', 5)
                shuffle_docs = self.eval_metric_options.get('held_out_documents_wallach09_shuffle_docs', True)
                n_samples = self.eval_metric_options.get('held_out_documents_wallach09_n_samples', 10000)

                folds_results = []
                # TODO: parallelize this
                for fold, train, test in split_dtm_for_cross_validation(data, n_folds, shuffle_docs=shuffle_docs):
                    logger.info('> fold %d/%d of cross validation with %d held-out documents and %d training documents'
                                % (fold+1, n_folds, test.shape[0], train.shape[0]))

                    # fit a fresh model on the training fold; evaluate it on the held-out fold
                    model_train = super(MultiprocEvaluationWorkerSklearn, self).fit_model(train, params)
                    theta_test = model_train.transform(test)

                    # FIX: the training topic-word distribution must come from the model fitted
                    # on this fold's training data (`model_train`), not from the full-data model
                    # (`lda_instance`) as before -- cf. the analogous code in tm_lda.py, which
                    # uses `model_train.topic_word_`
                    phi_train = _get_normalized_topic_word_distrib(model_train)

                    folds_results.append(metric_held_out_documents_wallach09(test, theta_test, phi_train,
                                                                             model_train.doc_topic_prior_,
                                                                             n_samples=n_samples))

                logger.debug('> cross validation results with metric "%s": %s' % (metric, str(folds_results)))
                # final result is the mean across the CV folds
                res = np.mean(folds_results)
            elif metric == 'perplexity':
                res = lda_instance.perplexity(data)
            else:
                raise ValueError('metric not available: "%s"' % metric)

            logger.info('> evaluation result with metric "%s": %f' % (metric, res))
            results[metric] = res

        return results


#%% main API functions for parallel processing


def compute_models_parallel(data, varying_parameters=None, constant_parameters=None, n_max_processes=None):
    """
    Compute several topic models in parallel using the "sklearn" package. Use a single or multiple document term matrices
    `data` and optionally a list of varying parameters `varying_parameters`. Pass parameters in `constant_parameters`
    dict to each model calculation. Use at maximum `n_max_processes` processors or use all available processors if None
    is passed.

    `data` can be either a Document-Term-Matrix (NumPy array/matrix, SciPy sparse matrix) or a dict with corpus ID ->
    Document-Term-Matrix mapping when calculating models for multiple corpora.

    If `data` is a dict of named matrices, this function will return a dict with corpus ID -> result list. Otherwise
    it will only return a result list. A result list always is a list containing tuples `(parameter_set, model)` where
    `parameter_set` is a dict of the used parameters.

    :param data: either a (sparse) 2D array/matrix or a dict mapping dataset labels to such matrices
    :param varying_parameters: list of dicts with parameters; each parameter set will be used in a separate
                               computation
    :param constant_parameters: dict with parameters that are the same for all parallel computations
    :param n_max_processes: maximum number of worker processes to spawn
    :return: if passed data is 2D array, returns a list with tuples (parameter set, results); if passed data is
             a dict of 2D arrays, returns dict with same keys as data and the respective results for each dataset
    """

    mp_models = MultiprocModelsRunner(MultiprocModelsWorkerSklearn, data, varying_parameters, constant_parameters,
                                      n_max_processes=n_max_processes)

    return mp_models.run()


def evaluate_topic_models(data, varying_parameters, constant_parameters=None, n_max_processes=None, return_models=False,
                          metric=None, **metric_kwargs):
    """
    Compute several Topic Models in parallel using the "sklearn" package. Calculate the models using a list of varying
    parameters `varying_parameters` on a single Document-Term-Matrix `data`. Pass parameters in `constant_parameters`
    dict to each model calculation. Use at maximum `n_max_processes` processors or use all available processors if None
    is passed.

    `data` must be a Document-Term-Matrix (NumPy array/matrix, SciPy sparse matrix).

    Will return a list of size `len(varying_parameters)` containing tuples `(parameter_set, eval_results)` where
    `parameter_set` is a dict of the used parameters and `eval_results` is a dict of metric names -> metric results:

    .. code-block:: text

        [(parameter_set_1, {'<metric_name>': result_1, ...}),
         ...,
         (parameter_set_n, {'<metric_name>': result_n, ...})]

    .. seealso:: Results can be simplified using :func:`tmtoolkit.topicmod.evaluate.results_by_parameter`.

    :param data: a (sparse) 2D array/matrix
    :param varying_parameters: list of dicts with parameters; each parameter set will be used in a separate
                               evaluation
    :param constant_parameters: dict with parameters that are the same for all parallel computations
    :param n_max_processes: maximum number of worker processes to spawn
    :param return_models: if True, also return the computed models in the evaluation results
    :param metric: string or list of strings; if given, use only this metric(s) for evaluation; must be subset of
                   `available_metrics`
    :param metric_kwargs: dict of options for the used metric(s)
    :return: list of evaluation results for each varying parameter set as described above
    """

    mp_eval = MultiprocEvaluationRunner(MultiprocEvaluationWorkerSklearn, AVAILABLE_METRICS, data,
                                        varying_parameters, constant_parameters,
                                        metric=metric or DEFAULT_METRICS, metric_options=metric_kwargs,
                                        n_max_processes=n_max_processes, return_models=return_models)

    return mp_eval.run()


#%% Helper functions

def _get_normalized_topic_word_distrib(lda_instance):
    """
    Row-normalize ``lda_instance.components_`` so that each topic's word weights sum to 1,
    yielding a proper topic-word probability distribution.
    """
    return lda_instance.components_ / lda_instance.components_.sum(axis=1)[:, np.newaxis]


# ------------------------------------------------------------------------------
# file boundary: /tmtoolkit/types.py
# ------------------------------------------------------------------------------

"""
Module with common types used in type annotations throughout this project.

.. codeauthor:: Markus Konrad
"""

from enum import IntEnum
from typing import Union


Proportion = IntEnum('Proportion', 'NO YES LOG', start=0)

StrOrInt = Union[str, int]


# ------------------------------------------------------------------------------
# file boundary: /tox.ini
# ------------------------------------------------------------------------------

# tox (https://tox.readthedocs.io/) is a tool for running tests
# in multiple virtualenvs. This configuration file will run the
# test suite on all supported python versions. To use it, "pip install tox"
# and then run "tox" from this directory.
5 | # 6 | # The following environments are defined: 7 | # 8 | # - Python 3.8 to 3.10 with dependency sets: 9 | # - minimal 10 | # - recommended 11 | # - recommendedextra 12 | # - full 13 | # 14 | # The following dependency sets are defined, which specify the "extras_require" section to choose when installing 15 | # via setup.py: 16 | # 17 | # - minimal: no additional extras 18 | # - recommended: textproc and wordclouds 19 | # - recommendedextra: recommended and all topic modeling packages (lda, scikit-learn, gensim) 20 | # - full: recommendedextra and textproc_extra, topic_modeling_eval_extra 21 | # 22 | 23 | 24 | [tox] 25 | envlist = 26 | py{38,39,310}-{minimal,recommended,recommendedextra,full} 27 | 28 | [testenv] 29 | deps = .[test] 30 | extras = 31 | recommended: recommended 32 | recommendedextra: recommended, lda, sklearn, gensim 33 | full: recommended, lda, sklearn, gensim, textproc_extra, topic_modeling_eval_extra 34 | commands_pre = 35 | - python -m tmtoolkit setup all --no-update 36 | commands = 37 | pytest -v {posargs} 38 | --------------------------------------------------------------------------------