├── .github └── workflows │ ├── runtests.yml │ └── stale.yml ├── .gitignore ├── .readthedocs.yaml ├── AUTHORS.md ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.rst ├── conftest.py ├── coverage.svg ├── doc ├── Makefile └── source │ ├── api.rst │ ├── bow.ipynb │ ├── conf.py │ ├── data │ ├── corpus_example │ │ ├── sample1.txt │ │ ├── sample2.txt │ │ └── sample3.txt │ ├── news_articles_100.pickle │ ├── news_articles_100.xlsx │ └── tm_wordclouds │ │ └── .gitignore │ ├── development.rst │ ├── getting_started.ipynb │ ├── index.rst │ ├── install.rst │ ├── intro.rst │ ├── license_note.rst │ ├── preprocessing.ipynb │ ├── text_corpora.ipynb │ ├── topic_modeling.ipynb │ └── version_history.rst ├── examples ├── README.md ├── __init__.py ├── _benchmarktools.py ├── benchmark_en_newsarticles.py ├── bundestag18_tfidf.py ├── data │ ├── ap.pickle │ ├── bt18_full.zip │ ├── bt18_sample_1000.pickle │ ├── gensim_evaluation_plot.png │ └── nips.pickle ├── gensim_evaluation.py ├── minimal_tfidf.py ├── topicmod_ap_nips_eval.py └── topicmod_lda.py ├── requirements.txt ├── requirements_doc.txt ├── scripts ├── fulldata │ ├── .gitignore │ └── README.md ├── nips_data.py ├── prepare_corpora.R └── tmp │ └── .gitignore ├── setup.py ├── tests ├── __init__.py ├── _testtextdata.py ├── _testtools.py ├── data │ ├── .gitignore │ ├── 100NewsArticles.csv │ ├── 100NewsArticles.xlsx │ ├── 3ExampleDocs.xlsx │ ├── bt18_speeches_sample.csv │ ├── gutenberg │ │ ├── kafka_verwandlung.txt │ │ └── werther │ │ │ ├── goethe_werther1.txt │ │ │ └── goethe_werther2.txt │ ├── tiny_model_reuters_5_topics.pickle │ └── zipdata.zip ├── test_bow.py ├── test_corpus.py ├── test_corpusimport.py ├── test_tokenseq.py ├── test_topicmod__eval_tools.py ├── test_topicmod_evaluate.py ├── test_topicmod_model_io.py ├── test_topicmod_model_stats.py ├── test_topicmod_visualize.py └── test_utils.py ├── tmtoolkit ├── __init__.py ├── __main__.py ├── bow │ ├── __init__.py │ ├── bow_stats.py │ └── dtm.py ├── corpus │ ├── __init__.py │ ├── 
_common.py │ ├── _corpus.py │ ├── _corpusfuncs.py │ ├── _document.py │ ├── _nltk_extras.py │ └── visualize.py ├── data │ ├── de │ │ └── parlspeech-v2-sample-bundestag.zip │ ├── en │ │ ├── News100.zip │ │ ├── NewsArticles.zip │ │ └── parlspeech-v2-sample-houseofcommons.zip │ ├── es │ │ └── parlspeech-v2-sample-congreso.zip │ └── nl │ │ └── parlspeech-v2-sample-tweedekamer.zip ├── tokenseq.py ├── topicmod │ ├── __init__.py │ ├── _common.py │ ├── _eval_tools.py │ ├── evaluate.py │ ├── model_io.py │ ├── model_stats.py │ ├── parallel.py │ ├── tm_gensim.py │ ├── tm_lda.py │ ├── tm_sklearn.py │ └── visualize.py ├── types.py └── utils.py └── tox.ini /.github/workflows/runtests.yml: -------------------------------------------------------------------------------- 1 | # GitHub actions workflow for testing tmtoolkit 2 | # Runs tests on Ubuntu, MacOS and Windows with Python versions 3.8, 3.9 and 3.10 each, which means 9 jobs are spawned. 3 | # Tests are run using tox (https://tox.wiki/). 4 | # 5 | # author: Markus Konrad 6 | 7 | name: run tests 8 | 9 | on: 10 | push: 11 | branches: 12 | - master 13 | - develop 14 | - 'release*' 15 | 16 | jobs: 17 | build: 18 | runs-on: ${{ matrix.os }} 19 | strategy: 20 | matrix: 21 | os: [ubuntu-latest, macos-latest, windows-latest] 22 | python-version: ["3.8", "3.9", "3.10"] 23 | testsuite: ["minimal", "full"] 24 | steps: 25 | - uses: actions/checkout@v2 26 | - name: set up python ${{ matrix.python-version }} 27 | uses: actions/setup-python@v2 28 | with: 29 | python-version: ${{ matrix.python-version }} 30 | cache: 'pip' 31 | - name: install system dependencies (linux) 32 | if: runner.os == 'Linux' 33 | # only managed to install system dependencies on Linux runners 34 | run: | 35 | sudo apt update 36 | sudo apt install libgmp-dev libmpfr-dev libmpc-dev 37 | - name: install python dependencies 38 | run: | 39 | python -m pip install --upgrade pip 40 | pip install tox 41 | - name: run tox (linux) 42 | # since system dependencies could only be 
installed on Linux runners, we run the "full" suite only on Linux ... 43 | if: runner.os == 'Linux' 44 | run: tox -e py-${{ matrix.testsuite }} -- --hypothesis-profile=ci 45 | - name: run tox (macos or windows - minimal) 46 | if: runner.os != 'Linux' && matrix.testsuite == 'minimal' 47 | run: tox -e py-minimal -- --hypothesis-profile=ci 48 | - name: run tox (macos or windows - recommendedextra) 49 | # ... on all other OS we run the "recommendedextra" suite instead of the "full" suite 50 | if: runner.os != 'Linux' && matrix.testsuite == 'full' 51 | run: tox -e py-recommendedextra -- --hypothesis-profile=ci 52 | -------------------------------------------------------------------------------- /.github/workflows/stale.yml: -------------------------------------------------------------------------------- 1 | name: Close inactive issues 2 | on: 3 | schedule: 4 | - cron: "23 3 * * *" 5 | 6 | jobs: 7 | close-issues: 8 | runs-on: ubuntu-latest 9 | permissions: 10 | issues: write 11 | pull-requests: write 12 | steps: 13 | - uses: actions/stale@v3 14 | with: 15 | days-before-issue-stale: 30 16 | days-before-issue-close: 14 17 | stale-issue-label: "stale" 18 | stale-issue-message: "This issue is stale because it has been open for 30 days with no activity." 19 | close-issue-message: "This issue was closed because it has been inactive for 14 days since being marked as stale." 
20 | days-before-pr-stale: -1 21 | days-before-pr-close: -1 22 | repo-token: ${{ secrets.GITHUB_TOKEN }} 23 | 24 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .cache/ 2 | .idea/ 3 | **/__pycache__ 4 | *.pyc 5 | .hypothesis 6 | build/ 7 | dist/ 8 | *.egg-info/ 9 | .~lock.* 10 | examples/data/*.pickle 11 | !examples/data/ap.pickle 12 | !examples/data/nips.pickle 13 | !examples/data/bt18_sample_1000.pickle 14 | **/.ipynb_checkpoints/ 15 | .pytest_cache/ 16 | .covreport/ 17 | .tox/ 18 | .Rhistory 19 | doc/source/data/corpus_norm.pickle 20 | .coverage 21 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Build documentation in the doc/ directory with Sphinx 9 | sphinx: 10 | configuration: doc/source/conf.py 11 | 12 | # Set the version of Python and other tools you might need 13 | build: 14 | os: ubuntu-20.04 15 | tools: 16 | python: "3.9" 17 | 18 | # Optionally set the version of Python and requirements required to build your docs 19 | python: 20 | install: 21 | - requirements: requirements_doc.txt 22 | -------------------------------------------------------------------------------- /AUTHORS.md: -------------------------------------------------------------------------------- 1 | # Authors 2 | 3 | ## Maintainer / main developer 4 | 5 | [Markus Konrad](https://github.com/internaut) @ [WZB](https://github.com/WZBSocialScienceCenter/) 6 | 7 | ## Contributors 8 | 9 | Sorted by date of first contribution: 10 | 11 | * [Matt Cooper](https://github.com/mcooper) 12 | * [Dominik Domhoff](https://github.com/ddomhoff) 13 | * [Christof 
Kälin](https://github.com/christofkaelin) 14 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 
175 | 176 | END OF TERMS AND CONDITIONS 177 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include AUTHORS.md 2 | include conftest.py 3 | include LICENSE 4 | include README.rst 5 | include requirements.txt 6 | include requirements_doc.txt 7 | graft doc/source 8 | prune doc/source/.ipynb_* 9 | graft tmtoolkit/data 10 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | run_tests: 2 | PYTHONPATH=. pytest -l tests/ 3 | 4 | cov_tests: 5 | PYTHONPATH=. pytest --cov-report html:.covreport --cov=tmtoolkit tests/ 6 | coverage-badge -f -o coverage.svg 7 | #rm .coverage* 8 | 9 | sdist: 10 | python setup.py sdist 11 | 12 | wheel: 13 | python setup.py bdist_wheel 14 | 15 | readme: 16 | cat doc/source/intro.rst > README.rst 17 | echo >> README.rst 18 | echo >> README.rst 19 | cat doc/source/install.rst >> README.rst 20 | echo >> README.rst 21 | echo >> README.rst 22 | cat doc/source/license_note.rst >> README.rst 23 | 24 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | **This repository is archived. Further development of tmtoolkit has moved to https://github.com/internaut/tmtoolkit.** 2 | 3 | ------------ 4 | 5 | 6 | tmtoolkit: Text mining and topic modeling toolkit 7 | ================================================= 8 | 9 | *tmtoolkit* is a set of tools for text mining and topic modeling with Python developed especially for use in the 10 | social sciences, in journalism or related disciplines. 
It aims for easy installation, extensive documentation 11 | and a clear programming interface while offering good performance on large datasets by means of vectorized 12 | operations (via NumPy) and parallel computation (using Python's *multiprocessing* module and the 13 | `loky `_ package). The basis of tmtoolkit's text mining capabilities is built around 14 | `SpaCy `_, which offers `many language models `_. 15 | 16 | The documentation for tmtoolkit is available on `tmtoolkit.readthedocs.org `_ and 17 | the GitHub code repository is on 18 | `github.com/WZBSocialScienceCenter/tmtoolkit `_. 19 | 20 | **Upgrade note:** 21 | 22 | Since Feb 8 2022, the newest version 0.11.0 of tmtoolkit is available on PyPI. This version features a new API 23 | for text processing and mining which is incompatible with prior versions. It's advisable to first read the 24 | first three chapters of the `tutorial `_ 25 | to get used to the new API. You should also re-install tmtoolkit in a new virtual environment or completely 26 | remove the old version prior to upgrading. See the 27 | `installation instructions `_. 28 | 29 | Requirements and installation 30 | ----------------------------- 31 | 32 | **tmtoolkit works with Python 3.8 or newer (tested up to Python 3.10).** 33 | 34 | The tmtoolkit package is highly modular and tries to install as few dependencies as possible. For requirements and 35 | installation procedures, please have a look at the 36 | `installation section in the documentation `_. For short, 37 | the recommended way of installing tmtoolkit is to create and activate a 38 | `Python Virtual Environment ("venv") `_ and then install tmtoolkit with 39 | a recommended set of dependencies and a list of language models via the following: 40 | 41 | .. 
code-block:: text 42 | 43 | pip install -U "tmtoolkit[recommended]" 44 | # add or remove language codes in the list for installing the models that you need; 45 | # don't use spaces in the list of languages 46 | python -m tmtoolkit setup en,de 47 | 48 | Again, you should have a look at the detailed 49 | `installation instructions `_ in order to install additional 50 | packages that enable more features such as topic modeling. 51 | 52 | Features 53 | -------- 54 | 55 | Text preprocessing 56 | ^^^^^^^^^^^^^^^^^^ 57 | 58 | The tmtoolkit package offers several text preprocessing and text mining methods, including: 59 | 60 | - `tokenization, sentence segmentation, part-of-speech (POS) tagging, named-entity recognition (NER) `_ (via SpaCy) 61 | - `lemmatization and token normalization `_ 62 | - extensive `pattern matching capabilities `_ 63 | (exact matching, regular expressions or "glob" patterns) to be used in many 64 | methods of the package, e.g. for filtering on token or document level, or for 65 | `keywords-in-context (KWIC) `_ 66 | - adding and managing 67 | `custom document and token attributes `_ 68 | - accessing text corpora along with their 69 | `document and token attributes as dataframes `_ 70 | - calculating and `visualizing corpus summary statistics `_ 71 | - finding out and joining `collocations `_ 72 | - `splitting and sampling corpora `_ 73 | - generating `n-grams `_ 74 | - generating `sparse document-term matrices `_ 75 | 76 | Wherever possible and useful, these methods can operate in parallel to speed up computations with large datasets. 77 | 78 | Topic modeling 79 | ^^^^^^^^^^^^^^ 80 | 81 | * `model computation in parallel `_ for different corpora 82 | and/or parameter sets 83 | * support for `lda `_, 84 | `scikit-learn `_ 85 | and `gensim `_ topic modeling backends 86 | * `evaluation of topic models `_ (e.g. in order to find an optimal number 87 | of topics for a given dataset) using several implemented metrics: 88 | 89 | * model coherence (`Mimno et al. 
2011 `_) or with 90 | `metrics implemented in Gensim `_) 91 | * KL divergence method (`Arun et al. 2010 `_) 92 | * probability of held-out documents (`Wallach et al. 2009 `_) 93 | * pair-wise cosine distance method (`Cao Juan et al. 2009 `_) 94 | * harmonic mean method (`Griffiths, Steyvers 2004 `_) 95 | * the loglikelihood or perplexity methods natively implemented in lda, sklearn or gensim 96 | 97 | * `plotting of evaluation results `_ 98 | * `common statistics for topic models `_ such as 99 | word saliency and distinctiveness (`Chuang et al. 2012 `_), topic-word 100 | relevance (`Sievert and Shirley 2014 `_) 101 | * `finding / filtering topics with pattern matching `_ 102 | * `export estimated document-topic and topic-word distributions to Excel 103 | `_ 104 | * `visualize topic-word distributions and document-topic distributions `_ 105 | as word clouds or heatmaps 106 | * model coherence (`Mimno et al. 2011 `_) for individual topics 107 | * integrate `PyLDAVis `_ to visualize results 108 | 109 | Other features 110 | ^^^^^^^^^^^^^^ 111 | 112 | - loading and cleaning of raw text from 113 | `text files, tabular files (CSV or Excel), ZIP files or folders `_ 114 | - `splitting and joining documents `_ 115 | - `common statistics and transformations for document-term matrices `_ like word cooccurrence and *tf-idf* 116 | 117 | Limits 118 | ------ 119 | 120 | * all languages are supported, for which `SpaCy language models `_ are available 121 | * all data must reside in memory, i.e. no streaming of large data from the hard disk (which for example 122 | `Gensim `_ supports) 123 | 124 | 125 | Contribute 126 | ---------- 127 | 128 | If you'd like to contribute, please read the `developer documentation `_ first. 129 | 130 | 131 | License 132 | ------- 133 | 134 | Code licensed under `Apache License 2.0 `_. 135 | See `LICENSE `_ file. 136 | 137 | .. 
|pypi| image:: https://badge.fury.io/py/tmtoolkit.svg 138 | :target: https://badge.fury.io/py/tmtoolkit 139 | :alt: PyPI Version 140 | 141 | .. |pypi_downloads| image:: https://img.shields.io/pypi/dm/tmtoolkit 142 | :target: https://pypi.org/project/tmtoolkit/ 143 | :alt: Downloads from PyPI 144 | 145 | .. |runtests| image:: https://github.com/WZBSocialScienceCenter/tmtoolkit/actions/workflows/runtests.yml/badge.svg 146 | :target: https://github.com/WZBSocialScienceCenter/tmtoolkit/actions/workflows/runtests.yml 147 | :alt: GitHub Actions CI Build Status 148 | 149 | .. |coverage| image:: https://raw.githubusercontent.com/WZBSocialScienceCenter/tmtoolkit/master/coverage.svg?sanitize=true 150 | :target: https://github.com/WZBSocialScienceCenter/tmtoolkit/tree/master/tests 151 | :alt: Coverage status 152 | 153 | .. |rtd| image:: https://readthedocs.org/projects/tmtoolkit/badge/?version=latest 154 | :target: https://tmtoolkit.readthedocs.io/en/latest/?badge=latest 155 | :alt: Documentation Status 156 | 157 | .. |zenodo| image:: https://zenodo.org/badge/109812180.svg 158 | :target: https://zenodo.org/badge/latestdoi/109812180 159 | :alt: Citable Zenodo DOI 160 | -------------------------------------------------------------------------------- /conftest.py: -------------------------------------------------------------------------------- 1 | """ 2 | Configuration for tests with pytest 3 | 4 | .. codeauthor:: Markus Konrad 5 | """ 6 | 7 | from hypothesis import settings, HealthCheck 8 | 9 | 10 | # set default timeout deadline 11 | settings.register_profile('default', deadline=5000) 12 | 13 | # profile for CI runs on GitHub machines, which may be slow from time to time so we disable the "too slow" HealthCheck 14 | # and set the timeout deadline very high (60 sec.) 
15 | settings.register_profile('ci', suppress_health_check=(HealthCheck.too_slow, ), deadline=60000) 16 | 17 | # load default settings profile 18 | settings.load_profile('default') 19 | -------------------------------------------------------------------------------- /coverage.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | coverage 17 | coverage 18 | 83% 19 | 83% 20 | 21 | 22 | -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | notebooks: 16 | jupyter nbconvert --to notebook --execute --inplace --ExecutePreprocessor.timeout=600 --PlainTextFormatter.max_seq_length=20 source/*.ipynb 17 | 18 | .PHONY: help Makefile 19 | 20 | # Catch-all target: route all unknown targets to Sphinx using the new 21 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 22 | %: Makefile 23 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 24 | -------------------------------------------------------------------------------- /doc/source/api.rst: -------------------------------------------------------------------------------- 1 | .. _api: 2 | 3 | API 4 | === 5 | 6 | tmtoolkit.bow 7 | ------------- 8 | 9 | tmtoolkit.bow.bow_stats 10 | ^^^^^^^^^^^^^^^^^^^^^^^ 11 | 12 | .. automodule:: tmtoolkit.bow.bow_stats 13 | :members: 14 | 15 | tmtoolkit.bow.dtm 16 | ^^^^^^^^^^^^^^^^^ 17 | 18 | .. 
automodule:: tmtoolkit.bow.dtm 19 | :members: 20 | 21 | 22 | tmtoolkit.corpus 23 | ---------------- 24 | 25 | Corpus class and corpus functions 26 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 27 | 28 | .. automodule:: tmtoolkit.corpus 29 | :members: 30 | :imported-members: 31 | :exclude-members: find_spec, strip_tags, numbertoken_to_magnitude, simplify_unicode_chars, visualize 32 | 33 | Functions to visualize corpus summary statistics 34 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 35 | 36 | .. automodule:: tmtoolkit.corpus.visualize 37 | :members: 38 | 39 | 40 | tmtoolkit.tokenseq 41 | ------------------ 42 | 43 | .. automodule:: tmtoolkit.tokenseq 44 | :members: 45 | 46 | 47 | tmtoolkit.topicmod 48 | ------------------ 49 | 50 | .. automodule:: tmtoolkit.topicmod 51 | :members: 52 | 53 | Evaluation metrics for Topic Modeling 54 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 55 | 56 | .. automodule:: tmtoolkit.topicmod.evaluate 57 | :members: 58 | 59 | 60 | Printing, importing and exporting topic model results 61 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 62 | 63 | .. automodule:: tmtoolkit.topicmod.model_io 64 | :members: 65 | 66 | 67 | Statistics for topic models and BoW matrices 68 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 69 | 70 | .. automodule:: tmtoolkit.topicmod.model_stats 71 | :members: 72 | 73 | 74 | Parallel model fitting and evaluation with lda 75 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 76 | 77 | .. automodule:: tmtoolkit.topicmod.tm_lda 78 | :members: AVAILABLE_METRICS, DEFAULT_METRICS, compute_models_parallel, evaluate_topic_models 79 | 80 | 81 | Parallel model fitting and evaluation with scikit-learn 82 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 83 | 84 | .. 
automodule:: tmtoolkit.topicmod.tm_sklearn 85 | :members: AVAILABLE_METRICS, DEFAULT_METRICS, compute_models_parallel, evaluate_topic_models 86 | 87 | 88 | Parallel model fitting and evaluation with Gensim 89 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 90 | 91 | .. automodule:: tmtoolkit.topicmod.tm_gensim 92 | :members: AVAILABLE_METRICS, DEFAULT_METRICS, compute_models_parallel, evaluate_topic_models 93 | 94 | 95 | Visualize topic models and topic model evaluation results 96 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 97 | 98 | Wordclouds from topic models 99 | """""""""""""""""""""""""""" 100 | 101 | .. autodata:: tmtoolkit.topicmod.visualize.DEFAULT_WORDCLOUD_KWARGS 102 | .. autofunction:: tmtoolkit.topicmod.visualize.generate_wordclouds_for_topic_words 103 | .. autofunction:: tmtoolkit.topicmod.visualize.generate_wordclouds_for_document_topics 104 | .. autofunction:: tmtoolkit.topicmod.visualize.generate_wordcloud_from_probabilities_and_words 105 | .. autofunction:: tmtoolkit.topicmod.visualize.generate_wordcloud_from_weights 106 | .. autofunction:: tmtoolkit.topicmod.visualize.write_wordclouds_to_folder 107 | .. autofunction:: tmtoolkit.topicmod.visualize.generate_wordclouds_from_distribution 108 | 109 | Plot heatmaps for topic models 110 | """""""""""""""""""""""""""""" 111 | 112 | .. autofunction:: tmtoolkit.topicmod.visualize.plot_doc_topic_heatmap 113 | .. autofunction:: tmtoolkit.topicmod.visualize.plot_topic_word_heatmap 114 | .. autofunction:: tmtoolkit.topicmod.visualize.plot_heatmap 115 | 116 | Plot probability distribution rankings for topic models 117 | """"""""""""""""""""""""""""""""""""""""""""""""""""""" 118 | 119 | .. autofunction:: tmtoolkit.topicmod.visualize.plot_topic_word_ranked_prob 120 | .. autofunction:: tmtoolkit.topicmod.visualize.plot_doc_topic_ranked_prob 121 | .. 
autofunction:: tmtoolkit.topicmod.visualize.plot_prob_distrib_ranked_prob 122 | 123 | Plot topic model evaluation results 124 | """"""""""""""""""""""""""""""""""" 125 | 126 | .. autofunction:: tmtoolkit.topicmod.visualize.plot_eval_results 127 | 128 | Other functions 129 | """"""""""""""" 130 | 131 | .. autofunction:: tmtoolkit.topicmod.visualize.parameters_for_ldavis 132 | 133 | 134 | Base classes for parallel model fitting and evaluation 135 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 136 | 137 | .. automodule:: tmtoolkit.topicmod.parallel 138 | :members: 139 | 140 | 141 | tmtoolkit.utils 142 | --------------- 143 | 144 | .. automodule:: tmtoolkit.utils 145 | :members: 146 | -------------------------------------------------------------------------------- /doc/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 
12 | # 13 | 14 | import os 15 | import sys 16 | from datetime import date 17 | 18 | import sphinx_rtd_theme 19 | 20 | sys.path.insert(0, os.path.abspath('../..')) 21 | 22 | 23 | # -- Project information ----------------------------------------------------- 24 | 25 | project = 'tmtoolkit' 26 | copyright = f'{date.today().year}, Markus Konrad' 27 | author = 'Markus Konrad' 28 | 29 | 30 | # -- General configuration --------------------------------------------------- 31 | 32 | # Add any Sphinx extension module names here, as strings. They can be 33 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 34 | # ones. 35 | extensions = [ 36 | 'nbsphinx', 37 | 'sphinx.ext.autodoc', 38 | 'sphinx_rtd_theme' 39 | ] 40 | 41 | # Add any paths that contain templates here, relative to this directory. 42 | templates_path = ['_templates'] 43 | 44 | # List of patterns, relative to source directory, that match files and 45 | # directories to ignore when looking for source files. 46 | # This pattern also affects html_static_path and html_extra_path. 47 | 48 | exclude_patterns = ['**.ipynb_checkpoints'] 49 | 50 | 51 | # If true, '()' will be appended to :func: etc. cross-reference text. 52 | add_function_parentheses = False 53 | 54 | # If true, the current module name will be prepended to all description 55 | # unit titles (such as .. function::). 56 | add_module_names = True 57 | 58 | # type hints 59 | autodoc_typehints = 'description' 60 | autodoc_typehints_format = 'short' 61 | 62 | # The name of the Pygments (syntax highlighting) style to use. 63 | pygments_style = 'sphinx' 64 | 65 | # -- Options for HTML output ------------------------------------------------- 66 | 67 | # The theme to use for HTML and HTML Help pages. See the documentation for 68 | # a list of builtin themes. 
69 | # 70 | 71 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] 72 | html_theme = "sphinx_rtd_theme" 73 | # html_static_path = ['static'] 74 | 75 | # Output file base name for HTML help builder. 76 | htmlhelp_basename = '%sdoc' % project 77 | 78 | # Never skip __init__ 79 | 80 | def skip(app, what, name, obj, would_skip, options): 81 | if name == "__init__": 82 | return False 83 | return would_skip 84 | 85 | def setup(app): 86 | app.connect("autodoc-skip-member", skip) 87 | -------------------------------------------------------------------------------- /doc/source/data/corpus_example/sample1.txt: -------------------------------------------------------------------------------- 1 | This is the first example file. ☺ We showcase NER by just randomly listing famous people like Missy Elliott or George Harrison. 2 | -------------------------------------------------------------------------------- /doc/source/data/corpus_example/sample2.txt: -------------------------------------------------------------------------------- 1 | Here comes the second example (with HTML tags & entities). 2 | 3 | This one contains three lines of plain text which means two paragraphs. -------------------------------------------------------------------------------- /doc/source/data/corpus_example/sample3.txt: -------------------------------------------------------------------------------- 1 | And here we go with the third and final example file. 2 | Another line of text. 3 | 4 | §2. 5 | This is the second paragraph. 6 | 7 | The third and final paragraph. 
-------------------------------------------------------------------------------- /doc/source/data/news_articles_100.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WZBSocialScienceCenter/tmtoolkit/02990865ee896625d5cf540bf2b0dbc159bedf38/doc/source/data/news_articles_100.pickle -------------------------------------------------------------------------------- /doc/source/data/news_articles_100.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WZBSocialScienceCenter/tmtoolkit/02990865ee896625d5cf540bf2b0dbc159bedf38/doc/source/data/news_articles_100.xlsx -------------------------------------------------------------------------------- /doc/source/data/tm_wordclouds/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore everything in this directory 2 | * 3 | # Except this file 4 | !.gitignore 5 | 6 | -------------------------------------------------------------------------------- /doc/source/development.rst: -------------------------------------------------------------------------------- 1 | .. _development: 2 | 3 | Development 4 | =========== 5 | 6 | This part of the documentation serves as developer documentation, i.e. a help for those who want to contribute to the development of the package. 7 | 8 | 9 | Project overview 10 | ---------------- 11 | 12 | This project aims to provide a Python package that allows text processing, text mining and topic modeling with 13 | 14 | - easy installation, 15 | - extensive documentation, 16 | - clear functional programming interface, 17 | - good performance on large datasets. 18 | 19 | All computations need to be performed in memory. Streaming data from disk is not supported so far. 
20 | 21 | The package is written in Python and uses other packages for key tasks: 22 | 23 | - `SpaCy `_ is used for the text processing and text mining tasks 24 | - `lda `_, `gensim `_ or `scikit-learn `_ are used for computing topic models 25 | 26 | The project's packages are published to the `Python Package Index PyPI `_. 27 | 28 | The package's dependencies are only installed on demand. There's a setup routine that provides an interface for easy installation of SpaCy's language models. 29 | 30 | Text processing and normalization is often used to construct a Bag-of-Words (BoW) model which in turn is the input for topic models. 31 | 32 | 33 | Contributing to tmtoolkit 34 | ------------------------- 35 | 36 | If you want to contribute to tmtoolkit, you can create code or documentation patches (updates) and submit them as `pull requests `_ on GitHub. The first thing to do for this is to fork the `GitHub repository `_ and to clone it on your local machine. It's best to create a separate branch for your updates next. You should then set up your local machine for development as follows: 37 | 38 | - create a `Python virtual environment `_ – make sure that the Python version you're using for this is supported by tmtoolkit 39 | - update pip via ``pip install -U pip`` 40 | - if you're planning to contribute to the code or to the tutorials in the documentation: 41 | 42 | - install *all* dependencies via ``pip install -r requirements.txt`` 43 | - run the tmtoolkit setup routine via ``python -m tmtoolkit setup all`` to install the required language models 44 | - check that everything works by running all tests via ``pytest tests/`` 45 | 46 | - if you're *only* planning to contribute to the documentation (without the tutorials which are Jupyter Notebooks): 47 | 48 | - install dependencies for documentation via ``pip install -r requirements_doc.txt`` 49 | 50 | You can then start working on the code or documentation. 
Make sure to run the tests and/or create new tests when you provide code updates in your pull request. You should also read this developer documentation completely before diving into the code. 51 | 52 | 53 | Folder structure 54 | ---------------- 55 | 56 | The project's root folder contains files for documentation generation (``.readthedocs.yaml``), testing (``conftest.py``, ``coverage.svg``, ``tox.ini``) as well as project management and package building (``Makefile``, ``MANIFEST.in``, ``setup.py``). The subfolders include: 57 | 58 | - ``.github/workflows``: provides Continuous Integration (CI) configuration for *GitHub Actions*, 59 | - ``doc``: documentation source and built documentation files, 60 | - ``examples``: example scripts and data to show some of the features (most features are better explained in the tutorial which is part of the documentation), 61 | - ``scripts``: scripts used for preparing datasets that come along with the package, 62 | - ``tests``: test suite, 63 | - ``tmtoolkit``: package source code. 64 | 65 | 66 | Packaging and dependency management 67 | ----------------------------------- 68 | 69 | This package uses `setuptools `_ for packaging. All package metadata and dependencies are defined in ``setup.py``. Since tmtoolkit allows installing dependencies on demand, there are several installation options defined in ``setup.py``. For development, the most important are: 70 | 71 | - ``[dev]``: installs packages for development and packaging 72 | - ``[test]``: installs packages for testing tmtoolkit 73 | - ``[doc]``: installs packages for generating the documentation 74 | - ``[all]``: installs all required and optional packages – recommended for development 75 | 76 | The ``requirements.txt`` and ``requirements_doc.txt`` files simply point to the ``[all]`` and ``[doc]`` installation options.
77 | 78 | The ``Makefile`` in the root folder contains targets for generating a Python *Wheel* package (``make wheel``) and a Python source distribution package (``make sdist``). 79 | 80 | 81 | Built-in datasets 82 | ----------------- 83 | 84 | All built-in datasets reside in ``tmtoolkit/data/LANGUAGE_CODE/``, where ``LANGUAGE_CODE`` is an ISO language code. For the `ParlSpeech V2 `_ datasets, the samples are generated via the R script ``scripts/prepare_corpora.R``. The `News Articles `_ dataset is used without further processing. 85 | 86 | 87 | Automated testing 88 | ----------------- 89 | 90 | The tmtoolkit package relies on the following packages for testing: 91 | 92 | - `pytest `_ as testing framework, 93 | - `hypothesis `_ for property-based testing, 94 | - `coverage `_ for measuring test coverage of the code, 95 | - `tox `_ for checking packaging and running tests in different virtual environments. 96 | 97 | All tests are implemented in the ``tests`` directory and prefixed by ``test_``. The ``conftest.py`` file contains project-wide test configuration. The ``tox.ini`` file contains configuration for setting up the virtual environments for tox. For each release, tmtoolkit aims to support the last three major Python release versions, e.g. 3.8, 3.9 and 3.10, and all of these are tested with tox along with different dependency configurations from *minimal* to *full*. To use different versions of Python on the same system, it's recommended to use the `deadsnakes repository `_ on Ubuntu or Debian Linux. 98 | 99 | The ``Makefile`` in the root folder contains a target for generating coverage reports and the coverage badge (``make cov_tests``). 100 | 101 | 102 | Documentation 103 | ------------- 104 | 105 | The `Sphinx `_ package is used for documentation. All objects exposed by the API are documented in the Sphinx format. All other parts of the documentation reside in ``doc/source``. The configuration for Sphinx lies in ``doc/source/conf.py``.
The `nbsphinx `_ package is used for generating the tutorial from Jupyter Notebooks which are also located in ``doc/source``. 106 | 107 | The ``Makefile`` in the ``doc`` folder has several targets for generating the documentation. These are: 108 | 109 | - ``make notebooks`` – run all notebooks to generate their outputs; these are stored in-place 110 | - ``make clean`` – remove everything under ``doc/build`` 111 | - ``make html`` – generate the HTML documentation from the documentation source 112 | 113 | The generated documentation then resides under ``doc/build``. 114 | 115 | The documentation is published at `tmtoolkit.readthedocs.io `_. For this, new commits to the master branch of the GitHub project or new tags are automatically built by `readthedocs.org `_. The ``.readthedocs.yaml`` file in the root folder sets up the build process for readthedocs.org. 116 | 117 | 118 | Continuous integration 119 | ---------------------- 120 | 121 | Continuous integration routines are defined via `GitHub Actions (GA) `_. For tmtoolkit, this so far only means automatic testing for new commits and releases on different machine configurations. 122 | 123 | The GA setup for the tests is done in ``.github/workflows/runtests.yml``. There are "minimal" and "full" test suites for Ubuntu, MacOS and Windows with Python versions 3.8, 3.9 and 3.10 each, which means 18 jobs are spawned. Again, tox is used for running the tests on the machines. 124 | 125 | 126 | Release management 127 | ------------------ 128 | 129 | Publishing a new release for tmtoolkit involves several steps, listed below. You may consider creating a `pre-release `_ for PyPI first before publishing a final release. 130 | 131 | 1.
Preparation: 132 | 133 | - create a new branch for the release version X.Y.Z as ``releaseX.Y.Z`` 134 | - check if there are new minimum version requirements for dependencies or generally new dependencies to be added in ``setup.py`` 135 | - check if the compatible Python versions should be updated in ``setup.py`` 136 | - set the new version in ``setup.py`` and ``tmtoolkit/__init__.py`` 137 | 138 | 2. Documentation updates: 139 | 140 | - check and possibly update the tutorials – do all code examples still work and are all important features covered? 141 | - update documentation 142 | - update README 143 | - update changelog (``doc/source/version_history.rst``) 144 | 145 | 3. Testing: 146 | 147 | - run examples and check if they work 148 | - run tests locally via tox 149 | - push to GitHub repository ``develop`` or ``release*`` branch to run tests via GitHub Actions 150 | 151 | 4. Publish package to PyPI: 152 | 153 | - build source distribution via ``make sdist`` 154 | - build wheel via ``make wheel`` 155 | - check both via ``twine check dist/...`` 156 | - if checks passed, upload both to PyPI via ``twine upload dist/...`` 157 | 158 | 5. Finalization 159 | 160 | - make a new tag for the new version via ``git tag -a vX.Y.Z -m "version X.Y.Z"`` 161 | - push the new tag to the GitHub repository 162 | - create a new release from the tag in the GitHub repository 163 | - merge the development or release branch with the master branch and push the master branch to the GitHub repository 164 | - log in to `readthedocs.org `_, go to the project page, activate the current version, let it build the documentation 165 | - verify documentation on `tmtoolkit.readthedocs.io `_ 166 | 167 | If you notice a (major) mistake in a release *after* publication, you have several options like yanking the release on PyPI, publishing a post-release or updating the build number of the wheel. See `this blog post `_ for more information about these options. 
168 | 169 | 170 | API style 171 | --------- 172 | 173 | The tmtoolkit package provides a *functional API*. This is quite different from object-oriented APIs that are found in many other Python packages, where a programmer mainly uses classes and their methods that are exposed by an API. The tmtoolkit API on the other hand mainly exposes data structures and functions that operate on these data structures. In tmtoolkit, Python classes are usually used to implement more complex data structures such as documents or document corpora, but these classes don't provide (public) methods. Rather, they are used as function arguments, for example as in the large set of *corpus functions* that operate on text corpora as explained below. 174 | 175 | 176 | Implementation details 177 | ---------------------- 178 | 179 | Top-level module and setup routine 180 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 181 | 182 | The ``__main__.py`` file provides a command-line interface for the package. Its only purpose is to allow easy installation of SpaCy language models via the :ref:`setup routine `. The ``tokenseq`` module provides functions that operate on single (string) tokens or sequences of tokens. These functions are used mainly internally in the ``corpus`` module, but are also exposed by the API to be used by a package user. The ``utils.py`` module provides helper functions used internally throughout the package, but also to be possibly used by a package user. 183 | 184 | ``bow`` module 185 | ^^^^^^^^^^^^^^ 186 | 187 | This module provides functions for generating document-term-matrices (DTMs), which are central to the BoW concept, and some common statistics used for these matrices. 188 | 189 | ``corpus`` module 190 | ^^^^^^^^^^^^^^^^^ 191 | 192 | This is the central module for text processing and text mining. 193 | 194 | At the core of this module, there is the :class:`~tmtoolkit.corpus.Corpus` class implemented in ``corpus/_corpus.py``. It takes documents with raw text as input (i.e.
a dict mapping *document labels* to text strings) and applies a SpaCy NLP pipeline to it. After that, the corpus consists of :class:`~tmtoolkit.corpus.Document` (implemented in ``corpus/_document.py``) objects which contain the textual data in tokenized form, i.e. as a sequence of *tokens* (roughly translated as "words" but other text contents such as numbers and punctuation also form separate tokens). Each token comes along with several *token attributes* which were estimated using the NLP pipeline. Examples for token attributes include the Part-of-Speech tag or the lemma. 195 | 196 | The :class:`~tmtoolkit.corpus.Document` class stores the tokens and their "standard" attributes in a *token matrix*. This matrix is of shape *(N, M)* for *N* tokens and with *M* attributes. There are at least 2 or 3 attributes: ``whitespace`` (boolean – is there a whitespace after the token?), ``token`` (the actual token, i.e. "word" type) and optionally ``sent_start`` (only given when sentence information is parsed in the NLP pipeline). 197 | 198 | The token matrix is a *uint64* matrix as it stores all information as *64 bit hash values*. Compared to sequences of strings, this reduces memory usage and allows faster computations and data modifications. E.g., when you transform a token (lets say "Hello" to "hello"), you only do one transformation, calculate one new hash value and replace each occurrence of the old hash with the new hash. The hashes are calculated with SpaCy's `hash_string `_ function. For fast conversion between token/attribute hashes and strings, the mappings are stored in a *bidirectional dictionary* using the `bidict `_ package. Each column, i.e. each attribute, in the token matrix has a separate bidict in the ``bimaps`` dictionary that is shared between a corpus and each Document object. Using bidict proved to be *much* faster than using SpaCy's built in `Vocab / StringStore `_. 
199 | 200 | Besides "standard" token attributes that come from the SpaCy NLP pipeline, a user may also add custom token attributes. These are stored in each document's :attr:`~tmtoolkit.corpus.Document.custom_token_attrs` dictionary that maps an attribute name to a NumPy array. These arrays are of arbitrary type and don't use the hashing approach. Besides token attributes, there are also *document attributes*. These are attributes attached to each document, for example the *document label* (unique document identifier). Custom document attributes can be added, e.g. to record the publication year of a document. Document attributes can also be of any type and are not hashed. 201 | 202 | The :class:`~tmtoolkit.corpus.Corpus` class implements a data structure for text corpora with named documents. All these documents are stored in the corpus as :class:`~tmtoolkit.corpus.Document` objects. *Corpus functions* allow operating on Corpus objects. They are implemented in ``corpus/_corpusfuncs.py``. All corpus functions that transform/modify a corpus have an ``inplace`` argument, by default set to ``True``. If ``inplace`` is set to ``True``, the corpus will be directly modified in-place, i.e. modifying the input corpus. If ``inplace`` is set to ``False``, a copy of the input corpus is created and all modifications are applied to this copy. The original input corpus is not altered in that case. The ``corpus_func_inplace_opt`` decorator is used to mark corpus functions with the in-place option. 203 | 204 | The :class:`~tmtoolkit.corpus.Corpus` class provides parallel processing capabilities for processing large data amounts. This can be controlled with the ``max_workers`` argument. Parallel processing is then enabled at two stages: First, it is simply enabled for the SpaCy NLP pipeline by setting up the pipeline accordingly. Second, a *reusable process pool executor* is created by the means of `loky `_.
This process pool is then used in corpus functions whenever parallel execution is beneficial over serial execution. The ``parallelexec`` decorator is used to mark (inner) functions for parallel execution. 205 | 206 | 207 | ``topicmod`` module 208 | ^^^^^^^^^^^^^^^^^^^ 209 | 210 | This is the central module for computing, evaluating and analyzing topic models. 211 | 212 | In ``topicmod/evaluate.py`` there are mainly several evaluation metrics for topic models implemented. Topic models can be computed and evaluated in parallel, the base code for that is in ``topicmod/parallel.py``. Three modules use the base classes from ``topicmod/parallel.py`` to implement interfaces to popular topic modeling packages: 213 | 214 | - ``topicmod/tm_gensim.py`` for `gensim `_ 215 | - ``topicmod/tm_lda.py`` for `lda `_ 216 | - ``topicmod/tm_sklearn.py`` for `scikit-learn `_ 217 | -------------------------------------------------------------------------------- /doc/source/index.rst: -------------------------------------------------------------------------------- 1 | .. tmtoolkit documentation master file, created by 2 | sphinx-quickstart on Tue Aug 27 11:30:06 2019. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | .. include:: intro.rst 7 | 8 | .. include:: license_note.rst 9 | 10 | .. toctree:: 11 | :maxdepth: 4 12 | :caption: Contents: 13 | 14 | install 15 | getting_started 16 | text_corpora 17 | preprocessing 18 | bow 19 | topic_modeling 20 | api 21 | development 22 | version_history 23 | 24 | Indices and tables 25 | ================== 26 | 27 | * :ref:`genindex` 28 | * :ref:`modindex` 29 | * :ref:`search` 30 | -------------------------------------------------------------------------------- /doc/source/install.rst: -------------------------------------------------------------------------------- 1 | .. 
_install: 2 | 3 | Installation 4 | ============ 5 | 6 | Requirements 7 | ------------ 8 | 9 | **tmtoolkit works with Python 3.8 or newer (tested up to Python 3.10).** 10 | 11 | Requirements are automatically installed via *pip* as described below. Additional packages can also be installed 12 | via *pip* for certain use cases (see :ref:`optional_packages`). 13 | 14 | 15 | Installation instructions 16 | ------------------------- 17 | 18 | The package *tmtoolkit* is available on `PyPI `_ and can be installed via 19 | Python package manager *pip*. It is highly recommended to install tmtoolkit and its dependencies in a separate 20 | `Python Virtual Environment ("venv") `_ and upgrade to the latest 21 | *pip* version (you may also choose to install 22 | `virtualenvwrapper `_, which makes managing venvs a lot 23 | easier). 24 | 25 | Creating and activating a venv *without* virtualenvwrapper: 26 | 27 | .. code-block:: text 28 | 29 | python3 -m venv myenv 30 | 31 | # activating the environment (on Windows type "myenv\Scripts\activate.bat") 32 | source myenv/bin/activate 33 | 34 | Alternatively, creating and activating a venv *with* virtualenvwrapper: 35 | 36 | .. code-block:: text 37 | 38 | mkvirtualenv myenv 39 | 40 | # activating the environment 41 | workon myenv 42 | 43 | Upgrading pip (*only* do this when you've activated your venv): 44 | 45 | .. code-block:: text 46 | 47 | pip install -U pip 48 | 49 | The tmtoolkit package is highly modular and tries to install as few software dependencies as possible. So in order to 50 | install tmtoolkit, you can first choose if you want a minimal installation or install a recommended set of 51 | packages that enable most features. For the recommended installation, you can type **one of the following**, depending 52 | on the preferred package for topic modeling: 53 | 54 | .. 
code-block:: text 55 | 56 | # recommended installation without topic modeling 57 | pip install -U "tmtoolkit[recommended]" 58 | 59 | # recommended installation with "lda" for topic modeling 60 | pip install -U "tmtoolkit[recommended,lda]" 61 | 62 | # recommended installation with "scikit-learn" for topic modeling 63 | pip install -U "tmtoolkit[recommended,sklearn]" 64 | 65 | # recommended installation with "gensim" for topic modeling 66 | pip install -U "tmtoolkit[recommended,gensim]" 67 | 68 | # you may also select several topic modeling packages 69 | pip install -U "tmtoolkit[recommended,lda,sklearn,gensim]" 70 | 71 | The **minimal** installation will only install a base set of dependencies and will only enable the modules for BoW 72 | statistics, token sequence operations, topic modeling and utility functions. You can install it as follows: 73 | 74 | .. code-block:: text 75 | 76 | # alternative installation if you only want to install a minimum set of dependencies 77 | pip install -U tmtoolkit 78 | 79 | .. note:: The tmtoolkit package is about 7MB big, because it contains some example corpora. 80 | 81 | .. _setup: 82 | 83 | **After that, you should initially run tmtoolkit's setup routine.** This makes sure that all required data files are 84 | present and downloads them if necessary. You should specify a list of languages for which language models should be 85 | downloaded and installed. The list of available language models corresponds with the models provided by 86 | `SpaCy `_ (except for "multi-language"). You need to specify the two-letter ISO 87 | language code for the language models that you want to install. **Don't use spaces in the list of languages.** 88 | E.g. in order to install models for English and German: 89 | 90 | .. code-block:: text 91 | 92 | python -m tmtoolkit setup en,de 93 | 94 | To install *all* available language models, you can run: 95 | 96 | .. code-block:: text 97 | 98 | python -m tmtoolkit setup all 99 | 100 | .. 
_optional_packages: 101 | 102 | Optional packages 103 | ----------------- 104 | 105 | For additional features, you can install further packages using the following installation options: 106 | 107 | - ``pip install -U tmtoolkit[textproc_extra]`` for Unicode normalization and simplification and for stemming with *nltk* 108 | - ``pip install -U tmtoolkit[wordclouds]`` for generating word clouds 109 | - ``pip install -U tmtoolkit[lda]`` for topic modeling with LDA 110 | - ``pip install -U tmtoolkit[sklearn]`` for topic modeling with scikit-learn 111 | - ``pip install -U tmtoolkit[gensim]`` for topic modeling and additional evaluation metrics with Gensim 112 | - ``pip install -U tmtoolkit[topic_modeling_eval_extra]`` for topic modeling evaluation metrics ``griffiths_2004`` and 113 | ``held_out_documents_wallach09`` (see further information below) 114 | 115 | For LDA evaluation metrics ``griffiths_2004`` and ``held_out_documents_wallach09`` it is necessary to install 116 | `gmpy2 `_ for multiple-precision arithmetic. This in turn requires installing some C 117 | header libraries for GMP, MPFR and MPC. On Debian/Ubuntu systems this is done with: 118 | 119 | .. code-block:: text 120 | 121 | sudo apt install libgmp-dev libmpfr-dev libmpc-dev 122 | -------------------------------------------------------------------------------- /doc/source/intro.rst: -------------------------------------------------------------------------------- 1 | tmtoolkit: Text mining and topic modeling toolkit 2 | ================================================= 3 | 4 | |pypi| |pypi_downloads| |rtd| |runtests| |coverage| |zenodo| 5 | 6 | *tmtoolkit* is a set of tools for text mining and topic modeling with Python developed especially for the use in the 7 | social sciences, in journalism or related disciplines. 
It aims for easy installation, extensive documentation 8 | and a clear programming interface while offering good performance on large datasets by the means of vectorized 9 | operations (via NumPy) and parallel computation (using Python's *multiprocessing* module and the 10 | `loky `_ package). The basis of tmtoolkit's text mining capabilities is built around 11 | `SpaCy `_, which offers `many language models `_. Currently, 12 | the following languages are supported for text mining: 13 | 14 | - Catalan 15 | - Chinese 16 | - Danish 17 | - Dutch 18 | - English 19 | - French 20 | - German 21 | - Greek 22 | - Italian 23 | - Japanese 24 | - Lithuanian 25 | - Macedonian 26 | - Norwegian Bokmål 27 | - Polish 28 | - Portuguese 29 | - Romanian 30 | - Russian 31 | - Spanish 32 | 33 | The documentation for tmtoolkit is available on `tmtoolkit.readthedocs.org `_ and 34 | the GitHub code repository is on 35 | `github.com/WZBSocialScienceCenter/tmtoolkit `_. 36 | 37 | Features 38 | -------- 39 | 40 | Text preprocessing and text mining 41 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 42 | 43 | The tmtoolkit package offers several text preprocessing and text mining methods, including: 44 | 45 | - `tokenization, sentence segmentation, part-of-speech (POS) tagging, named-entity recognition (NER) `_ (via SpaCy) 46 | - `lemmatization and token normalization `_ 47 | - extensive `pattern matching capabilities `_ 48 | (exact matching, regular expressions or "glob" patterns) to be used in many 49 | methods of the package, e.g.
for filtering on token or document level, or for 50 | `keywords-in-context (KWIC) `_ 51 | - adding and managing 52 | `custom document and token attributes `_ 53 | - accessing text corpora along with their 54 | `document and token attributes as dataframes `_ 55 | - calculating and `visualizing corpus summary statistics `_ 56 | - finding out and joining `collocations `_ 57 | - `splitting and sampling corpora `_ 58 | - generating `n-grams `_ 59 | - generating `sparse document-term matrices `_ 60 | 61 | Wherever possible and useful, these methods can operate in parallel to speed up computations with large datasets. 62 | 63 | Topic modeling 64 | ^^^^^^^^^^^^^^ 65 | 66 | - `model computation in parallel `_ for different corpora 67 | and/or parameter sets 68 | - support for `lda `_, 69 | `scikit-learn `_ 70 | and `gensim `_ topic modeling backends 71 | - `evaluation of topic models `_ (e.g. in order to find an optimal number 72 | of topics for a given dataset) using several implemented metrics: 73 | 74 | - model coherence (`Mimno et al. 2011 `_) or with 75 | `metrics implemented in Gensim `_ 76 | - KL divergence method (`Arun et al. 2010 `_) 77 | - probability of held-out documents (`Wallach et al. 2009 `_) 78 | - pair-wise cosine distance method (`Cao Juan et al. 2009 `_) 79 | - harmonic mean method (`Griffiths, Steyvers 2004 `_) 80 | - the loglikelihood or perplexity methods natively implemented in lda, sklearn or gensim 81 | 82 | - `plotting of evaluation results `_ 83 | - `common statistics for topic models `_ such as 84 | word saliency and distinctiveness (`Chuang et al. 2012 `_), topic-word 85 | relevance (`Sievert and Shirley 2014 `_) 86 | - `finding / filtering topics with pattern matching `_ 87 | - `export estimated document-topic and topic-word distributions to Excel 88 | `_ 89 | - `visualize topic-word distributions and document-topic distributions `_ 90 | as word clouds or heatmaps 91 | - model coherence (`Mimno et al.
2011 `_) for individual topics 92 | - integrate `PyLDAVis `_ to visualize results 93 | 94 | 95 | Other features 96 | ^^^^^^^^^^^^^^ 97 | 98 | - loading and cleaning of raw text from 99 | `text files, tabular files (CSV or Excel), ZIP files or folders `_ 100 | - `splitting and joining documents `_ 101 | - `common statistics and transformations for document-term matrices `_ like word cooccurrence and *tf-idf* 102 | 103 | 104 | Limits 105 | ------ 106 | 107 | - only languages are supported, for which `SpaCy language models `_ are available 108 | - all data must reside in memory, i.e. no streaming of large data from the hard disk (which for example 109 | `Gensim `_ supports) 110 | 111 | 112 | Built-in datasets 113 | ----------------- 114 | 115 | Currently tmtoolkit comes with the following built-in datasets which can be loaded via 116 | :meth:`~tmtoolkit.corpus.Corpus.from_builtin_corpus`: 117 | 118 | - *"en-NewsArticles"*: `News Articles `_ 119 | *(Dai, Tianru, 2017, "News Articles", https://doi.org/10.7910/DVN/GMFCTR, Harvard Dataverse, V1)* 120 | - random samples from `ParlSpeech V2 `_ 121 | *(Rauh, Christian; Schwalbach, Jan, 2020, "The ParlSpeech V2 data set: Full-text corpora of 6.3 million parliamentary speeches in the key legislative chambers of nine representative democracies", https://doi.org/10.7910/DVN/L4OAKN, Harvard Dataverse)* for different languages: 122 | 123 | - *"de-parlspeech-v2-sample-bundestag"* 124 | - *"en-parlspeech-v2-sample-houseofcommons"* 125 | - *"es-parlspeech-v2-sample-congreso"* 126 | - *"nl-parlspeech-v2-sample-tweedekamer"* 127 | 128 | 129 | About this documentation 130 | ------------------------ 131 | 132 | This documentation guides you in several chapters from installing tmtoolkit to its specific use cases and shows some 133 | examples with built-in corpora and other datasets. All "hands on" chapters from 134 | `Getting started `_ to `Topic modeling `_ are generated from 135 | `Jupyter Notebooks `_. 
If you want to follow along using these notebooks, you can download them 136 | from the `GitHub repository `_. 137 | 138 | There are also a few other examples as plain Python scripts available in the 139 | `examples folder `_ of the GitHub repository. 140 | 141 | 142 | .. |pypi| image:: https://badge.fury.io/py/tmtoolkit.svg 143 | :target: https://badge.fury.io/py/tmtoolkit 144 | :alt: PyPI Version 145 | 146 | .. |pypi_downloads| image:: https://img.shields.io/pypi/dm/tmtoolkit 147 | :target: https://pypi.org/project/tmtoolkit/ 148 | :alt: Downloads from PyPI 149 | 150 | .. |runtests| image:: https://github.com/WZBSocialScienceCenter/tmtoolkit/actions/workflows/runtests.yml/badge.svg 151 | :target: https://github.com/WZBSocialScienceCenter/tmtoolkit/actions/workflows/runtests.yml 152 | :alt: GitHub Actions CI Build Status 153 | 154 | .. |coverage| image:: https://raw.githubusercontent.com/WZBSocialScienceCenter/tmtoolkit/master/coverage.svg?sanitize=true 155 | :target: https://github.com/WZBSocialScienceCenter/tmtoolkit/tree/master/tests 156 | :alt: Coverage status 157 | 158 | .. |rtd| image:: https://readthedocs.org/projects/tmtoolkit/badge/?version=latest 159 | :target: https://tmtoolkit.readthedocs.io/en/latest/?badge=latest 160 | :alt: Documentation Status 161 | 162 | .. |zenodo| image:: https://zenodo.org/badge/109812180.svg 163 | :target: https://zenodo.org/badge/latestdoi/109812180 164 | :alt: Citable Zenodo DOI 165 | -------------------------------------------------------------------------------- /doc/source/license_note.rst: -------------------------------------------------------------------------------- 1 | License 2 | ======= 3 | 4 | Code licensed under `Apache License 2.0 `_. 5 | See `LICENSE `_ file. 6 | 7 | -------------------------------------------------------------------------------- /doc/source/version_history.rst: -------------------------------------------------------------------------------- 1 | .. 
_changes: 2 | 3 | Version history 4 | =============== 5 | 6 | 0.11.2 - 2022-03-11 7 | ------------------- 8 | 9 | - updated `Arun et al. 2010 `_ topic model evaluation metric to better follow the paper's instructions instead of the implementation adapted from the *ldatuning* package (see `this discussion `_ – many thanks to `@hkimber `_) 10 | - updated `Mimno et al. 2011 `_ topic model evaluation metric's default argument values to be the same as used in the original paper; added an optional argument to include word probabilities into the calculations 11 | - added an example with topic model evaluation for the `AP and NIPS datasets `_ 12 | - added a `developer documentation `_ 13 | 14 | 0.11.1 - 2022-02-10 15 | ------------------- 16 | 17 | - show better error messages when dependencies for optional module ``corpus`` are not met 18 | - fix a SciPy deprecation warning 19 | 20 | 0.11.0 - 2022-02-08 21 | ------------------- 22 | 23 | This release brings several major API changes to the text loading, text preprocessing and text mining parts of 24 | tmtoolkit. All these features are now in a single sub-module, ``corpus``. This module contains a ``Corpus`` class which 25 | holds ``Document`` objects. All text processing and text mining operations can be performed on ``Corpus`` objects. These 26 | operations are implemented as a functional API in the ``corpus`` sub-module. 27 | 28 | It is advisable to re-install tmtoolkit in a new virtual environment following the 29 | :ref:`installation instructions `. Make sure to run ``python -m tmtoolkit setup ``, where 30 | ```` is a list of language codes like ``en,fr``. 
31 | 32 | Further changes include: 33 | 34 | - added new functions for identifying and joining token collocations 35 | - added new functions for visualizing corpus summary statistics 36 | - added new function ``find_documents`` 37 | - added new text normalization functions ``normalize_unicode``, ``simplify_unicode``, ``numbers_to_magnitudes`` 38 | - added support for sentences 39 | - added support for using all SpaCy token attributes 40 | - added common ``select`` argument for many text processing/mining functions to operate only on a subset of documents 41 | - added common ``as_table`` argument for many text processing/mining functions to operate to convert the result to a 42 | (sorted) dataframe 43 | - added common ``proportions`` argument for many text processing/mining functions to convert resulting frequencies to 44 | proportions or log proportions 45 | - added common ``inplace`` argument for many text processing/mining functions to either transform a corpus in-place or 46 | return a transformed copy 47 | - added 6 new languages now supported by SpaCy (Catalan, Danish, Macedonian, Polish, Romanian, Russian) 48 | - added new function ``corpus_join_documents`` for joining documents 49 | - added option for calculating log probabilities or proportions 50 | - fixed log probability calculations for higher precision in BoW statistics and topic model evaluation functions 51 | - dependencies for text processing and text mining are now optional 52 | - added function for easier logging: ``enable_logging`` 53 | - moved all functions that operate on string or numeric sequences to ``tokenseq`` sub-module 54 | - all glob patterns now use ``EXACT`` flag 55 | - added type annotations for ``corpus``, ``tokenseq`` and ``utils`` sub-modules 56 | - updated dependencies (only SpaCy 3.2 or higher is now supported) 57 | - updated minimum Python requirements (Python 3.8 or higher) 58 | - removed datatable support 59 | 60 | 61 | 0.10.0 - 2020-08-03 62 | ------------------- 63 | 64 | 
This release marks a switch from NLTK to `SpaCy `_ for text preprocessing tasks. With this change, 65 | much more languages are supported (see `this list `_). It is advisable to re-install tmtoolkit 66 | in a new virtual environment following the :ref:`installation instructions `. Make sure to run 67 | ``python -m tmtoolkit setup ``, where ```` is a list of language codes like ``en,fr``. 68 | 69 | Further changes: 70 | 71 | * added support for word and document vectors via SpaCy 72 | * added built-in datasets available via ``Corpus`` class 73 | * added ``ldamodel_top_word_topics`` and ``ldamodel_top_topic_docs`` functions 74 | * added new filter functions and options for ``TMPreproc`` 75 | * made stemming function optional (only available when NLTK is installed) 76 | * run DTM generation in parallel 77 | * updated dependencies 78 | * restructured tests 79 | 80 | 81 | 0.9.0 - 2019-12-20 82 | ------------------ 83 | 84 | * added usage and API documentation 85 | * added support for Arun 2010 metric in `tm_gensim` (thx to @mcooper) 86 | * added support for `datatable package `_ 87 | * added functional API for text preprocessing 88 | * added KWIC in text preprocessing 89 | * added post-installation setup routine to download necessary data files 90 | * added built-in corpora 91 | * added `sorted_terms` and `sorted_terms_data_table` to `bow_stats` 92 | * added `glue_tokens` function 93 | * retain sparse matrices in several `bow_stats` functions such as tfidf 94 | * corpus module: loading of CSV and ZIP files, added several other new methods 95 | * faster `get_dtm` (now works in parallel) 96 | * `filter_tokens` / `filter_documents` accept multiple patterns at once 97 | * lots of (partly **breaking**) changes and speed improvements in `TMPreproc` 98 | * fixed error with `ignore_case` being ignored in `token_match` for regex and glob 99 | * integrate tox 100 | * use Numpy extras for hypothesis tests 101 | * compatibility with Python 3.6, 3.7 and 3.8 102 | 103 | 104 | 0.8.0 - 
2019-02-05 105 | ------------------ 106 | 107 | * faster package and sub-module import 108 | * remove support for Python 2.7 (now only Python 3.5 and higher is supported) 109 | * use `germalemma package `_ 110 | * use importlib instead of deprecated imp 111 | * fix problem with not installing all required packages 112 | 113 | 114 | 0.7.3 - 2018-09-17 (last release to support Python 2.7) 115 | ------------------------------------------------------- 116 | 117 | * new options in `corpus` module for converting Windows linebreaks to Unix linebreaks 118 | 119 | 0.7.2 - 2018-07-23 120 | ------------------ 121 | 122 | * new option for `exclude_topics`: `return_new_topic_mapping` 123 | * fixed `issue #7 `_ (results entry about model gets overwritten) 124 | 125 | 0.7.1 - 2018-06-18 126 | ------------------ 127 | 128 | * fix stupid missing import 129 | 130 | 0.7.0 - 2018-06-18 131 | ------------------ 132 | 133 | * added sub-package `bow` with functions for DTM creation and statistics 134 | * fixed problems with evaluation and parallel calculation of gensim models (#5) 135 | * added Gensim evaluation example 136 | 137 | 0.6.3 - 2018-06-01 138 | ------------------ 139 | 140 | * made `get_vocab_and_terms` more memory-efficient 141 | * updated requirements (fixes #6) 142 | 143 | 0.6.2 - 2018-04-27 144 | ------------------ 145 | 146 | * added new function `exclude_topics` to `model_stats` 147 | 148 | 0.6.1 - 2018-04-27 149 | ------------------ 150 | 151 | * better figure title placement, grouped subplots and other improvements in `plot_eval_results` 152 | * bugfix in `model_stats` due to missing unicode literals 153 | 154 | 0.6.0 - 2018-04-25 155 | ------------------ 156 | 157 | * **API restructured: (uninstall package first when upgrading!)** 158 | * sub-package `lda_utils` is now called `topicmod` 159 | * no more `common` module in `topicmod` -> divided into `evaluate` (including evaluation metrics from former `eval_metrics`), `model_io`, `model_stats`, and `parallel` 160 | * 
added coherence metrics `PR #2 `_ 161 | * implemented modified coherence metric according to Mimno et al. 2011 as `metric_coherence_mimno_2011` 162 | * added wrapper function for coherence model provided by Gensim as `metric_coherence_gensim` 163 | * added evaluation metric with probability of held-out documents in cross-validation (see `metric_held_out_documents_wallach09`) 164 | * added new example for topic model coherence 165 | * updated examples 166 | 167 | 0.5.0 - 2018-02-13 168 | ------------------ 169 | 170 | * add `doc_paths` field to `Corpus` 171 | * change `plot_eval_results` to show individual metrics' results as subplots – **function signature changed!** 172 | 173 | 0.4.2 - 2018-02-06 174 | ------------------ 175 | 176 | * made greedy partitioning much more efficient (i.e. faster work distribution) 177 | * added package information variables 178 | * added this CHANGES document :) 179 | 180 | 0.4.1 - 2018-01-24 181 | ------------------ 182 | 183 | * fixed bug in `lda_utils.common.ldamodel_full_doc_topics` 184 | * added `topic_labels` for doc-topic heatmap 185 | * minor documentation fixes 186 | 187 | 0.4.0 - 2018-01-18 188 | ------------------ 189 | 190 | * improved parameter checks for `TMPreproc.filter_for_pos` 191 | * improved tests for `TMPreproc.filter_for_pos` 192 | * fixed broken test in Python 2.x 193 | * added `generate_topic_labels_from_top_words` 194 | * speed up in `top_n_from_distribution` 195 | * added relevance score calculation (Sievert et al 2014) 196 | * added functions to get most/least distinctive words 197 | * added saliency calculation 198 | * allow to define axis labels and plot title in `plot_eval_results` 199 | 200 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Examples 2 | 3 | This folder contains very few examples for *tmtoolkit*. 
from datetime import datetime


# timestamps recorded via add_timing() and their matching labels;
# both lists are kept in lockstep (index i of one corresponds to index i of the other)
timings = []
timing_labels = []


def add_timing(label):
    """Record the current timestamp under *label* for later reporting via :func:`print_timings`."""
    timings.append(datetime.today())
    timing_labels.append(label)


def print_timings():
    """Print the elapsed seconds between each pair of consecutive recorded timings, plus the total."""
    print('timings:')

    recorded = list(zip(timings, timing_labels))
    total = 0

    # walk consecutive (timestamp, label) pairs; the delta is attributed to the later label
    for (prev_t, _), (t, label) in zip(recorded, recorded[1:]):
        delta = (t - prev_t).total_seconds()
        print('%s: %.2f sec' % (label, delta))
        total += delta

    print('total: %.2f sec' % total)
3 | 4 | This examples requires that you have installed tmtoolkit with the recommended set of packages and have installed an 5 | English language model for spaCy: 6 | 7 | pip install -U "tmtoolkit[recommended]" 8 | python -m tmtoolkit setup en 9 | 10 | For more information, see the installation instructions: https://tmtoolkit.readthedocs.io/en/latest/install.html 11 | 12 | To benchmark whole script with `time` from command line run: 13 | 14 | PYTHONPATH=.. /usr/bin/time -v python benchmark_en_newsarticles.py [NUMBER OF WORKERS] 15 | """ 16 | 17 | import sys 18 | import logging 19 | 20 | from tmtoolkit.corpus import Corpus, doc_tokens, vocabulary, dtm, lemmatize, to_lowercase, filter_clean_tokens 21 | 22 | from examples._benchmarktools import add_timing, print_timings 23 | 24 | logging.basicConfig(level=logging.INFO) 25 | tmtoolkit_log = logging.getLogger('tmtoolkit') 26 | tmtoolkit_log.setLevel(logging.INFO) 27 | tmtoolkit_log.propagate = True 28 | 29 | if len(sys.argv) > 1: 30 | max_workers = int(sys.argv[1]) 31 | else: 32 | max_workers = 1 33 | 34 | print(f'max workers: {max_workers}') 35 | 36 | #%% 37 | 38 | add_timing('start') 39 | 40 | docs = Corpus.from_builtin_corpus('en-NewsArticles', language='en', max_workers=max_workers) 41 | print(str(docs)) 42 | 43 | #%% 44 | 45 | add_timing('load and tokenize') 46 | 47 | toks = doc_tokens(docs) 48 | add_timing('doc_tokens') 49 | 50 | toks_w_attrs = doc_tokens(docs, with_attr=True) 51 | add_timing('doc_tokens with attributes') 52 | 53 | vocab = vocabulary(docs) 54 | add_timing('vocabulary') 55 | 56 | lemmatize(docs) 57 | add_timing('lemmatize') 58 | 59 | to_lowercase(docs) 60 | add_timing('to_lowercase') 61 | 62 | filter_clean_tokens(docs) 63 | add_timing('filter_clean_tokens') 64 | 65 | dtm_ = dtm(docs) 66 | add_timing('sparse_dtm') 67 | 68 | print_timings() 69 | -------------------------------------------------------------------------------- /examples/bundestag18_tfidf.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | Example script that loads and processes the proceedings of the 18th German Bundestag and generates a tf-idf matrix. 3 | The data is quite large, consisting of 15,733 documents with 14,355,341 tokens in total. This script shows how to 4 | handle large data efficiently by using the parallel processing power of tmtoolkit and sparse matrix calculations 5 | that use few memory. 6 | 7 | Note that it is highly advisable to run this script section by section (denoted with "#%%" or even line by line in an 8 | interactive Python interpreter in order to see the effects of each code block. 9 | 10 | The data for the debates comes from offenesparlament.de, see https://github.com/Datenschule/offenesparlament-data. 11 | 12 | This examples requires that you have installed tmtoolkit with the recommended set of packages and have installed a 13 | German language model for spaCy: 14 | 15 | pip install -U "tmtoolkit[recommended]" 16 | python -m tmtoolkit setup de 17 | 18 | For more information, see the installation instructions: https://tmtoolkit.readthedocs.io/en/latest/install.html 19 | 20 | Markus Konrad 21 | June 2019 / Feb. 
2022 22 | """ 23 | 24 | import re 25 | import pickle 26 | import string 27 | import random 28 | from pprint import pprint 29 | from zipfile import ZipFile 30 | 31 | from tmtoolkit import corpus as c 32 | from tmtoolkit.corpus import visualize as cvis 33 | from tmtoolkit.tokenseq import unique_chars 34 | from tmtoolkit.bow.bow_stats import tfidf, sorted_terms_table 35 | from tmtoolkit.utils import enable_logging, pickle_data, unpickle_file 36 | import matplotlib.pyplot as plt 37 | import numpy as np 38 | import pandas as pd 39 | 40 | pd.set_option('display.width', 140) 41 | pd.set_option('display.max_columns', 100) 42 | 43 | #%% Optional: set up output log for tmtoolkit 44 | 45 | enable_logging() 46 | 47 | #%% Load the data inside the zip file 48 | 49 | print('loading data from zip file') 50 | 51 | with ZipFile('data/bt18_full.zip') as bt18zip: 52 | # there is a pickled pandas data frame inside the zipfile 53 | # extract it and load it 54 | bt18pickle = bt18zip.read('bt18_speeches_merged.pickle') 55 | bt18_data = pickle.loads(bt18pickle) 56 | 57 | # we don't need this anymore, remove it to free memory 58 | del bt18pickle, bt18zip 59 | 60 | #%% Generate document labels 61 | 62 | # format of the document labels: _ 63 | bt18_data['doc_label'] = ['%s_%s' % (str(sitzung).zfill(3), str(seq).zfill(5)) 64 | for sitzung, seq in zip(bt18_data.sitzung, bt18_data.sequence)] 65 | 66 | print('loaded data frame with %d rows:' % bt18_data.shape[0]) 67 | print(bt18_data.head()) 68 | 69 | bt18_texts = dict(zip(bt18_data.doc_label, bt18_data.text)) 70 | del bt18_data 71 | 72 | 73 | #%% Prepare raw text data preprocessing 74 | 75 | # remove some special characters 76 | 77 | corpus_chars = unique_chars(bt18_texts.values()) 78 | print('special characters in text data:') 79 | pprint(sorted(corpus_chars - set(string.printable))) 80 | 81 | keepchars = set('óéıàŁŽńôíśžê̆耺ćÖÇ₂ãüÀijܚ슟ēûçÉöáåúèäßëğîǧҫČœřïñ§°') 82 | delchars = corpus_chars - set(string.printable) - keepchars 83 | print(f'will 
# translation table that deletes every character contained in `delchars`
delchars_table = str.maketrans('', '', ''.join(delchars))


def del_special_chars(t):
    """Return document text `t` with all unwanted special characters removed.

    We will pass this function as "raw_preproc" function.
    """
    return t.translate(delchars_table)


# some contractions have a stray space in between, like "EU -Hilfen" where it should be "EU-Hilfen";
# group 2 of the regular expression (RE) captures that stray white space
pttrn_contraction_ws = re.compile(r'(\w+)(\s+)(-\w+)')


def correct_contractions(t):
    """Remove the stray white space in contractions like "EU -Hilfen" in document text `t`.

    We will pass this function as "raw_preproc" function.
    """
    # keep groups 1 and 3 of each match, i.e. drop the captured white space (group 2)
    return pttrn_contraction_ws.sub(r'\1\3', t)


# correct hyphenation issues in the documents like "groß-zügig":
# join two lowercase word parts separated by a hyphen
pttrn_hyphenation = re.compile(r'([a-zäöüß])-([a-zäöüß])')


def correct_hyphenation(t):
    """Join wrongly hyphenated lowercase word parts like "groß-zügig" in document text `t`.

    We will pass this function as "raw_preproc" function.
    """
    return pttrn_hyphenation.sub(r'\1\2', t)
the vocabulary of the whole corpus 130 | print('vocabulary:') 131 | pprint(c.vocabulary(corpus)) 132 | 133 | print(f'\nvocabulary contains {c.vocabulary_size(corpus)} tokens') 134 | 135 | #%% Display a keywords-in-context (KWIC) table 136 | 137 | print('keywords-in-context (KWIC) table for keyword "Merkel":') 138 | print(c.kwic_table(corpus, 'Merkel')) 139 | 140 | #%% Text normalization 141 | 142 | # lemmatization 143 | c.lemmatize(corpus) 144 | 145 | # convert all tokens to lowercase and apply several "cleaning" methods 146 | print('applying further token normalization') 147 | c.to_lowercase(corpus) 148 | c.filter_clean_tokens(corpus) 149 | c.remove_tokens(corpus, r'^-.+', match_type='regex') 150 | 151 | print('vocabulary:') 152 | pprint(c.vocabulary(corpus)) 153 | 154 | print(f'\nvocabulary contains {c.vocabulary_size(corpus)} tokens') 155 | 156 | # there are still some stray tokens which should be removed: 157 | c.remove_tokens(corpus, ['+40', '+', '.plädieren']) 158 | 159 | #%% Let's have a look at the most frequent tokens 160 | 161 | print('retrieving document frequencies for all tokens in the vocabulary') 162 | c.vocabulary_counts(corpus, proportions=1, as_table='-freq').head(50) 163 | 164 | # the rank - count plot shows quite a deviation from Zipf's law, because we already applied some token normalization 165 | fig, ax = plt.subplots() 166 | cvis.plot_ranked_vocab_counts(fig, ax, corpus, zipf=True) 167 | plt.show() 168 | 169 | #%% Further token cleanup 170 | 171 | # we can remove tokens above a certain threshold of (relative or absolute) document frequency 172 | c.remove_common_tokens(corpus, df_threshold=0.8) 173 | 174 | # since we'll later use tf-idf, removing very common or very uncommon tokens may not even be necessary; however 175 | # it reduces the computation time and memory consumption of all downstream tasks 176 | 177 | #%% Document lengths (number of tokens per document) 178 | 179 | fig, ax = plt.subplots() 180 | cvis.plot_doc_lengths_hist(fig, ax, 
corpus) 181 | plt.show() 182 | 183 | 184 | #%% Let's have a look at very short documents 185 | 186 | docsizes = c.doc_lengths(corpus, as_table='length') 187 | 188 | # document labels of documents with lesser or equal 30 tokens 189 | doc_labels_short = docsizes.doc[docsizes.length <= 30] 190 | doc_labels_short_texts = c.doc_texts(corpus, select=doc_labels_short, collapse=' ') 191 | 192 | print(f'{len(doc_labels_short)} documents with lesser or equal 30 tokens:') 193 | for lbl, txt in doc_labels_short_texts.items(): 194 | print(lbl) 195 | pprint(txt) 196 | print('---') 197 | 198 | 199 | #%% Remove very short documents 200 | 201 | print('removing documents with lesser or equal 30 tokens') 202 | c.remove_documents_by_label(corpus, doc_labels_short.to_list()) 203 | 204 | 205 | #%% Another keywords-in-context (KWIC) table 206 | 207 | print('keywords-in-context (KWIC) table for keyword "merkel" with normalized tokens:') 208 | print(c.kwic_table(corpus, 'merkel')) 209 | 210 | #%% Create a document-term-matrix (DTM) 211 | 212 | # this creates a sparse DTM where the matrix rows correspond to the current document labels and the 213 | # matrix columns correspond to the current vocabulary 214 | # the calculations take several minutes, even when they're performed in parallel 215 | 216 | print('creating document-term-matrix (DTM)') 217 | dtm = c.dtm(corpus) 218 | 219 | print('matrix created:') 220 | print(repr(dtm)) 221 | 222 | doc_labels = np.array(c.doc_labels(corpus)) 223 | vocab = np.array(c.vocabulary(corpus)) 224 | 225 | 226 | #%% Saving / loading a DTM 227 | 228 | # again, you may store the DTM along with the document labels and vocabulary to disk to later load it again: 229 | 230 | # pickle_data((dtm, doc_labels, vocab), 'data/bt18_dtm.pickle') 231 | # dtm, doc_labels, vocab = unpickle_file('data/bt18_dtm.pickle') 232 | 233 | 234 | #%% Computing a tf-idf matrix 235 | 236 | # we can apply tf-idf to the DTM 237 | # the result will remain a sparse matrix, hence it doesn't 
allocate much memory 238 | 239 | print('computing a tf-idf matrix from the DTM') 240 | tfidf_mat = tfidf(dtm) 241 | print('matrix created:') 242 | print(repr(tfidf_mat)) 243 | 244 | #%% Investigating the top tokens of the tf-idf transformed matrix 245 | 246 | # this will create a data frame of the 20 most "informative" (tf-idf-wise) tokens per document 247 | top_tokens = sorted_terms_table(tfidf_mat, vocab, doc_labels, top_n=20) 248 | 249 | random_doc = random.choice(doc_labels) 250 | print(f'20 most "informative" (tf-idf high ranked) tokens in randomly chosen document "{random_doc}":') 251 | 252 | print(top_tokens[top_tokens.index.get_level_values(0) == random_doc]) 253 | -------------------------------------------------------------------------------- /examples/data/ap.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WZBSocialScienceCenter/tmtoolkit/02990865ee896625d5cf540bf2b0dbc159bedf38/examples/data/ap.pickle -------------------------------------------------------------------------------- /examples/data/bt18_full.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WZBSocialScienceCenter/tmtoolkit/02990865ee896625d5cf540bf2b0dbc159bedf38/examples/data/bt18_full.zip -------------------------------------------------------------------------------- /examples/data/bt18_sample_1000.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WZBSocialScienceCenter/tmtoolkit/02990865ee896625d5cf540bf2b0dbc159bedf38/examples/data/bt18_sample_1000.pickle -------------------------------------------------------------------------------- /examples/data/gensim_evaluation_plot.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/WZBSocialScienceCenter/tmtoolkit/02990865ee896625d5cf540bf2b0dbc159bedf38/examples/data/gensim_evaluation_plot.png -------------------------------------------------------------------------------- /examples/data/nips.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WZBSocialScienceCenter/tmtoolkit/02990865ee896625d5cf540bf2b0dbc159bedf38/examples/data/nips.pickle -------------------------------------------------------------------------------- /examples/gensim_evaluation.py: -------------------------------------------------------------------------------- 1 | """ 2 | An example for topic modeling evaluation with gensim. 3 | 4 | Please note that this is just an example for showing how to perform topic model evaluation with Gensim. The 5 | preprocessing of the data is just done quickly and probably not the best way for the given data. 6 | 7 | This examples requires that you have installed tmtoolkit with the recommended set of packages plus Gensim and have 8 | installed a German language model for spaCy: 9 | 10 | pip install -U "tmtoolkit[recommended,gensim]" 11 | python -m tmtoolkit setup de 12 | 13 | For more information, see the installation instructions: https://tmtoolkit.readthedocs.io/en/latest/install.html 14 | 15 | """ 16 | 17 | 18 | import matplotlib.pyplot as plt 19 | import gensim 20 | import pandas as pd 21 | 22 | from tmtoolkit import corpus as c 23 | from tmtoolkit.topicmod import tm_gensim 24 | from tmtoolkit.utils import pickle_data, enable_logging 25 | from tmtoolkit.topicmod.evaluate import results_by_parameter 26 | from tmtoolkit.topicmod.visualize import plot_eval_results 27 | 28 | #%% 29 | 30 | enable_logging() 31 | 32 | #%% loading data 33 | 34 | print('loading data...') 35 | bt18 = pd.read_pickle('data/bt18_sample_1000.pickle') 36 | print('loaded %d documents' % len(bt18)) 37 | doc_labels = ['%s_%s' % info for info in zip(bt18.sitzung, 
bt18.sequence)] 38 | 39 | #%% 40 | 41 | print('loading and tokenizing documents') 42 | # minimal pipeline 43 | bt18corp = c.Corpus(dict(zip(doc_labels, bt18.text)), language='de', load_features=[], max_workers=1.0) 44 | del bt18 45 | c.print_summary(bt18corp) 46 | 47 | print('preprocessing data...') 48 | 49 | c.stem(bt18corp) 50 | c.filter_clean_tokens(bt18corp) 51 | 52 | c.print_summary(bt18corp) 53 | 54 | #%% 55 | 56 | print('creating gensim corpus...') 57 | 58 | texts = list(c.doc_tokens(bt18corp).values()) 59 | gnsm_dict = gensim.corpora.Dictionary.from_documents(texts) 60 | gnsm_corpus = [gnsm_dict.doc2bow(text) for text in texts] 61 | 62 | del bt18corp 63 | 64 | #%% 65 | 66 | # evaluate topic models with different parameters 67 | const_params = dict(update_every=0, passes=10) 68 | ks = list(range(10, 140, 10)) + list(range(140, 200, 20)) 69 | varying_params = [dict(num_topics=k, alpha=1.0 / k) for k in ks] 70 | 71 | print(f'evaluating {len(varying_params)} topic models') 72 | eval_results = tm_gensim.evaluate_topic_models((gnsm_dict, gnsm_corpus), varying_params, const_params, 73 | coherence_gensim_texts=texts) # necessary for coherence C_V metric 74 | 75 | # save the results as pickle 76 | print('saving results') 77 | pickle_data(eval_results, 'data/gensim_evaluation_results.pickle') 78 | 79 | # plot the results 80 | print('plotting evaluation results') 81 | plt.style.use('ggplot') 82 | results_by_n_topics = results_by_parameter(eval_results, 'num_topics') 83 | plot_eval_results(results_by_n_topics, xaxislabel='num. topics k', 84 | title='Evaluation results', figsize=(8, 6)) 85 | plt.savefig('data/gensim_evaluation_plot.png') 86 | plt.show() 87 | -------------------------------------------------------------------------------- /examples/minimal_tfidf.py: -------------------------------------------------------------------------------- 1 | """ 2 | A minimal example to showcase a few features of tmtoolkit. 3 | 4 | Markus Konrad 5 | Feb. 
2022 6 | """ 7 | 8 | from tmtoolkit.corpus import Corpus, tokens_table, lemmatize, to_lowercase, dtm 9 | from tmtoolkit.bow.bow_stats import tfidf, sorted_terms_table 10 | 11 | 12 | # load built-in sample dataset and use 4 worker processes 13 | corp = Corpus.from_builtin_corpus('en-News100', max_workers=4) 14 | 15 | # investigate corpus as dataframe 16 | toktbl = tokens_table(corp) 17 | print(toktbl) 18 | 19 | # apply some text normalization 20 | lemmatize(corp) 21 | to_lowercase(corp) 22 | 23 | # build sparse document-token matrix (DTM) 24 | # document labels identify rows, vocabulary tokens identify columns 25 | mat, doc_labels, vocab = dtm(corp, return_doc_labels=True, return_vocab=True) 26 | 27 | # apply tf-idf transformation to DTM 28 | # operation is applied on sparse matrix and uses few memory 29 | tfidf_mat = tfidf(mat) 30 | 31 | # show top 5 tokens per document ranked by tf-idf 32 | top_tokens = sorted_terms_table(tfidf_mat, vocab, doc_labels, top_n=5) 33 | print(top_tokens) 34 | -------------------------------------------------------------------------------- /examples/topicmod_ap_nips_eval.py: -------------------------------------------------------------------------------- 1 | """ 2 | Topic model evaluation for AP and NIPS datasets (http://archive.ics.uci.edu/ml/datasets/Bag+of+Words). 3 | 4 | This examples requires that you have installed tmtoolkit with the "lda" package. 5 | 6 | pip install -U "tmtoolkit[lda]" 7 | 8 | For more information, see the installation instructions: https://tmtoolkit.readthedocs.io/en/latest/install.html 9 | 10 | .. 
codeauthor:: Markus Konrad 11 | """ 12 | 13 | import os 14 | import sys 15 | 16 | import numpy as np 17 | import matplotlib.pyplot as plt 18 | 19 | from tmtoolkit.utils import unpickle_file, enable_logging 20 | from tmtoolkit.topicmod.tm_lda import evaluate_topic_models, AVAILABLE_METRICS 21 | from tmtoolkit.topicmod.evaluate import results_by_parameter 22 | from tmtoolkit.topicmod.visualize import plot_eval_results 23 | 24 | 25 | #%% 26 | 27 | if len(sys.argv) != 5: 28 | print('req. args: dataset, number of workers, eta, alpha numerator') 29 | exit(1) 30 | 31 | dataset = sys.argv[1] 32 | n_workers = int(sys.argv[2]) 33 | eta = float(sys.argv[3]) 34 | alpha_numerator = float(sys.argv[4]) 35 | 36 | print(f'dataset: {dataset}, workers: {n_workers}, beta: {eta}, alpha numerator: {alpha_numerator}') 37 | 38 | dataset_short = os.path.basename(dataset)[:-7] 39 | 40 | #%% 41 | 42 | enable_logging() 43 | 44 | #%% 45 | 46 | print('loading data...') 47 | 48 | doc_labels, vocab, dtm = unpickle_file(dataset) 49 | doc_labels = np.asarray(doc_labels) 50 | vocab = np.asarray(vocab) 51 | 52 | #%% 53 | 54 | print('running evaluations...') 55 | 56 | const_params = { 57 | 'n_iter': 1500, 58 | 'eta': eta, 59 | 'random_state': 20220105 # to make results reproducible 60 | } 61 | 62 | var_params = [{'n_topics': k, 'alpha': alpha_numerator/k} 63 | for k in list(range(20, 201, 2))] 64 | 65 | metrics = ['arun_2010', 'cao_juan_2009', 'coherence_mimno_2011'] 66 | 67 | if 'griffiths_2004' in AVAILABLE_METRICS: 68 | metrics.append('griffiths_2004') 69 | 70 | eval_results = evaluate_topic_models(dtm, 71 | varying_parameters=var_params, 72 | constant_parameters=const_params, 73 | return_models=False, 74 | metric=metrics, 75 | n_max_processes=n_workers) 76 | 77 | #%% 78 | 79 | print('plotting evaluations...') 80 | 81 | eval_by_topics = results_by_parameter(eval_results, 'n_topics') 82 | plot_eval_results(eval_by_topics, 83 | title=f'Evaluation results for 
"""
An example for topic modeling with LDA with focus on the new plotting functions in `tmtoolkit.corpus.visualize` and
in `tmtoolkit.topicmod.visualize`.

This examples requires that you have installed tmtoolkit with the recommended set of packages plus "lda" and have
installed an English language model for spaCy:

    pip install -U "tmtoolkit[recommended,lda]"
    python -m tmtoolkit setup en

For more information, see the installation instructions: https://tmtoolkit.readthedocs.io/en/latest/install.html

.. codeauthor:: Markus Konrad
"""

import os.path

import matplotlib.pyplot as plt

from tmtoolkit.utils import enable_logging, pickle_data, unpickle_file
from tmtoolkit.corpus import Corpus, lemmatize, to_lowercase, remove_punctuation, remove_common_tokens, \
    remove_uncommon_tokens, filter_clean_tokens, print_summary, remove_documents_by_length, dtm, \
    corpus_retokenize, save_corpus_to_picklefile, load_corpus_from_picklefile
from tmtoolkit.corpus.visualize import plot_doc_lengths_hist, plot_doc_frequencies_hist, plot_vocab_counts_hist, \
    plot_ranked_vocab_counts, plot_num_sents_hist, plot_sent_lengths_hist, plot_num_sents_vs_sent_length, \
    plot_token_lengths_hist
from tmtoolkit.topicmod.tm_lda import evaluate_topic_models  # we're using lda for topic modeling
from tmtoolkit.topicmod.evaluate import results_by_parameter
from tmtoolkit.topicmod.model_io import print_ldamodel_topic_words
from tmtoolkit.topicmod.visualize import plot_eval_results, plot_topic_word_ranked_prob, plot_doc_topic_ranked_prob

#%%

enable_logging()

#%% loading the sample corpus (English news articles)

corp_picklefile = 'data/topicmod_lda_corpus.pickle'

if os.path.exists(corp_picklefile):
    docs = load_corpus_from_picklefile(corp_picklefile)
else:
    docs = Corpus.from_builtin_corpus('en-NewsArticles', max_workers=1.0)
    save_corpus_to_picklefile(docs, corp_picklefile)

print_summary(docs)


#%% plot some corpus summary statistics

# you can copy those and also do the plotting also after corpus transformations in the next cell
# this shows you nicely how the transformations change the distribution of words in the corpus

# (plot function, keyword arguments) pairs – each one gets its own figure
summary_plots = [
    (plot_doc_lengths_hist, {}),
    (plot_vocab_counts_hist, {}),
    (plot_ranked_vocab_counts, {'zipf': True}),
    (plot_doc_frequencies_hist, {}),
    (plot_num_sents_hist, {}),
    (plot_sent_lengths_hist, {}),
    (plot_num_sents_vs_sent_length, {}),
    (plot_token_lengths_hist, {}),
]

for plot_fn, plot_kwargs in summary_plots:
    fig, ax = plt.subplots()
    plot_fn(fig, ax, docs, **plot_kwargs)
    plt.show()

#%% apply preprocessing pipeline

corp_preproc_picklefile = 'data/topicmod_lda_corpus_preprocessed.pickle'

if os.path.exists(corp_preproc_picklefile):
    docs = load_corpus_from_picklefile(corp_preproc_picklefile)
else:
    remove_punctuation(docs)
    corpus_retokenize(docs)
    lemmatize(docs)
    to_lowercase(docs)
    filter_clean_tokens(docs, remove_numbers=True)
    remove_common_tokens(docs, df_threshold=0.90)
    remove_uncommon_tokens(docs, df_threshold=0.05)
    remove_documents_by_length(docs, '<', 30)

    save_corpus_to_picklefile(docs, corp_preproc_picklefile)

print_summary(docs)

#%% generating the document-term matrix

dtm_picklefile = 'data/topicmod_lda_dtm.pickle'

if os.path.exists(dtm_picklefile):
    bow_mat, doc_labels, vocab = unpickle_file(dtm_picklefile)
else:
    bow_mat, doc_labels, vocab = dtm(docs, return_doc_labels=True, return_vocab=True)
    pickle_data((bow_mat, doc_labels, vocab), dtm_picklefile)


#%% running the evaluation

eval_res_picklefile = 'data/topicmod_lda_eval_res.pickle'

# BUG FIX: this condition formerly checked `dtm_picklefile`, so whenever the DTM cache existed but the
# evaluation results cache did not, `unpickle_file` was called on a missing file and the script crashed
if os.path.exists(eval_res_picklefile):
    eval_results = unpickle_file(eval_res_picklefile)
else:
    const_params = {
        'n_iter': 1500,
        'eta': 0.3,
        'random_state': 20220105  # to make results reproducible
    }

    var_params = [{'n_topics': k, 'alpha': 10.0/k}
                  for k in list(range(20, 101, 20)) + [125, 150, 175, 200, 250, 300]]

    metrics = ['cao_juan_2009', 'arun_2010', 'coherence_mimno_2011', 'griffiths_2004']

    eval_results = evaluate_topic_models(bow_mat,
                                         varying_parameters=var_params,
                                         constant_parameters=const_params,
                                         return_models=True,
                                         metric=metrics)

    pickle_data(eval_results, eval_res_picklefile)

#%% plotting evaluation results

eval_by_topics = results_by_parameter(eval_results, 'n_topics')
plot_eval_results(eval_by_topics)

plt.show()

#%% selecting the model and printing the topics' most likely words

selected_model = dict(eval_by_topics)[200]['model']

print_ldamodel_topic_words(selected_model.topic_word_, vocab=vocab)

#%% investigating, how many "top words" sufficiently describe a topic

fig, ax = plt.subplots()
plot_topic_word_ranked_prob(fig, ax, selected_model.topic_word_, n=40, log_scale=False,
                            highlight=[4, 12, 32], alpha=0.025)

plt.show()

# -> about 5 to 10 words aggregate most of the probability per topic

#%% investigating, how many "top topics" sufficiently describe a document

fig, ax = plt.subplots()
plot_doc_topic_ranked_prob(fig, ax, selected_model.doc_topic_, n=40, log_scale=False, highlight=list(range(4)),
                           alpha=0.003)

plt.show()

# -> about 10 to 15 topics aggregate most of the probability per document
Corp_Bundestag_V2.rds 5 | Corp_Congreso_V2.rds 6 | Corp_HouseOfCommons_V2.rds 7 | Corp_TweedeKamer_V2.rds 8 | ``` 9 | 10 | Due to their size, they are not part of the repository. You can download the respective datasets from https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/L4OAKN. 11 | 12 | Furthermore, this folder contains the NIPS dataset (`vocab.nips.txt` and `docword.nips.txt`) which can be obtained from http://archive.ics.uci.edu/ml/datasets/Bag+of+Words. 13 | -------------------------------------------------------------------------------- /scripts/nips_data.py: -------------------------------------------------------------------------------- 1 | """ 2 | Convert NIPS data from http://archive.ics.uci.edu/ml/datasets/Bag+of+Words to sparse DTM format stored as pickle file. 3 | 4 | Markus Konrad 5 | """ 6 | 7 | import numpy as np 8 | from scipy.sparse import coo_matrix 9 | from tmtoolkit.utils import pickle_data 10 | 11 | 12 | #%% 13 | 14 | with open('fulldata/vocab.nips.txt') as f: 15 | vocab = np.array([l.strip() for l in f.readlines() if l.strip()]) 16 | 17 | #%% 18 | 19 | n_docs = None 20 | n_vocab = None 21 | n_nonzero = None 22 | entries = [] 23 | row_ind = [] 24 | col_ind = [] 25 | 26 | with open('fulldata/docword.nips.txt') as f: 27 | for i, l in enumerate(f): 28 | l = l.strip() 29 | 30 | if i < 3: 31 | n = int(l) 32 | if i == 0: 33 | n_docs = n 34 | elif i == 1: 35 | n_vocab = n 36 | elif i == 2: 37 | n_nonzero = n 38 | else: 39 | j, k, n = list(map(int, l.split())) 40 | entries.append(n) 41 | row_ind.append(j-1) # convert to zero-based index 42 | col_ind.append(k-1) # convert to zero-based index 43 | 44 | 45 | assert len(vocab) == n_vocab 46 | assert len(entries) == len(row_ind) == len(col_ind) == n_nonzero 47 | 48 | dtm = coo_matrix((entries, (row_ind, col_ind)), shape=(n_docs, n_vocab), dtype='int64') 49 | 50 | doc_labels = np.fromiter((f'doc{str(i+1).zfill(4)}' for i in range(n_docs)), dtype=' 5 | """ 6 | 7 | import os 8 | from 
codecs import open 9 | 10 | from setuptools import setup, find_packages 11 | 12 | __title__ = 'tmtoolkit' 13 | __version__ = '0.11.2' 14 | __author__ = 'Markus Konrad' 15 | __license__ = 'Apache License 2.0' 16 | 17 | 18 | GITHUB_URL = 'https://github.com/WZBSocialScienceCenter/tmtoolkit' 19 | 20 | DEPS_BASE = ['numpy>=1.22.0', 'scipy>=1.7.0', 'globre>=0.1.5', 21 | 'pandas>=1.4.0', 'xlrd>=2.0.0', 'openpyxl>=3.0.0', 22 | 'matplotlib>=3.5.0'] 23 | 24 | DEPS_EXTRA = { 25 | 'textproc': ['spacy>=3.2.0', 'bidict>=0.21.0', 'loky>=3.0.0'], 26 | 'textproc_extra': ['PyICU>=2.8', 'nltk>=3.6.0'], 27 | 'wordclouds': ['wordcloud>=1.8.0,<1.9', 'Pillow>=9.0.0'], 28 | 'lda': ['lda>=2.0'], 29 | 'sklearn': ['scikit-learn>=1.0.0'], 30 | 'gensim': ['gensim>=4.1.0'], 31 | 'topic_modeling_eval_extra': ['gmpy2>=2.1.0'], 32 | 'test': ['pytest>=7.0.0', 'hypothesis>=6.36.0'], 33 | 'doc': ['Sphinx>=4.4.0', 'sphinx-rtd-theme>=1.0.0', 'nbsphinx>=0.8.0'], 34 | 'dev': ['coverage>=6.3', 'coverage-badge>=1.1.0', 'pytest-cov>=3.0.0', 'twine>=3.8.0', 35 | 'ipython>=8.0.0', 'jupyter>=1.0.0', 'notebook>=6.4.0', 'tox>=3.24.0', 'setuptools>=60.7.0'], 36 | } 37 | 38 | # DEPS_EXTRA['minimal'] = DEPS_BASE # doesn't work with extras_require and pip currently 39 | # see https://github.com/pypa/setuptools/issues/1139 40 | 41 | DEPS_EXTRA['recommended'] = DEPS_EXTRA['textproc'] + DEPS_EXTRA['wordclouds'] 42 | DEPS_EXTRA['all'] = [] 43 | for k, deps in DEPS_EXTRA.items(): 44 | if k not in {'recommended', 'all'}: 45 | DEPS_EXTRA['all'].extend(deps) 46 | 47 | here = os.path.abspath(os.path.dirname(__file__)) 48 | 49 | # Get the long description from the README file 50 | with open(os.path.join(here, 'README.rst'), encoding='utf-8') as f: 51 | long_description = f.read() 52 | 53 | setup( 54 | name=__title__, 55 | version=__version__, 56 | description='Text Mining and Topic Modeling Toolkit', 57 | long_description=long_description, 58 | long_description_content_type='text/x-rst', 59 | url=GITHUB_URL, 60 | 
import string

from hypothesis import strategies as st
from hypothesis.extra.numpy import arrays, array_shapes


def strategy_2d_array(dtype, minval=0, maxval=None, **kwargs):
    """
    Strategy generating 2D arrays of `dtype` with element values between `minval` and `maxval`.

    The optional keyword arguments `min_side` and `max_side` constrain the admissible array shape; all remaining
    keyword arguments are forwarded to the elements strategy.
    """
    # pull the shape constraints out of kwargs so that only elements-strategy options remain;
    # dict.pop with a default replaces the former `if key in kwargs:` boilerplate
    min_side = kwargs.pop('min_side', 1)
    max_side = kwargs.pop('max_side', None)

    if dtype is int:
        elems = st.integers(minval, maxval, **kwargs)
    elif dtype is float:
        elems = st.floats(minval, maxval, **kwargs)
    elif dtype is str:
        elems = st.text(min_size=minval, max_size=maxval, **kwargs)
    else:
        raise ValueError('no elements strategy for dtype', dtype)

    return arrays(dtype, array_shapes(min_dims=2, max_dims=2, min_side=min_side, max_side=max_side), elements=elems)


def strategy_dtm():
    """Strategy generating a document-term matrix with token counts in [0, 10000]."""
    return strategy_2d_array(int, 0, 10000)


def strategy_dtm_small():
    """Strategy generating a small document-term matrix (2 to 6 rows/columns, counts in [0, 10])."""
    return strategy_2d_array(int, 0, 10, min_side=2, max_side=6)


def strategy_2d_prob_distribution():
    """Strategy generating a 2D float array with finite values in [0, 1]."""
    return strategy_2d_array(float, 0, 1, allow_nan=False, allow_infinity=False)


def strategy_tokens(*args, **kwargs):
    """Strategy generating a list of token strings; all arguments are passed to `st.text`."""
    return st.lists(st.text(*args, **kwargs))


def strategy_lists_of_tokens(*args, **kwargs):
    """Strategy generating several documents, each one a list of token strings."""
    return st.lists(st.lists(st.text(*args, **kwargs)))


def strategy_texts(*args, **kwargs):
    """Strategy generating a list of text strings; all arguments are passed to `st.text`."""
    return st.lists(st.text(*args, **kwargs))


def strategy_texts_printable():
    """Strategy generating a list of text strings built from printable characters."""
    return strategy_texts(string.printable)


def strategy_str_str_dict(keys_args, keys_kwargs, values_args, values_kwargs):
    """Strategy generating a string -> string dict; the arguments parametrize the key/value `st.text` strategies."""
    return st.dictionaries(st.text(*keys_args, **keys_kwargs), st.text(*values_args, **values_kwargs))


def strategy_str_str_dict_printable():
    """Strategy generating a string -> string dict built from printable characters."""
    return st.dictionaries(st.text(string.printable), st.text(string.printable))
https://raw.githubusercontent.com/WZBSocialScienceCenter/tmtoolkit/02990865ee896625d5cf540bf2b0dbc159bedf38/tests/data/100NewsArticles.xlsx -------------------------------------------------------------------------------- /tests/data/3ExampleDocs.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WZBSocialScienceCenter/tmtoolkit/02990865ee896625d5cf540bf2b0dbc159bedf38/tests/data/3ExampleDocs.xlsx -------------------------------------------------------------------------------- /tests/data/tiny_model_reuters_5_topics.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WZBSocialScienceCenter/tmtoolkit/02990865ee896625d5cf540bf2b0dbc159bedf38/tests/data/tiny_model_reuters_5_topics.pickle -------------------------------------------------------------------------------- /tests/data/zipdata.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WZBSocialScienceCenter/tmtoolkit/02990865ee896625d5cf540bf2b0dbc159bedf38/tests/data/zipdata.zip -------------------------------------------------------------------------------- /tests/test_corpusimport.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for importing optional tmtoolkit.corpus module. 3 | 4 | .. 
@given(
    dtm=strategy_dtm(),
    matrix_type=st.integers(min_value=0, max_value=1),
    n_folds=st.integers(min_value=0, max_value=20)
)
def test_split_dtm_for_cross_validation(dtm, matrix_type, n_folds):
    """Check fold count, shapes and sparsity of the splits produced by `split_dtm_for_cross_validation`."""
    if matrix_type == 1:
        dtm = coo_matrix(dtm)   # also exercise the sparse-matrix code path

    n_docs, n_vocab = dtm.shape

    # guard clause: an invalid number of folds must be rejected
    if not 2 <= n_folds <= n_docs:
        with pytest.raises(ValueError):
            next(split_dtm_for_cross_validation(dtm, n_folds))
        return

    folds_seen = 0
    for fold, train_dtm, test_dtm in split_dtm_for_cross_validation(dtm, n_folds):
        assert 0 <= fold < n_folds

        if matrix_type == 1:
            # sparse input must yield sparse train/test splits
            assert issparse(train_dtm)
            assert issparse(test_dtm)

        assert train_dtm.ndim == test_dtm.ndim == 2

        # each test split holds at most 1/n_folds of the documents and the two splits partition the corpus
        assert train_dtm.shape[0] >= test_dtm.shape[0]
        assert 0 < test_dtm.shape[0] <= n_docs // n_folds
        assert train_dtm.shape[0] + test_dtm.shape[0] == n_docs
        assert train_dtm.shape[1] == test_dtm.shape[1] == n_vocab

        folds_seen += 1

    assert folds_seen == n_folds
@given(
    topic_word=strategy_2d_prob_distribution(),
    top_n=st.integers(min_value=0, max_value=20)
)
def test_ldamodel_top_word_topics(topic_word, top_n):
    """Check shape, index and columns of the top-topics-per-word data frame."""
    topic_word = np.array(topic_word)

    vocab = np.array(['t%d' % i for i in range(topic_word.shape[1])])

    if top_n < 1 or top_n > topic_word.shape[0]:
        with pytest.raises(ValueError):
            model_io.ldamodel_top_word_topics(topic_word, vocab, top_n)
    else:
        top_word_topics = model_io.ldamodel_top_word_topics(topic_word, vocab, top_n)
        colnames = np.array([model_io.DEFAULT_RANK_NAME_FMT.format(i1=i + 1) for i in range(top_n)])

        assert top_word_topics.shape == (topic_word.shape[1], top_n) == (len(vocab), top_n)
        assert np.array_equal(top_word_topics.index.values, vocab)
        assert np.array_equal(top_word_topics.columns.values, colnames)


@given(
    doc_topic=strategy_2d_prob_distribution(),
    top_n=st.integers(min_value=0, max_value=20)
)
def test_ldamodel_top_doc_topics(doc_topic, top_n):
    """Check shape, index and columns of the top-topics-per-document data frame."""
    doc_topic = np.array(doc_topic)

    doc_labels = np.array(['doc%d' % i for i in range(doc_topic.shape[0])])

    if top_n < 1 or top_n > doc_topic.shape[1]:
        with pytest.raises(ValueError):
            # BUG FIX: this branch formerly called `ldamodel_top_topic_words` (copy-paste error),
            # so the error path never exercised the function under test
            model_io.ldamodel_top_doc_topics(doc_topic, doc_labels, top_n)
    else:
        top_doc_topics = model_io.ldamodel_top_doc_topics(doc_topic, doc_labels, top_n)
        colnames = np.array([model_io.DEFAULT_RANK_NAME_FMT.format(i1=i + 1) for i in range(top_n)])

        assert top_doc_topics.shape == (doc_topic.shape[0], top_n)
        assert np.array_equal(top_doc_topics.index.values, doc_labels)
        assert np.array_equal(top_doc_topics.columns.values, colnames)


@given(
    doc_topic=strategy_2d_prob_distribution(),
    top_n=st.integers(min_value=0, max_value=20)
)
def test_ldamodel_top_topic_docs(doc_topic, top_n):
    """Check shape, index and columns of the top-documents-per-topic data frame."""
    doc_topic = np.array(doc_topic)

    doc_labels = np.array(['doc%d' % i for i in range(doc_topic.shape[0])])

    if top_n < 1 or top_n > doc_topic.shape[0]:
        with pytest.raises(ValueError):
            model_io.ldamodel_top_topic_docs(doc_topic, doc_labels, top_n)
    else:
        top_topic_docs = model_io.ldamodel_top_topic_docs(doc_topic, doc_labels, top_n)
        colnames = np.array([model_io.DEFAULT_RANK_NAME_FMT.format(i1=i + 1) for i in range(top_n)])
        rownames = np.array([model_io.DEFAULT_TOPIC_NAME_FMT.format(i1=i + 1) for i in range(doc_topic.shape[1])])

        assert top_topic_docs.shape == (doc_topic.shape[1], top_n)
        assert np.array_equal(top_topic_docs.index.values, rownames)
        assert np.array_equal(top_topic_docs.columns.values, colnames)


@given(topic_word=strategy_2d_prob_distribution())
def test_ldamodel_full_topic_words(topic_word):
    """Check that the full topic-word data frame carries a topic label column plus one column per vocabulary token."""
    topic_word = np.array(topic_word)

    vocab = np.array(['t%d' % i for i in range(topic_word.shape[1])])

    df = model_io.ldamodel_full_topic_words(topic_word, vocab)
    assert isinstance(df, pd.DataFrame)

    rownames = np.array([model_io.DEFAULT_TOPIC_NAME_FMT.format(i1=i + 1) for i in range(topic_word.shape[0])])
    assert df.columns.tolist() == ['_topic'] + list(vocab)

    assert np.array_equal(df.iloc[:, 0].to_numpy(), rownames)
doc_labels = np.array(['doc%d' % i for i in range(doc_topic.shape[0])]) 147 | 148 | df = model_io.ldamodel_full_doc_topics(doc_topic, doc_labels) 149 | assert isinstance(df, pd.DataFrame) 150 | 151 | colnames = np.array([model_io.DEFAULT_TOPIC_NAME_FMT.format(i1=i + 1) for i in range(doc_topic.shape[1])]) 152 | assert df.columns.tolist() == ['_doc'] + list(colnames) 153 | 154 | assert np.array_equal(df.iloc[:, 0].to_numpy(), doc_labels) 155 | 156 | 157 | @given(n_docs=st.integers(min_value=0, max_value=10), 158 | n_topics=st.integers(min_value=0, max_value=10), 159 | size_vocab=st.integers(min_value=0, max_value=50), 160 | top_n_topics=st.integers(min_value=0, max_value=10), 161 | top_n_words=st.integers(min_value=0, max_value=50), 162 | create_dtm=st.booleans()) 163 | def test_save_ldamodel_summary_to_excel(n_docs, n_topics, size_vocab, top_n_topics, top_n_words, create_dtm): 164 | try: 165 | import openpyxl 166 | except ImportError: 167 | pytest.skip('openpyxl not installed') 168 | 169 | topic_word = np.random.uniform(size=n_topics * size_vocab).reshape((n_topics, size_vocab)) 170 | doc_topic = np.random.uniform(size=n_docs * n_topics).reshape((n_docs, n_topics)) 171 | doc_labels = np.array(['doc%d' % i for i in range(doc_topic.shape[0])]) 172 | vocab = np.array(['t%d' % i for i in range(topic_word.shape[1])]) 173 | _, excelfile = tempfile.mkstemp(suffix='.xlsx') 174 | 175 | if create_dtm: 176 | dtm = np.random.randint(0, 10, size=n_docs*size_vocab).reshape(n_docs, size_vocab) 177 | else: 178 | dtm = None 179 | 180 | if top_n_words < 1 or top_n_words > topic_word.shape[1] or top_n_topics < 1 or top_n_topics > topic_word.shape[0]\ 181 | or n_docs < 1: 182 | with pytest.raises(ValueError): 183 | model_io.save_ldamodel_summary_to_excel(excelfile, topic_word, doc_topic, doc_labels, vocab, 184 | top_n_topics=top_n_topics, top_n_words=top_n_words) 185 | else: 186 | excelsheets = model_io.save_ldamodel_summary_to_excel(excelfile, topic_word, doc_topic, doc_labels, 
vocab, 187 | top_n_topics=top_n_topics, top_n_words=top_n_words, 188 | dtm=dtm) 189 | assert isinstance(excelsheets, OrderedDict) 190 | 191 | sheetnames = ['top_doc_topics_vals', 'top_doc_topics_labels', 'top_doc_topics_labelled_vals', 192 | 'top_topic_word_vals', 'top_topic_word_labels', 'top_topic_words_labelled_vals'] 193 | 194 | if dtm is not None: 195 | sheetnames.append('marginal_topic_distrib') 196 | 197 | assert list(excelsheets.keys()) == sheetnames 198 | 199 | for sheetn in sheetnames: 200 | assert isinstance(excelsheets[sheetn], pd.DataFrame) 201 | 202 | -------------------------------------------------------------------------------- /tests/test_topicmod_visualize.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | from hypothesis import given, strategies as st, settings 5 | 6 | import numpy as np 7 | import matplotlib.pyplot as plt 8 | 9 | from ._testtools import strategy_2d_prob_distribution 10 | 11 | from tmtoolkit.utils import empty_chararray 12 | from tmtoolkit.topicmod import model_io, visualize 13 | 14 | 15 | def test_generate_wordclouds_for_topic_words(): 16 | try: 17 | import lda 18 | import PIL 19 | from wordcloud import WordCloud 20 | except ImportError: 21 | pytest.skip('at least one of lda, Pillow, wordcloud not installed') 22 | 23 | data = model_io.load_ldamodel_from_pickle(os.path.join('tests', 'data', 'tiny_model_reuters_5_topics.pickle')) 24 | model = data['model'] 25 | vocab = data['vocab'] 26 | 27 | phi = model.topic_word_ 28 | assert phi.shape == (5, len(vocab)) 29 | 30 | topic_word_clouds = visualize.generate_wordclouds_for_topic_words(phi, vocab, 10) 31 | assert len(topic_word_clouds) == 5 32 | assert set(topic_word_clouds.keys()) == set('topic_%d' % i for i in range(1, 6)) 33 | assert all(isinstance(wc, PIL.Image.Image) for wc in topic_word_clouds.values()) 34 | 35 | topic_word_clouds = visualize.generate_wordclouds_for_topic_words(phi, vocab, 10, 36 | 
def test_generate_wordclouds_for_document_topics():
    """One word cloud per document from the top topics, with optional document filtering and canvas sizing."""
    try:
        import lda
        import PIL
        from wordcloud import WordCloud
    except ImportError:
        pytest.skip('at least one of lda, Pillow, wordcloud not installed')

    data = model_io.load_ldamodel_from_pickle(os.path.join('tests', 'data', 'tiny_model_reuters_5_topics.pickle'))
    model = data['model']
    doc_labels = data['doc_labels']

    theta = model.doc_topic_
    assert theta.shape == (len(doc_labels), 5)

    # default call: one PIL image per document, keyed by document label
    clouds = visualize.generate_wordclouds_for_document_topics(theta, doc_labels, 3)
    assert len(clouds) == len(doc_labels)
    assert set(clouds.keys()) == set(doc_labels)
    assert all(isinstance(img, PIL.Image.Image) for img in clouds.values())

    # restrict to two documents and request raw WordCloud objects with a custom canvas size
    selected_docs = doc_labels[:2]
    assert len(selected_docs) == 2
    clouds = visualize.generate_wordclouds_for_document_topics(theta, doc_labels, 3,
                                                               which_documents=selected_docs,
                                                               return_images=False,
                                                               width=640, height=480)
    assert set(clouds.keys()) == set(selected_docs)
    assert all(isinstance(wc, WordCloud) for wc in clouds.values())
    assert all(wc.width == 640 and wc.height == 480 for wc in clouds.values())
@given(topic_word=strategy_2d_prob_distribution())
def test_plot_topic_word_heatmap(topic_word):
    """Plotting a topic-word heatmap must succeed for any non-empty 2D matrix and raise ValueError otherwise."""
    topic_word = np.array(topic_word)

    # a vocabulary label per column when the input is a proper matrix, otherwise an empty char. array
    if topic_word.ndim == 2:
        vocab = np.array(['t%d' % i for i in range(topic_word.shape[1])])
    else:
        vocab = empty_chararray()

    fig, ax = plt.subplots(figsize=(8, 6))

    input_ok = topic_word.ndim == 2 and 0 not in set(topic_word.shape)
    if input_ok:
        visualize.plot_topic_word_heatmap(fig, ax, topic_word, vocab)
    else:
        with pytest.raises(ValueError):
            visualize.plot_topic_word_heatmap(fig, ax, topic_word, vocab)

    plt.close(fig)
@pytest.mark.parametrize('level, fmt', [
    (logging.DEBUG, '%(levelname)s:%(name)s:%(message)s'),
    (logging.INFO, '%(levelname)s:%(name)s:%(message)s'),
    (logging.WARNING, '%(levelname)s:%(name)s:%(message)s'),
    (logging.INFO, ''),   # empty fmt -> enable_logging is called without a format, exercising its default format
])
def test_enable_disable_logging(caplog, level, fmt):
    """
    Check that `enable_logging`, `set_logging_level` and `disable_logging` control what the "tmtoolkit" logger
    emits: nothing before enabling, messages at or above `level` after enabling, everything at DEBUG after
    lowering the level, and nothing again after disabling.
    """
    tmtk_logger = logging.getLogger('tmtoolkit')
    tmtk_logger.setLevel(logging.WARNING)  # reset to default level

    # before enable_logging, sub-WARNING messages must not reach caplog
    tmtk_logger.debug('test line debug 1')
    tmtk_logger.info('test line info 1')
    assert caplog.text == ''

    # pytest caplog fixture uses an extra logging handler (which is already added to the logger)
    if fmt == '':
        enable_logging(level, logging_handler=caplog.handler, add_logging_handler=False)
    else:
        enable_logging(level, fmt, logging_handler=caplog.handler, add_logging_handler=False)

    # a DEBUG message appears only when the enabled level is DEBUG
    tmtk_logger.debug('test line debug 2')
    if level == logging.DEBUG:
        assert caplog.text.endswith('DEBUG:tmtoolkit:test line debug 2\n')
        if fmt == '':
            # the default format starts with an ISO date prefix
            assert caplog.text.startswith(date.today().isoformat())
    else:
        assert caplog.text == ''

    caplog.clear()

    # an INFO message appears for DEBUG and INFO levels
    tmtk_logger.info('test line info 2')
    if level <= logging.INFO:
        assert caplog.text.endswith('INFO:tmtoolkit:test line info 2\n')
        if fmt == '':
            assert caplog.text.startswith(date.today().isoformat())
    else:
        assert caplog.text == ''

    if level > logging.DEBUG:  # reduce logging level to DEBUG
        caplog.clear()
        set_logging_level(logging.DEBUG)
        # after lowering the level, DEBUG messages must come through
        tmtk_logger.debug('test line debug 3')
        assert caplog.text.endswith('DEBUG:tmtoolkit:test line debug 3\n')
        if fmt == '':
            assert caplog.text.startswith(date.today().isoformat())

    caplog.clear()
    disable_logging()

    # after disabling, nothing below WARNING is recorded anymore
    tmtk_logger.debug('test line debug 4')
    tmtk_logger.info('test line info 4')

    assert caplog.text == ''
@given(x=st.lists(st.integers()),
       as_numpy_array=st.booleans())
def test_as_chararray(x, as_numpy_array):
    """`as_chararray` must return a 1D numpy string array that mirrors the input element-wise."""
    original_values = x
    arg = np.array(x) if as_numpy_array else x

    res = as_chararray(arg)

    assert isinstance(res, np.ndarray)
    assert res.ndim == 1
    assert len(res) == len(arg)
    assert np.issubdtype(res.dtype, 'str')
    # conversion to string must preserve order and values
    assert res.tolist() == [str(v) for v in original_values]
@pytest.mark.parametrize('expected, funcs, initial_arg', [
    (None, [], 1),
    (1, [lambda x: x], 1),
    (1, [lambda x: -x, lambda x: -x], 1),
    (2.0, [lambda x: x**2, math.sqrt], 2),
    (8.0, [lambda x: x**2, math.sqrt, lambda x: x**3], 2),
])
def test_applychain(expected, funcs, initial_arg):
    """`applychain` pipes a value through a sequence of functions; an empty sequence raises ValueError."""
    if expected is None:
        # no functions given -> error expected
        with pytest.raises(ValueError):
            applychain(funcs, initial_arg)
        return

    res = applychain(funcs, initial_arg)
    # float results are compared approximately, everything else exactly
    if isinstance(expected, float):
        assert math.isclose(res, expected)
    else:
        assert res == expected
@given(dicts=st.lists(st.dictionaries(st.text(), st.integers())),
       sort_keys=st.booleans(),
       safe=st.booleans())
def test_merge_dicts(dicts, sort_keys, safe):
    """Merging dicts combines all items; in safe mode, overlapping keys must raise a ValueError."""
    seen_keys = set()
    overlap = False
    for d in dicts:
        current = set(d.keys())
        if not overlap and current & seen_keys:
            overlap = True
        seen_keys |= current

    if safe and overlap and len(dicts) > 1:
        with pytest.raises(ValueError, match=r'^merging these containers would overwrite already existing contents'):
            merge_dicts(dicts, sort_keys=sort_keys, safe=safe)
    else:
        res = merge_dicts(dicts, sort_keys=sort_keys, safe=safe)
        assert isinstance(res, dict)

        total_items = sum(len(d) for d in dicts)
        if overlap:
            assert len(res) <= total_items
        else:
            assert len(res) == total_items

        # NOTE(review): with safe=False and the same key mapped to *different* values in two dicts,
        # this loop asserts both values — presumably merge_dicts keeps one of them; verify intent.
        for d in dicts:
            for k, v in d.items():
                assert res[k] == v

        assert set(res.keys()) == seen_keys
        if sort_keys:
            assert list(res.keys()) == sorted(seen_keys)
def test_combine_sparse_matrices_columnwise():
    """
    Check `combine_sparse_matrices_columnwise`: stacking several sparse matrices whose columns are identified
    by labels. The result must align the columns by sorted label, fill missing columns with zeros, optionally
    sort the rows by the given row labels, and reject inconsistent inputs with ValueError.
    """
    # 2x3 matrix with column labels C, A, D
    m1 = coo_matrix(np.array([
        [1, 0, 3],
        [0, 2, 0],
    ]))

    cols1 = list('CAD')
    rows1 = [4, 0]  # row labels. can be integers!

    # 3x4 matrix with column labels D, B, C, A
    m2 = coo_matrix(np.array([
        [0, 0, 1, 2],
        [3, 4, 5, 6],
        [2, 1, 0, 0],
    ]))

    cols2 = list('DBCA')
    rows2 = [3, 1, 2]

    # 1x2 matrix with column labels B, D
    m3 = coo_matrix(np.array([
        [9, 8],
    ]))

    cols3 = list('BD')

    # 2x1 matrix with the single column label A
    m4 = coo_matrix(np.array([
        [9],
        [8]
    ]))

    cols4 = list('A')

    # empty matrix without columns -- must be accepted and contribute nothing
    m5 = coo_matrix((0, 0), dtype=int)

    cols5 = []

    # m1 stacked on m2, columns re-ordered to sorted labels A, B, C, D
    expected_1_2 = np.array([
        [0, 0, 1, 3],
        [2, 0, 0, 0],
        [2, 0, 1, 0],
        [6, 4, 5, 3],
        [0, 1, 0, 2],
    ])

    # all five matrices stacked; trailing comments give the source matrix of each extra row
    expected_1_5 = np.array([
        [0, 0, 1, 3],
        [2, 0, 0, 0],
        [2, 0, 1, 0],
        [6, 4, 5, 3],
        [0, 1, 0, 2],
        [0, 9, 0, 8],  # 3
        [9, 0, 0, 0],  # 4
        [8, 0, 0, 0],  # 4
    ])

    # m1 + m2 with rows additionally sorted by their row labels 0..4
    expected_1_2_rows_sorted = np.array([
        [2, 0, 0, 0],
        [6, 4, 5, 3],
        [0, 1, 0, 2],
        [2, 0, 1, 0],
        [0, 0, 1, 3],
    ])

    # error cases: no matrices at all
    with pytest.raises(ValueError):
        combine_sparse_matrices_columnwise([], [])

    # number of column label lists does not match number of matrices
    with pytest.raises(ValueError):
        combine_sparse_matrices_columnwise((m1, m2), (cols1, ))

    # a column label list with the wrong number of labels
    with pytest.raises(ValueError):
        combine_sparse_matrices_columnwise((m1, m2), (cols1, list('X')))

    # more column label lists than matrices
    with pytest.raises(ValueError):
        combine_sparse_matrices_columnwise((m2, ), (cols1, cols2))

    # row labels given but empty
    with pytest.raises(ValueError):
        combine_sparse_matrices_columnwise((m1, m2), (cols1, cols2), [])

    # a row label list with the wrong number of labels for its matrix
    with pytest.raises(ValueError):
        combine_sparse_matrices_columnwise((m1, m2), (cols1, cols2), (rows1, rows1))

    with pytest.raises(ValueError):
        combine_sparse_matrices_columnwise((m1, m2), (cols1, cols2), (rows1, [0, 0, 0, 0]))

    # matrices 1 and 2, no row re-ordering
    res, res_cols = combine_sparse_matrices_columnwise((m1, m2), (cols1, cols2))

    assert isspmatrix_csr(res)
    assert res.shape == (5, 4)
    assert np.all(res.A == expected_1_2)
    assert np.array_equal(res_cols, np.array(list('ABCD')))

    # matrices 1 and 2, re-order rows
    res, res_cols, res_rows = combine_sparse_matrices_columnwise((m1, m2), (cols1, cols2), (rows1, rows2))
    assert isspmatrix_csr(res)
    assert res.shape == (5, 4)
    assert np.all(res.A == expected_1_2_rows_sorted)
    assert np.array_equal(res_cols, np.array(list('ABCD')))
    assert np.array_equal(res_rows, np.arange(5))

    # matrices 1 to 5, no row re-ordering
    res, res_cols = combine_sparse_matrices_columnwise((m1, m2, m3, m4, m5), (cols1, cols2, cols3, cols4, cols5))

    assert isspmatrix_csr(res)
    assert np.all(res.A == expected_1_5)
    assert np.array_equal(res_cols, np.array(list('ABCD')))
set default level 18 | 19 | 20 | from . import bow, topicmod, tokenseq, types, utils 21 | 22 | if not any(find_spec(pkg) is None for pkg in ('spacy', 'bidict', 'loky')): 23 | from . import corpus 24 | -------------------------------------------------------------------------------- /tmtoolkit/__main__.py: -------------------------------------------------------------------------------- 1 | """ 2 | tmtoolkit – Text Mining and Topic Modeling Toolkit for Python 3 | 4 | CLI module 5 | 6 | Markus Konrad 7 | """ 8 | 9 | HELP_TEXT = """tmtoolkit installation setup 10 | 11 | Run 12 | 13 | python -m tmtoolkit setup 14 | 15 | to install all necessary language models for languages listed as 16 | comma-separated language ISO codes in . The list must 17 | be specified without spaces. Example: 18 | 19 | python -m tmtoolkit setup en,de,ru 20 | 21 | This will install language models for English, German and Russian. 22 | To install all available language models, run: 23 | 24 | python -m tmtoolkit setup all 25 | 26 | You can pass two additional arguments: 27 | 28 | --variants=... 
    def _setup(args):
        """
        Handle the `setup` CLI command: install spaCy language models for the language codes given in `args`.

        Recognized extra arguments: ``--variants=...`` (comma-separated model size variants, default ``sm,md``)
        and ``--no-update`` (skip models whose pip package is already installed). Exits the process with a
        non-zero code on invalid input or when calling pip fails.
        """
        from spacy.cli.download import download

        # scan for a "--variants=..." switch among the positional arguments
        variants_switch = '--variants='
        i_variants_arg = None
        for i, arg in enumerate(args):
            if arg.startswith(variants_switch):
                i_variants_arg = i
                break

        if i_variants_arg is not None:
            # remove the switch from the argument list and parse its comma-separated values
            vararg = args.pop(i_variants_arg)
            variants = vararg[len(variants_switch):].split(',')
        else:
            variants = ['sm', 'md']   # default model size variants

        # "--no-update" flag: only install models that are not installed yet
        try:
            args.remove('--no-update')
            no_update = True
        except ValueError:
            no_update = False

        if not args:
            print('error: you must pass a list of two-letter ISO 639-1 language codes to install the respective '
                  'language models or the string "all" to install all available language models', file=sys.stderr)
            exit(3)

        if args == ['all']:
            install_languages = list(DEFAULT_LANGUAGE_MODELS.keys())
        else:
            # language codes may be given as several arguments and/or comma-separated within one argument
            install_languages = []
            for arg in args:
                install_languages.extend([l for l in map(str.strip, arg.split(',')) if l])

        print('checking if required spaCy data packages are installed...')

        # query pip for the set of already installed packages (JSON output)
        try:
            piplist_str = subprocess.check_output([sys.executable, '-m', 'pip', 'list',
                                                   '--disable-pip-version-check',
                                                   '--format', 'json'])
        except subprocess.CalledProcessError as exc:
            print('error: calling pip failed with the following error message\n' + str(exc), file=sys.stderr)
            exit(4)

        piplist = json.loads(piplist_str)
        installed_pkgs = set(item['name'] for item in piplist)

        for modelvar in variants:
            # map language code -> pip package name of the model in this size variant
            model_pkgs = dict(zip(DEFAULT_LANGUAGE_MODELS.keys(),
                                  map(lambda x: x.replace('_', '-') + '-' + modelvar,
                                      DEFAULT_LANGUAGE_MODELS.values())))

            for lang in install_languages:
                if lang not in DEFAULT_LANGUAGE_MODELS.keys():
                    print(f'error: no language model for language code "{lang}"', file=sys.stderr)
                    exit(5)

                lang_model_pkg = model_pkgs[lang]

                if no_update and lang_model_pkg in installed_pkgs:
                    print(f'language model package "{lang_model_pkg}" for language code "{lang}" is already installed '
                          f'-- skipping')
                    continue

                # delegate the actual download/installation to spaCy
                lang_model = DEFAULT_LANGUAGE_MODELS[lang] + '_' + modelvar
                print(f'installing language model "{lang_model}" for language code "{lang}"...')
                download(lang_model)

        print('done.')
def create_sparse_dtm(vocab, docs, n_unique_tokens, vocab_is_sorted=False, dtype=None):
    """
    Create a sparse document-term matrix (DTM) in
    `COO sparse format <https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.coo_matrix.html>`_
    from a vocabulary array `vocab`, a list of tokenized documents `docs` and the number of unique tokens
    across all documents, `n_unique_tokens`.

    Rows of the DTM correspond to documents, columns to indices into `vocab`; hence ``DTM[j, k]`` is the
    term frequency of term ``vocab[k]`` in document ``j``.

    A note on performance: filling the three component arrays of a COO matrix directly is a fast way to build
    a DTM; `n_unique_tokens` lets us pre-allocate them exactly.

    .. seealso:: This is the "low level" function. For the straight-forward to use function see
                 :func:`tmtoolkit.corpus.dtm`, which also calculates `n_unique_tokens`.

    :param vocab: list or array of vocabulary used as column names; its length defines the number of columns
    :param docs: a list of tokenized documents
    :param n_unique_tokens: number of unique tokens across all documents, i.e. number of non-zero entries
    :param vocab_is_sorted: if True, assume that `vocab` is sorted when creating the token IDs
    :param dtype: data type of the resulting matrix; defaults to 32-bit integer
    :return: a sparse document-term matrix in COO sparse format
    """
    # permutation that sorts `vocab`; not needed when the caller guarantees sorted input
    sorter = None if vocab_is_sorted else np.argsort(vocab)

    dtype = dtype or 'int32'
    # pre-allocate the three COO component arrays; each non-zero matrix entry occupies exactly one slot
    values = np.empty(n_unique_tokens, dtype=dtype)     # term frequencies
    col_ind = np.empty(n_unique_tokens, dtype=dtype)    # column index per frequency
    row_ind = np.empty(n_unique_tokens, dtype=dtype)    # row (document) index per frequency

    offset = 0   # next free slot in the component arrays
    for row, terms in enumerate(docs):
        if len(terms) == 0:
            continue   # empty documents contribute no entries

        # map each term of this document to its index in `vocab`
        if sorter is None:
            vocab_indices = np.searchsorted(vocab, terms)
        else:
            vocab_indices = sorter[np.searchsorted(vocab, terms, sorter=sorter)]

        # per-document term frequencies for the unique vocabulary indices
        uniq, freqs = np.unique(vocab_indices, return_counts=True)
        next_offset = offset + len(uniq)

        values[offset:next_offset] = freqs
        col_ind[offset:next_offset] = uniq
        row_ind[offset:next_offset] = np.repeat(row, len(uniq))

        offset = next_offset

    assert offset == len(values)   # `n_unique_tokens` must have been exact

    return coo_matrix((values, (row_ind, col_ind)), shape=(len(docs), len(vocab)), dtype=dtype)
def dtm_to_dataframe(dtm, doc_labels, vocab):
    """
    Convert a (sparse) DTM to a pandas DataFrame using document labels `doc_labels` as row index and `vocab` as column
    names.

    :param dtm: (sparse) document-term-matrix of size NxM (N docs, M is vocab size) with raw terms counts
    :param doc_labels: document labels used as row index (row names); size must equal number of rows in `dtm`
    :param vocab: list or array of vocabulary used as column names; size must equal number of columns in `dtm`
    :return: pandas DataFrame
    :raises ValueError: if `dtm` is not 2D or its shape does not match `doc_labels` / `vocab`
    """
    if dtm.ndim != 2:
        raise ValueError('`dtm` must be a 2D array/matrix')

    if dtm.shape[0] != len(doc_labels):
        # fixed message: closing backtick was missing
        raise ValueError('number of rows must be equal to `len(doc_labels)`')

    if dtm.shape[1] != len(vocab):
        # fixed message: this condition checks the *columns* against the vocabulary size,
        # but the original message wrongly said "number of rows" and lacked the closing backtick
        raise ValueError('number of columns must be equal to `len(vocab)`')

    # densify sparse input; pd.DataFrame cannot consume a scipy sparse matrix directly here
    if not isinstance(dtm, np.ndarray):
        dtm = dtm.toarray()

    return pd.DataFrame(dtm, index=doc_labels, columns=vocab)
def gensim_corpus_to_dtm(corpus):
    """
    Convert a Gensim corpus object to a sparse DTM in COO format.

    .. seealso:: :func:`~tmtoolkit.bow.dtm.dtm_to_gensim_corpus` for the inverse function.

    :param corpus: Gensim corpus object
    :return: sparse DTM in COO format
    """
    import gensim
    from scipy.sparse import coo_matrix

    # Gensim yields a terms-by-documents CSC matrix; transpose it back to documents-by-terms
    terms_by_docs = gensim.matutils.corpus2csc(corpus)
    return coo_matrix(terms_by_docs.transpose())
161 | 162 | :param dtm: (sparse) document-term-matrix of size NxM (N docs, M is vocab size) with raw terms counts 163 | :param vocab: list or array of vocabulary 164 | :param as_gensim_dictionary: if True create Gensim :class:`~gensim.corpora.dictionary.Dictionary` from `vocab`, 165 | else create Python :func:`dict` 166 | :return: a 2-tuple with (Corpus object, Gensim :class:`~gensim.corpora.dictionary.Dictionary` or 167 | Python :func:`dict`) 168 | """ 169 | corpus = dtm_to_gensim_corpus(dtm) 170 | 171 | # vocabulary array has to be converted to dict with index -> word mapping 172 | id2word = dict(zip(range(len(vocab)), vocab)) 173 | 174 | if as_gensim_dictionary: 175 | import gensim 176 | return corpus, gensim.corpora.dictionary.Dictionary().from_corpus(corpus, id2word) 177 | else: 178 | return corpus, id2word 179 | -------------------------------------------------------------------------------- /tmtoolkit/corpus/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for processing text as token sequences in labelled documents. A set of documents is represented as *corpus* 3 | using the :class:`Corpus` class. This sub-package also provides functions that work with a :class:`Corpus` object. 4 | 5 | Text parsing and processing relies on the `SpaCy library `_ which must be installed when using this 6 | sub-package. 7 | 8 | .. codeauthor:: Markus Konrad 9 | """ 10 | 11 | from importlib.util import find_spec 12 | 13 | for pkg in ('spacy', 'bidict', 'loky'): 14 | if find_spec(pkg) is None: 15 | raise RuntimeError(f'the required package "{pkg}" for text processing is not installed; did you install ' 16 | f'tmtoolkit with "recommended" or "textproc" option? 
see ' 17 | f'https://tmtoolkit.readthedocs.io/en/latest/install.html for further information') 18 | 19 | from ..tokenseq import strip_tags, numbertoken_to_magnitude, simplify_unicode_chars 20 | 21 | from ._common import DEFAULT_LANGUAGE_MODELS, LANGUAGE_LABELS, simplified_pos 22 | from ._document import Document, document_token_attr, document_from_attrs 23 | from ._corpus import Corpus 24 | 25 | from ._corpusfuncs import ( 26 | doc_tokens, set_token_attr, set_document_attr, vocabulary, dtm, doc_texts, doc_labels, doc_lengths, 27 | corpus_num_tokens, vocabulary_size, tokens_table, print_summary, vocabulary_counts, 28 | doc_frequencies, doc_vectors, token_vectors, ngrams, to_lowercase, to_uppercase, remove_chars, 29 | serialize_corpus, deserialize_corpus, save_corpus_to_picklefile, load_corpus_from_picklefile, 30 | load_corpus_from_tokens, load_corpus_from_tokens_table, spacydocs, 31 | lemmatize, remove_punctuation, normalize_unicode, simplify_unicode, doc_token_lengths, filter_clean_tokens, 32 | corpus_ngramify, filter_tokens_by_mask, remove_tokens_by_mask, filter_tokens, remove_tokens, 33 | filter_documents, remove_documents, filter_documents_by_mask, remove_documents_by_mask, 34 | filter_documents_by_docattr, remove_documents_by_docattr, kwic, kwic_table, transform_tokens, 35 | corpus_summary, corpus_num_chars, filter_tokens_with_kwic, filter_documents_by_label, 36 | remove_documents_by_label, filter_for_pos, filter_tokens_by_doc_frequency, remove_common_tokens, 37 | remove_uncommon_tokens, filter_documents_by_length, remove_documents_by_length, 38 | join_collocations_by_patterns, join_collocations_by_statistic, corpus_tokens_flattened, corpus_collocations, 39 | remove_token_attr, remove_document_attr, builtin_corpora_info, corpus_add_files, corpus_add_folder, 40 | corpus_add_tabular, corpus_add_zip, corpus_sample, corpus_split_by_token, doc_num_sents, doc_sent_lengths, 41 | numbers_to_magnitudes, corpus_split_by_paragraph, doc_labels_sample, corpus_retokenize, 
corpus_unique_chars, 42 | corpus_join_documents, find_documents 43 | ) 44 | 45 | if find_spec('nltk') is not None: # when NLTK is installed 46 | from ._nltk_extras import stem 47 | 48 | from . import visualize 49 | -------------------------------------------------------------------------------- /tmtoolkit/corpus/_common.py: -------------------------------------------------------------------------------- 1 | """ 2 | Internal module with common functions and constants for text processing in the :mod:`tmtoolkit.corpus` module. 3 | 4 | .. codeauthor:: Markus Konrad 5 | """ 6 | 7 | import os 8 | from typing import Tuple, Dict 9 | 10 | MODULE_PATH = os.path.dirname(os.path.abspath(__file__)) 11 | DATAPATH = os.path.normpath(os.path.join(MODULE_PATH, '..', 'data')) 12 | 13 | #: Default SpaCy language models used for a given two-letter ISO 639-1 language code. 14 | #: These model names will be appended with model size suffix like "_sm", "_md" or "_lg". 15 | DEFAULT_LANGUAGE_MODELS = { 16 | 'en': 'en_core_web', 17 | 'de': 'de_core_news', 18 | 'fr': 'fr_core_news', 19 | 'es': 'es_core_news', 20 | 'pt': 'pt_core_news', 21 | 'it': 'it_core_news', 22 | 'nl': 'nl_core_news', 23 | 'el': 'el_core_news', 24 | 'nb': 'nb_core_news', 25 | 'lt': 'lt_core_news', 26 | 'zh': 'zh_core_web', 27 | 'ja': 'ja_core_news', 28 | 'ca': 'ca_core_news', 29 | 'da': 'da_core_news', 30 | 'mk': 'mk_core_news', 31 | 'pl': 'pl_core_news', 32 | 'ro': 'ro_core_news', 33 | 'ru': 'ru_core_news', 34 | } 35 | 36 | #: Map two-letter ISO 639-1 language code to language name. 
37 | LANGUAGE_LABELS = { 38 | 'en': 'english', 39 | 'de': 'german', 40 | 'fr': 'french', 41 | 'es': 'spanish', 42 | 'pt': 'portuguese', 43 | 'it': 'italian', 44 | 'nl': 'dutch', 45 | 'el': 'greek', 46 | 'nb': 'norwegian-bokmal', 47 | 'lt': 'lithuanian', 48 | 'zh': 'chinese', 49 | 'ja': 'japanese', 50 | 'ca': 'catalan', 51 | 'da': 'danish', 52 | 'mk': 'macedonian', 53 | 'pl': 'polish', 54 | 'ro': 'romanian', 55 | 'ru': 'russian', 56 | } 57 | 58 | BOOLEAN_SPACY_TOKEN_ATTRS = ( 59 | 'is_alpha', 'is_ascii', 'is_digit', 'is_lower', 'is_upper', 'is_title', 60 | 'is_punct', 'is_left_punct', 'is_right_punct', 'is_space', 'is_bracket', 61 | 'is_quote', 'is_currency', 'is_stop', 'like_url', 'like_num', 'like_email', 62 | ) 63 | 64 | # SpaCy token attributes per pipeline component 65 | SPACY_TOKEN_ATTRS = { # type: Dict[str, Tuple[str]] 66 | '_default': BOOLEAN_SPACY_TOKEN_ATTRS + ('shape', 'sentiment', 'rank', 'cluster'), # always enabled 67 | 'tagger': ('tag', 'pos'), 68 | 'morphologizer': ('pos', ), 69 | 'parser': ('dep', ), 70 | 'lemmatizer': ('lemma', ), 71 | 'ner': ('ent_type', 'ent_iob'), 72 | } 73 | 74 | STD_TOKEN_ATTRS = {'is_punct', 'is_stop', 'like_num', 'tag', 'pos', 'lemma', 'ent_type'} 75 | 76 | # all token attributes that can be encoded in a uint64 matrix 77 | TOKENMAT_ATTRS = set([a for attrs in SPACY_TOKEN_ATTRS.values() for a in attrs]) \ 78 | | {'whitespace', 'token', 'sent_start'} 79 | 80 | 81 | def simplified_pos(pos: str, tagset: str = 'ud', default: str = '') -> str: 82 | """ 83 | Return a simplified POS tag for a full POS tag `pos` belonging to a tagset `tagset`. 84 | 85 | Does the following conversion by default: 86 | 87 | - all N... (noun) tags to 'N' 88 | - all V... (verb) tags to 'V' 89 | - all ADJ... (adjective) tags to 'ADJ' 90 | - all ADV... (adverb) tags to 'ADV' 91 | - all other to `default` 92 | 93 | Does the following conversion by with ``tagset=='penn'``: 94 | 95 | - all N... (noun) tags to 'N' 96 | - all V... 
(verb) tags to 'V' 97 | - all JJ... (adjective) tags to 'ADJ' 98 | - all RB... (adverb) tags to 'ADV' 99 | - all other to `default` 100 | 101 | Does the following conversion with ``tagset=='ud'``: 102 | 103 | - all NOUN and PROPN tags to 'N' 104 | - all VERB tags to 'V' 105 | - all ADJ tags to 'ADJ' 106 | - all ADV tags to 'ADV' 107 | - all other to `default` 108 | 109 | :param pos: a POS tag as string 110 | :param tagset: tagset used for `pos`; can be ``'wn'`` (WordNet), ``'penn'`` (Penn tagset) 111 | or ``'ud'`` (universal dependencies – default) 112 | :param default: default return value when tag could not be simplified 113 | :return: simplified tag string 114 | """ 115 | 116 | if pos and not isinstance(pos, str): 117 | raise ValueError('`pos` must be a string or None') 118 | 119 | if tagset == 'ud': 120 | if pos in ('NOUN', 'PROPN'): 121 | return 'N' 122 | elif pos == 'VERB': 123 | return 'V' 124 | elif pos in ('ADJ', 'ADV'): 125 | return pos 126 | else: 127 | return default 128 | elif tagset == 'penn': 129 | if pos.startswith('N') or pos.startswith('V'): 130 | return pos[0] 131 | elif pos.startswith('JJ'): 132 | return 'ADJ' 133 | elif pos.startswith('RB'): 134 | return 'ADV' 135 | else: 136 | return default 137 | elif tagset == 'wn': 138 | if pos.startswith('N') or pos.startswith('V'): 139 | return pos[0] 140 | elif pos.startswith('ADJ') or pos.startswith('ADV'): 141 | return pos[:3] 142 | else: 143 | return default 144 | else: 145 | raise ValueError('unknown tagset "%s"' % tagset) 146 | -------------------------------------------------------------------------------- /tmtoolkit/corpus/_nltk_extras.py: -------------------------------------------------------------------------------- 1 | """ 2 | Internal module with some additional functions that are only available when the `NLTK `_ package 3 | is installed. 4 | 5 | ..
codeauthor:: Markus Konrad 6 | """ 7 | from typing import Optional 8 | 9 | from ._corpus import Corpus 10 | from ._common import LANGUAGE_LABELS 11 | from ._corpusfuncs import transform_tokens 12 | 13 | 14 | def stem(docs: Corpus, /, language: Optional[str] = None, 15 | stemmer_instance: Optional[object] = None, inplace=True): 16 | """ 17 | Apply stemming to all tokens in `docs` using a stemmer `stemmer_instance`. 18 | 19 | .. note: This requires that the `NLTK `_ package is installed. 20 | 21 | :param docs: a Corpus object 22 | :param language: language in which `docs` is given; if None, will be detected from the ``language`` property of 23 | `docs`; note that this is not an ISO language code but a language 24 | label like "english" or "german" that NLTK accepts 25 | :param stemmer_instance: a stemmer instance; it must implement a method `stem` that accepts a single string; 26 | default is :class:`nltk.stem.SnowballStemmer` 27 | :param inplace: if True, modify Corpus object in place, otherwise return a modified copy 28 | :return: either original Corpus object `docs` or a modified copy of it 29 | """ 30 | 31 | from nltk.stem import SnowballStemmer 32 | 33 | if stemmer_instance is None: 34 | if language is None: 35 | language = LANGUAGE_LABELS[docs.language] 36 | stemmer_instance = SnowballStemmer(language) 37 | 38 | return transform_tokens(docs, stemmer_instance.stem, inplace=inplace) 39 | -------------------------------------------------------------------------------- /tmtoolkit/data/de/parlspeech-v2-sample-bundestag.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WZBSocialScienceCenter/tmtoolkit/02990865ee896625d5cf540bf2b0dbc159bedf38/tmtoolkit/data/de/parlspeech-v2-sample-bundestag.zip -------------------------------------------------------------------------------- /tmtoolkit/data/en/News100.zip: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/WZBSocialScienceCenter/tmtoolkit/02990865ee896625d5cf540bf2b0dbc159bedf38/tmtoolkit/data/en/News100.zip -------------------------------------------------------------------------------- /tmtoolkit/data/en/NewsArticles.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WZBSocialScienceCenter/tmtoolkit/02990865ee896625d5cf540bf2b0dbc159bedf38/tmtoolkit/data/en/NewsArticles.zip -------------------------------------------------------------------------------- /tmtoolkit/data/en/parlspeech-v2-sample-houseofcommons.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WZBSocialScienceCenter/tmtoolkit/02990865ee896625d5cf540bf2b0dbc159bedf38/tmtoolkit/data/en/parlspeech-v2-sample-houseofcommons.zip -------------------------------------------------------------------------------- /tmtoolkit/data/es/parlspeech-v2-sample-congreso.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WZBSocialScienceCenter/tmtoolkit/02990865ee896625d5cf540bf2b0dbc159bedf38/tmtoolkit/data/es/parlspeech-v2-sample-congreso.zip -------------------------------------------------------------------------------- /tmtoolkit/data/nl/parlspeech-v2-sample-tweedekamer.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WZBSocialScienceCenter/tmtoolkit/02990865ee896625d5cf540bf2b0dbc159bedf38/tmtoolkit/data/nl/parlspeech-v2-sample-tweedekamer.zip -------------------------------------------------------------------------------- /tmtoolkit/topicmod/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Topic modeling sub-package with modules for model evaluation, model I/O, model statistics, parallel computation and 3 | visualization. 
4 | 5 | Functions and classes in :mod:`~tmtoolkit.topicmod.tm_gensim`, :mod:`~tmtoolkit.topicmod.tm_lda` and 6 | :mod:`~tmtoolkit.topicmod.tm_sklearn` implement parallel model computation and evaluation using popular topic modeling 7 | packages. You need to install the respective packages (*lda*, *scikit-learn* or *gensim*) in order to use them. 8 | 9 | .. codeauthor:: Markus Konrad 10 | """ 11 | 12 | 13 | import importlib.util 14 | 15 | from . import evaluate, model_io, model_stats, parallel, visualize 16 | 17 | # conditional imports 18 | 19 | # lda package 20 | if importlib.util.find_spec('lda'): 21 | from . import tm_lda 22 | 23 | # sklearn package 24 | if importlib.util.find_spec('sklearn'): 25 | from . import tm_sklearn 26 | 27 | # gensim package 28 | if importlib.util.find_spec('gensim'): 29 | from . import tm_gensim 30 | -------------------------------------------------------------------------------- /tmtoolkit/topicmod/_common.py: -------------------------------------------------------------------------------- 1 | """ 2 | Common constants and functions for topic modeling sub-package. 3 | 4 | .. codeauthor:: Markus Konrad 5 | """ 6 | 7 | 8 | DEFAULT_TOPIC_NAME_FMT = 'topic_{i1}' 9 | DEFAULT_RANK_NAME_FMT = 'rank_{i1}' 10 | DEFAULT_VALUE_FORMAT = '{lbl} ({val:.4})' 11 | -------------------------------------------------------------------------------- /tmtoolkit/topicmod/_eval_tools.py: -------------------------------------------------------------------------------- 1 | """ 2 | Common utility functions for LDA model evaluation. 3 | 4 | .. codeauthor:: Markus Konrad 5 | """ 6 | 7 | import numpy as np 8 | from scipy.sparse import issparse 9 | 10 | 11 | def split_dtm_for_cross_validation(dtm, n_folds, shuffle_docs=True): 12 | """ 13 | Split a (sparse) document-term matrix `dtm` for n-fold cross validation with `n_folds` folds. 
14 | 15 | :param dtm: (sparse) document-term matrix 16 | :param n_folds: number of folds during cross validation 17 | :param shuffle_docs: shuffle documents (matrix rows) before splitting 18 | :return: a generator for `n_folds` folds, each yielding a 3-tuple with (fold index starting at zero, training DTM, 19 | test DTM) 20 | """ 21 | if issparse(dtm) and dtm.format != 'csr': 22 | dtm = dtm.tocsr() 23 | 24 | n_docs = dtm.shape[0] 25 | 26 | if n_folds < 2: 27 | raise ValueError('`n_folds` must be at least 2') 28 | 29 | if n_docs < n_folds: 30 | raise ValueError('not enough documents in `dtm` (must be >= `n_folds`)') 31 | 32 | rand_doc_ind = np.arange(n_docs) 33 | 34 | if shuffle_docs: 35 | np.random.shuffle(rand_doc_ind) 36 | 37 | n_per_fold = n_docs // n_folds 38 | assert n_per_fold > 0 39 | start_idx = 0 40 | for fold in range(n_folds): 41 | end_idx = start_idx + n_per_fold 42 | fold_doc_ind = rand_doc_ind[slice(start_idx, end_idx)] 43 | test_dtm = dtm[fold_doc_ind, :] 44 | 45 | if issparse(dtm): 46 | inv_fold_doc_ind = np.ones(n_docs, bool) 47 | inv_fold_doc_ind[fold_doc_ind] = 0 48 | train_dtm = dtm[inv_fold_doc_ind, :] 49 | else: 50 | train_dtm = np.delete(dtm, fold_doc_ind, axis=0) # can't be used with sparse matrices 51 | 52 | assert test_dtm.shape[0] + train_dtm.shape[0] == dtm.shape[0] 53 | 54 | yield fold, train_dtm, test_dtm 55 | 56 | start_idx = end_idx 57 | 58 | 59 | class FakedGensimDict: 60 | """ 61 | A class that resembles a Gensim :class:`~gensim.corpora.dictionary.Dictionary`. 
62 | """ 63 | def __init__(self, data): 64 | if not isinstance(data, dict): 65 | raise ValueError('`data` must be an instance of `dict`') 66 | 67 | self.id2token = data 68 | self.token2id = {v: k for k, v in data.items()} 69 | 70 | @staticmethod 71 | def from_vocab(vocab): 72 | return FakedGensimDict(dict(zip(range(len(vocab)), vocab))) 73 | 74 | def __iter__(self): 75 | """Iterate over all ids.""" 76 | return iter(self.keys()) 77 | 78 | def keys(self): 79 | """Get all stored ids.""" 80 | return self.id2token.keys() 81 | -------------------------------------------------------------------------------- /tmtoolkit/topicmod/tm_gensim.py: -------------------------------------------------------------------------------- 1 | """ 2 | Parallel model computation and evaluation using the `Gensim package `_. 3 | 4 | Available evaluation metrics for this module are listed in :data:`~tmtoolkit.topicmod.tm_gensim.AVAILABLE_METRICS`. 5 | See :mod:`tmtoolkit.topicmod.evaluate` for references and implementations of those evaluation metrics. 6 | """ 7 | 8 | import logging 9 | 10 | import numpy as np 11 | 12 | from tmtoolkit.topicmod.parallel import MultiprocModelsRunner, MultiprocModelsWorkerABC, MultiprocEvaluationRunner, \ 13 | MultiprocEvaluationWorkerABC 14 | from tmtoolkit.bow.dtm import dtm_to_gensim_corpus, gensim_corpus_to_dtm 15 | from .evaluate import metric_cao_juan_2009, metric_arun_2010, metric_coherence_mimno_2011, metric_coherence_gensim 16 | 17 | #: Available metrics for Gensim. 18 | AVAILABLE_METRICS = ( 19 | 'perplexity', 20 | 'cao_juan_2009', 21 | 'arun_2010', 22 | 'coherence_mimno_2011', 23 | 'coherence_gensim_u_mass', # same as coherence_mimno_2011 24 | 'coherence_gensim_c_v', 25 | 'coherence_gensim_c_uci', 26 | 'coherence_gensim_c_npmi', 27 | ) 28 | 29 | #: Metrics used by default. 
30 | DEFAULT_METRICS = ( 31 | 'perplexity', 32 | 'cao_juan_2009', 33 | 'coherence_mimno_2011', 34 | 'coherence_gensim_c_v' 35 | ) 36 | 37 | 38 | logger = logging.getLogger('tmtoolkit') 39 | 40 | 41 | #%% Specialized classes for parallel processing 42 | 43 | 44 | class MultiprocModelsWorkerGensim(MultiprocModelsWorkerABC): 45 | """ 46 | Specialized parallel model computations worker for Gensim. 47 | """ 48 | 49 | package_name = 'gensim' 50 | 51 | def fit_model(self, data, params, return_data=False): 52 | """ 53 | Fit model to `data` using gensim with parameter set `params`. 54 | """ 55 | from gensim.models.ldamodel import LdaModel 56 | 57 | dictionary = params.pop('dictionary', None) 58 | 59 | if hasattr(data, 'dtype') and hasattr(data, 'shape') and hasattr(data, 'transpose'): 60 | corpus = dtm_to_gensim_corpus(data) 61 | dtm = data 62 | else: 63 | if isinstance(data, tuple) and len(data) == 2: 64 | dictionary, corpus = data 65 | else: 66 | corpus = data 67 | dtm = gensim_corpus_to_dtm(corpus) 68 | 69 | model = LdaModel(corpus, id2word=dictionary, **params) 70 | 71 | if return_data: 72 | return model, (corpus, dtm) 73 | else: 74 | return model 75 | 76 | 77 | class MultiprocEvaluationWorkerGensim(MultiprocEvaluationWorkerABC, MultiprocModelsWorkerGensim): 78 | """ 79 | Specialized parallel model evaluations worker for Gensim. 80 | """ 81 | 82 | def fit_model(self, data, params, return_data=False): 83 | model, (corpus, dtm) = super(MultiprocEvaluationWorkerGensim, self).fit_model(data, params, return_data=True) 84 | 85 | results = {} 86 | if self.return_models: 87 | results['model'] = model 88 | 89 | for metric in self.eval_metric: 90 | if metric == 'cao_juan_2009': 91 | res = metric_cao_juan_2009(model.state.get_lambda()) 92 | elif metric == 'arun_2010': 93 | doc_topic_list = [] 94 | for doc_topic in model.get_document_topics(corpus): 95 | d = dict(doc_topic) 96 | # Gensim will not output near-zero prob. topics, hence the "d.get()": 97 | t = tuple(d.get(ind, 0.) 
for ind in range(model.num_topics)) 98 | doc_topic_list.append(t) 99 | 100 | doc_topic_distrib = np.array(doc_topic_list) 101 | assert doc_topic_distrib.shape == (dtm.shape[0], params['num_topics']) 102 | 103 | res = metric_arun_2010(model.state.get_lambda(), doc_topic_distrib, dtm.sum(axis=1)) 104 | elif metric == 'coherence_mimno_2011': 105 | topic_word = model.state.get_lambda() 106 | default_top_n = min(20, topic_word.shape[1]) 107 | res = metric_coherence_mimno_2011(topic_word, dtm, 108 | top_n=self.eval_metric_options.get( 109 | 'coherence_mimno_2011_top_n', default_top_n), 110 | eps=self.eval_metric_options.get('coherence_mimno_2011_eps', 1), 111 | include_prob=self.eval_metric_options.get( 112 | 'coherence_mimno_2011_include_prob', False), 113 | normalize=self.eval_metric_options.get( 114 | 'coherence_mimno_2011_normalize', False), 115 | return_mean=True) 116 | elif metric.startswith('coherence_gensim_'): 117 | coh_measure = metric[len('coherence_gensim_'):] 118 | topic_word = model.state.get_lambda() 119 | default_top_n = min(20, topic_word.shape[1]) 120 | metric_kwargs = { 121 | 'measure': coh_measure, 122 | 'gensim_model': model, 123 | 'gensim_corpus': corpus, 124 | 'return_mean': True, 125 | 'processes': 1, 126 | 'top_n': self.eval_metric_options.get('coherence_gensim_top_n', default_top_n), 127 | } 128 | 129 | if coh_measure != 'u_mass': 130 | if 'coherence_gensim_texts' not in self.eval_metric_options: 131 | raise ValueError('tokenized documents must be passed as `coherence_gensim_texts` for any other ' 132 | 'coherence measure than `u_mass`') 133 | metric_kwargs.update({ 134 | 'texts': self.eval_metric_options['coherence_gensim_texts'] 135 | }) 136 | 137 | metric_kwargs.update(self.eval_metric_options.get('coherence_gensim_kwargs', {})) 138 | 139 | res = metric_coherence_gensim(**metric_kwargs) 140 | elif metric == 'perplexity': 141 | res = _get_model_perplexity(model, corpus) 142 | else: 143 | raise ValueError('metric not available: "%s"' % metric) 
144 | 145 | logger.info('> evaluation result with metric "%s": %f' % (metric, res)) 146 | results[metric] = res 147 | 148 | return results 149 | 150 | 151 | #%% main API functions for parallel processing 152 | 153 | 154 | def compute_models_parallel(data, varying_parameters=None, constant_parameters=None, n_max_processes=None): 155 | """ 156 | Compute several topic models in parallel using the "gensim" package. Use a single or multiple document term matrices 157 | `data` and optionally a list of varying parameters `varying_parameters`. Pass parameters in `constant_parameters` 158 | dict to each model calculation. Use at maximum `n_max_processes` processors or use all available processors if None 159 | is passed. 160 | 161 | `data` can be either a Document-Term-Matrix (NumPy array/matrix, SciPy sparse matrix) or a dict with corpus ID -> 162 | Document-Term-Matrix mapping when calculating models for multiple corpora. 163 | 164 | If `data` is a dict of named matrices, this function will return a dict with document ID -> result list. Otherwise 165 | it will only return a result list. A result list always is a list containing tuples `(parameter_set, model)` where 166 | `parameter_set` is a dict of the used parameters. 
167 | 168 | :param data: either a (sparse) 2D array/matrix or a dict mapping dataset labels to such matrices 169 | :param varying_parameters: list of dicts with parameters; each parameter set will be used in a separate 170 | computation 171 | :param constant_parameters: dict with parameters that are the same for all parallel computations 172 | :param n_max_processes: maximum number of worker processes to spawn 173 | :return: if passed data is 2D array, returns a list with tuples (parameter set, results); if passed data is 174 | a dict of 2D arrays, returns dict with same keys as data and the respective results for each dataset 175 | """ 176 | mp_models = MultiprocModelsRunner(MultiprocModelsWorkerGensim, data, varying_parameters, constant_parameters, 177 | n_max_processes=n_max_processes) 178 | 179 | return mp_models.run() 180 | 181 | 182 | def evaluate_topic_models(data, varying_parameters, constant_parameters=None, n_max_processes=None, return_models=False, 183 | metric=None, **metric_kwargs): 184 | """ 185 | Compute several Topic Models in parallel using the "gensim" package. Calculate the models using a list of varying 186 | parameters `varying_parameters` on a single Document-Term-Matrix `data`. Pass parameters in `constant_parameters` 187 | dict to each model calculation. Use at maximum `n_max_processes` processors or use all available processors if None 188 | is passed. 189 | 190 | `data` must be a Document-Term-Matrix (NumPy array/matrix, SciPy sparse matrix). 191 | 192 | Will return a list of size `len(varying_parameters)` containing tuples `(parameter_set, eval_results)` where 193 | `parameter_set` is a dict of the used parameters and `eval_results` is a dict of metric names -> metric results: 194 | 195 | .. code-block:: text 196 | 197 | [(parameter_set_1, {'<metric_name>': result_1, ...}), 198 | ..., 199 | (parameter_set_n, {'<metric_name>': result_n, ...})]) 200 | 201 | .. seealso:: Results can be simplified using :func:`tmtoolkit.topicmod.evaluate.results_by_parameter`.
202 | 203 | :param data: a (sparse) 2D array/matrix 204 | :param varying_parameters: list of dicts with parameters; each parameter set will be used in a separate 205 | evaluation 206 | :param constant_parameters: dict with parameters that are the same for all parallel computations 207 | :param n_max_processes: maximum number of worker processes to spawn 208 | :param return_models: if True, also return the computed models in the evaluation results 209 | :param metric: string or list of strings; if given, use only this metric(s) for evaluation; must be subset of 210 | `available_metrics` 211 | :param metric_kwargs: dict of options for metric used metric(s) 212 | :return: list of evaluation results for each varying parameter set as described above 213 | """ 214 | mp_eval = MultiprocEvaluationRunner(MultiprocEvaluationWorkerGensim, AVAILABLE_METRICS, data, 215 | varying_parameters, constant_parameters, 216 | metric=metric or DEFAULT_METRICS, metric_options=metric_kwargs, 217 | n_max_processes=n_max_processes, return_models=return_models) 218 | 219 | return mp_eval.run() 220 | 221 | 222 | #%% Helper functions 223 | 224 | 225 | def _get_model_perplexity(model, eval_corpus): 226 | n_words = sum(cnt for document in eval_corpus for _, cnt in document) 227 | bound = model.bound(eval_corpus) 228 | perwordbound = bound / n_words 229 | 230 | return np.exp2(-perwordbound) 231 | -------------------------------------------------------------------------------- /tmtoolkit/topicmod/tm_lda.py: -------------------------------------------------------------------------------- 1 | """ 2 | Parallel model computation and evaluation using the `lda package `_. 3 | 4 | Available evaluation metrics for this module are listed in :data:`~tmtoolkit.topicmod.tm_lda.AVAILABLE_METRICS`. 5 | See :mod:`tmtoolkit.topicmod.evaluate` for references and implementations of those evaluation metrics. 
6 | """ 7 | 8 | import logging 9 | import importlib.util 10 | 11 | import numpy as np 12 | 13 | from ._eval_tools import split_dtm_for_cross_validation 14 | from tmtoolkit.topicmod.parallel import MultiprocModelsRunner, MultiprocModelsWorkerABC, MultiprocEvaluationRunner, \ 15 | MultiprocEvaluationWorkerABC 16 | from .evaluate import metric_griffiths_2004, metric_cao_juan_2009, metric_arun_2010, metric_coherence_mimno_2011, \ 17 | metric_coherence_gensim, metric_held_out_documents_wallach09 18 | 19 | if importlib.util.find_spec('gmpy2'): 20 | metrics_using_gmpy2 = ('griffiths_2004', 'held_out_documents_wallach09') 21 | else: # if gmpy2 is not available: do not use 'griffiths_2004' 22 | metrics_using_gmpy2 = () 23 | 24 | if importlib.util.find_spec('gensim'): 25 | metrics_using_gensim = ( 26 | 'coherence_gensim_u_mass', # same as coherence_mimno_2011 27 | 'coherence_gensim_c_v', 28 | 'coherence_gensim_c_uci', 29 | 'coherence_gensim_c_npmi' 30 | ) 31 | else: 32 | metrics_using_gensim = () 33 | 34 | 35 | #: Available metrics for lda (``"griffiths_2004"``, ``"held_out_documents_wallach09"`` are added when package gmpy2 36 | #: is installed, several ``"coherence_gensim_"`` metrics are added when package gensim is installed). 37 | AVAILABLE_METRICS = ( 38 | 'loglikelihood', # simply uses the last reported log likelihood as fallback 39 | 'cao_juan_2009', 40 | 'arun_2010', 41 | 'coherence_mimno_2011', 42 | ) + metrics_using_gmpy2 + metrics_using_gensim 43 | 44 | #: Metrics used by default. 45 | DEFAULT_METRICS = ( 46 | 'cao_juan_2009', 47 | 'coherence_mimno_2011' 48 | ) 49 | 50 | 51 | logger = logging.getLogger('tmtoolkit') 52 | 53 | 54 | #%% Specialized classes for parallel processing 55 | 56 | 57 | class MultiprocModelsWorkerLDA(MultiprocModelsWorkerABC): 58 | """ 59 | Specialized parallel model computations worker for lda. 
    """

    package_name = 'lda'

    def fit_model(self, data, params):
        """
        Fit a topic model with the "lda" package.

        :param data: document-term matrix to fit the model on
        :param params: dict of keyword arguments passed to the ``lda.LDA`` constructor
        :return: fitted ``lda.LDA`` instance
        """
        from lda import LDA
        lda_instance = LDA(**params)
        lda_instance.fit(data)

        return lda_instance


class MultiprocEvaluationWorkerLDA(MultiprocEvaluationWorkerABC, MultiprocModelsWorkerLDA):
    """
    Specialized parallel model evaluations worker for lda.
    """

    def fit_model(self, data, params):
        """
        Fit a model on `data` with parameters `params` and calculate every metric in ``self.eval_metric``.

        :param data: document-term matrix
        :param params: dict of model parameters passed to the "lda" package
        :return: dict mapping metric name to metric result; additionally contains key ``'model'`` with the
                 fitted model when ``self.return_models`` is True
        :raises ValueError: if a requested metric is unknown or its options are invalid
        """
        # fitting a model on the full data can be skipped when the *only* requested metric is
        # 'held_out_documents_wallach09' (that branch below fits its own model per CV fold)
        # and no fitted models have to be returned
        if list(self.eval_metric) != ['held_out_documents_wallach09'] or self.return_models:
            lda_instance = super(MultiprocEvaluationWorkerLDA, self).fit_model(data, params)
        else:
            lda_instance = None

        results = {}
        if self.return_models:
            results['model'] = lda_instance

        for metric in self.eval_metric:
            if metric == 'griffiths_2004':
                if 'griffiths_2004_burnin' in self.eval_metric_options:  # discard specific number of burnin iterations
                    burnin_iterations = self.eval_metric_options['griffiths_2004_burnin']
                    # the "lda" package samples the log likelihood only every `refresh` iterations,
                    # so convert burn-in iterations to a number of recorded samples
                    burnin_samples = burnin_iterations // lda_instance.refresh

                    if burnin_samples >= len(lda_instance.loglikelihoods_):
                        raise ValueError('`griffiths_2004_burnin` set too high (%d) – not enough samples to use. should be less than %d.'
                                         % (burnin_iterations, len(lda_instance.loglikelihoods_) * lda_instance.refresh))
                else:  # default: discard first 50% of the likelihood samples
                    burnin_samples = len(lda_instance.loglikelihoods_) // 2

                logliks = lda_instance.loglikelihoods_[burnin_samples:]
                if logliks:
                    res = metric_griffiths_2004(logliks)
                else:
                    raise ValueError('no log likelihood samples for calculation of `metric_griffiths_2004`')
            elif metric == 'cao_juan_2009':
                res = metric_cao_juan_2009(lda_instance.topic_word_)
            elif metric == 'arun_2010':
                res = metric_arun_2010(lda_instance.topic_word_, lda_instance.doc_topic_, data.sum(axis=1))
            elif metric == 'coherence_mimno_2011':
                # never request more top words per topic than the vocabulary holds
                default_top_n = min(20, lda_instance.topic_word_.shape[1])
                res = metric_coherence_mimno_2011(lda_instance.topic_word_, data,
                                                  top_n=self.eval_metric_options.get(
                                                      'coherence_mimno_2011_top_n', default_top_n),
                                                  eps=self.eval_metric_options.get('coherence_mimno_2011_eps', 1),
                                                  include_prob=self.eval_metric_options.get(
                                                      'coherence_mimno_2011_include_prob', False),
                                                  normalize=self.eval_metric_options.get(
                                                      'coherence_mimno_2011_normalize', False),
                                                  return_mean=True)
            elif metric.startswith('coherence_gensim_'):
                if 'coherence_gensim_vocab' not in self.eval_metric_options:
                    raise ValueError('corpus vocabulary must be passed as `coherence_gensim_vocab`')

                # metric name suffix selects the gensim coherence measure, e.g. 'c_v' or 'u_mass'
                coh_measure = metric[len('coherence_gensim_'):]
                default_top_n = min(20, lda_instance.topic_word_.shape[1])
                metric_kwargs = {
                    'measure': coh_measure,
                    'topic_word_distrib': lda_instance.topic_word_,
                    'dtm': data,
                    'vocab': self.eval_metric_options['coherence_gensim_vocab'],
                    'return_mean': True,
                    'processes': 1,
                    'top_n': self.eval_metric_options.get('coherence_gensim_top_n', default_top_n),
                }

                if coh_measure != 'u_mass':
                    # all sliding-window based measures need the raw tokenized texts
                    if 'coherence_gensim_texts' not in self.eval_metric_options:
                        raise ValueError('tokenized documents must be passed as `coherence_gensim_texts` for any other '
                                         'coherence measure than `u_mass`')
                    metric_kwargs.update({
                        'texts': self.eval_metric_options['coherence_gensim_texts']
                    })

                metric_kwargs.update(self.eval_metric_options.get('coherence_gensim_kwargs', {}))

                res = metric_coherence_gensim(**metric_kwargs)
            elif metric == 'held_out_documents_wallach09':
                n_folds = self.eval_metric_options.get('held_out_documents_wallach09_n_folds', 5)
                shuffle_docs = self.eval_metric_options.get('held_out_documents_wallach09_shuffle_docs', True)
                n_samples = self.eval_metric_options.get('held_out_documents_wallach09_n_samples', 10000)

                folds_results = []
                # TODO: parallelize this
                for fold, train, test in split_dtm_for_cross_validation(data, n_folds, shuffle_docs=shuffle_docs):
                    logger.info('> fold %d/%d of cross validation with %d held-out documents and %d training documents'
                                % (fold+1, n_folds, test.shape[0], train.shape[0]))

                    # fit a fresh model on the training fold; evaluate it on the held-out fold
                    model_train = super(MultiprocEvaluationWorkerLDA, self).fit_model(train, params)
                    theta_test = model_train.transform(test)

                    folds_results.append(metric_held_out_documents_wallach09(test, theta_test, model_train.topic_word_,
                                                                             model_train.alpha, n_samples=n_samples))

                logger.debug('> cross validation results with metric "%s": %s' % (metric, str(folds_results)))
                # final result is the mean across the CV folds
                res = np.mean(folds_results)
            elif metric == 'loglikelihood':
                # last recorded log likelihood sample of the fitted model
                res = lda_instance.loglikelihoods_[-1]
            else:
                raise ValueError('metric not available: "%s"' % metric)

            logger.info('> evaluation result with metric "%s": %f' % (metric, res))
            results[metric] = res

        return results


#%% main API functions for parallel processing


def compute_models_parallel(data, varying_parameters=None, constant_parameters=None, n_max_processes=None):
    """
    Compute several topic models in parallel using the "lda" package. Use a single or multiple document term matrices
    `data` and optionally a list of varying parameters `varying_parameters`. Pass parameters in `constant_parameters`
    dict to each model calculation. Use at maximum `n_max_processes` processors or use all available processors if None
    is passed.

    `data` can be either a Document-Term-Matrix (NumPy array/matrix, SciPy sparse matrix) or a dict with corpus ID ->
    Document-Term-Matrix mapping when calculating models for multiple corpora.

    If `data` is a dict of named matrices, this function will return a dict with corpus ID -> result list. Otherwise
    it will only return a result list. A result list always is a list containing tuples `(parameter_set, model)` where
    `parameter_set` is a dict of the used parameters.

    :param data: either a (sparse) 2D array/matrix or a dict mapping dataset labels to such matrices
    :param varying_parameters: list of dicts with parameters; each parameter set will be used in a separate
                               computation
    :param constant_parameters: dict with parameters that are the same for all parallel computations
    :param n_max_processes: maximum number of worker processes to spawn
    :return: if passed data is 2D array, returns a list with tuples (parameter set, results); if passed data is
             a dict of 2D arrays, returns dict with same keys as data and the respective results for each dataset
    """
    mp_models = MultiprocModelsRunner(MultiprocModelsWorkerLDA, data, varying_parameters, constant_parameters,
                                      n_max_processes=n_max_processes)

    return mp_models.run()


def evaluate_topic_models(data, varying_parameters, constant_parameters=None, n_max_processes=None, return_models=False,
                          metric=None, **metric_kwargs):
    """
    Compute several Topic Models in parallel using the "lda" package. Calculate the models using a list of varying
    parameters `varying_parameters` on a single Document-Term-Matrix `data`. Pass parameters in `constant_parameters`
    dict to each model calculation. Use at maximum `n_max_processes` processors or use all available processors if None
    is passed.

    `data` must be a Document-Term-Matrix (NumPy array/matrix, SciPy sparse matrix).

    Will return a list of size `len(varying_parameters)` containing tuples `(parameter_set, eval_results)` where
    `parameter_set` is a dict of the used parameters and `eval_results` is a dict of metric names -> metric results:

    .. code-block:: text

        [(parameter_set_1, {'<metric_name>': result_1, ...}),
         ...,
         (parameter_set_n, {'<metric_name>': result_n, ...})]

    .. seealso:: Results can be simplified using :func:`tmtoolkit.topicmod.evaluate.results_by_parameter`.

    :param data: a (sparse) 2D array/matrix
    :param varying_parameters: list of dicts with parameters; each parameter set will be used in a separate
                               evaluation
    :param constant_parameters: dict with parameters that are the same for all parallel computations
    :param n_max_processes: maximum number of worker processes to spawn
    :param return_models: if True, also return the computed models in the evaluation results
    :param metric: string or list of strings; if given, use only this metric(s) for evaluation; must be subset of
                   `available_metrics`
    :param metric_kwargs: dict of options for the used metric(s)
    :return: list of evaluation results for each varying parameter set as described above
    """
    mp_eval = MultiprocEvaluationRunner(MultiprocEvaluationWorkerLDA, AVAILABLE_METRICS, data,
                                        varying_parameters, constant_parameters,
                                        metric=metric or DEFAULT_METRICS, metric_options=metric_kwargs,
                                        n_max_processes=n_max_processes, return_models=return_models)

    return mp_eval.run()


# ------------------------------------------------------------------------------
# file boundary: /tmtoolkit/topicmod/tm_sklearn.py
# ------------------------------------------------------------------------------

"""
Parallel model computation and evaluation using the `scikit-learn package <https://scikit-learn.org/>`_.

Available evaluation metrics for this module are listed in :data:`~tmtoolkit.topicmod.tm_sklearn.AVAILABLE_METRICS`.
See :mod:`tmtoolkit.topicmod.evaluate` for references and implementations of those evaluation metrics.
"""

import logging
import importlib.util

import numpy as np
from scipy.sparse import issparse, csr_matrix

from ._eval_tools import split_dtm_for_cross_validation
from tmtoolkit.topicmod.parallel import MultiprocModelsRunner, MultiprocModelsWorkerABC, MultiprocEvaluationRunner, \
    MultiprocEvaluationWorkerABC
from .evaluate import metric_cao_juan_2009, metric_arun_2010, metric_coherence_mimno_2011, \
    metric_coherence_gensim, metric_held_out_documents_wallach09


if importlib.util.find_spec('gmpy2'):
    metrics_using_gmpy2 = ('held_out_documents_wallach09', )
else:  # if gmpy2 is not available: do not use 'held_out_documents_wallach09'
    metrics_using_gmpy2 = ()

if importlib.util.find_spec('gensim'):
    metrics_using_gensim = (
        'coherence_gensim_u_mass',  # same as coherence_mimno_2011
        'coherence_gensim_c_v',
        'coherence_gensim_c_uci',
        'coherence_gensim_c_npmi'
    )
else:
    metrics_using_gensim = ()

#: Available metrics for sklearn (``"held_out_documents_wallach09"`` is added when package gmpy2
#: is installed, several ``"coherence_gensim_"`` metrics are added when package gensim is installed).
# NOTE: the "coherence_gensim_*" metrics are appended via `metrics_using_gensim` only when
# gensim is actually installed; listing them unconditionally here as well (as before) duplicated
# them and advertised metrics that would fail without gensim.
AVAILABLE_METRICS = (
    'perplexity',
    'cao_juan_2009',
    'arun_2010',
    'coherence_mimno_2011',
) + metrics_using_gmpy2 + metrics_using_gensim

#: Metrics used by default.
DEFAULT_METRICS = (
    'perplexity',
    'cao_juan_2009',
    'coherence_mimno_2011'
)


#%% Specialized classes for parallel processing


logger = logging.getLogger('tmtoolkit')


class MultiprocModelsWorkerSklearn(MultiprocModelsWorkerABC):
    """
    Specialized parallel model computations worker for sklearn.
    """

    package_name = 'sklearn'

    def fit_model(self, data, params, return_data=False):
        """
        Fit an LDA model with the "sklearn" package.

        :param data: document-term matrix; converted to CSR sparse format if necessary
        :param params: dict of keyword arguments for ``sklearn.decomposition.LatentDirichletAllocation``
        :param return_data: if True, additionally return the (possibly converted) input data
        :return: fitted model instance, or tuple ``(model, data)`` if `return_data` is True
        """
        from sklearn.decomposition import LatentDirichletAllocation

        # sklearn's LDA implementation works with CSR sparse matrices
        if issparse(data):
            if data.format != 'csr':
                data = data.tocsr()
        else:
            data = csr_matrix(data)

        lda_instance = LatentDirichletAllocation(**params)
        lda_instance.fit(data)

        if return_data:
            return lda_instance, data
        else:
            return lda_instance


class MultiprocEvaluationWorkerSklearn(MultiprocEvaluationWorkerABC, MultiprocModelsWorkerSklearn):
    """
    Specialized parallel model evaluations worker for sklearn.
    """

    def fit_model(self, data, params, return_data=False):
        """
        Fit a model on `data` with parameters `params` and calculate every metric in ``self.eval_metric``.

        :param data: document-term matrix
        :param params: dict of model parameters passed to sklearn's LDA implementation
        :param return_data: unused here; kept for signature compatibility with the parent class
        :return: dict mapping metric name to metric result; additionally contains key ``'model'`` with the
                 fitted model when ``self.return_models`` is True
        :raises ValueError: if a requested metric is unknown or its options are invalid
        """
        lda_instance, data = super(MultiprocEvaluationWorkerSklearn, self).fit_model(data, params,
                                                                                     return_data=True)

        # sklearn's `components_` matrix is unnormalized; convert it to a proper
        # topic-word probability distribution for the metrics below
        topic_word_distrib = _get_normalized_topic_word_distrib(lda_instance)

        results = {}
        if self.return_models:
            results['model'] = lda_instance

        for metric in self.eval_metric:
            if metric == 'cao_juan_2009':
                res = metric_cao_juan_2009(topic_word_distrib)
            elif metric == 'arun_2010':
                res = metric_arun_2010(topic_word_distrib, lda_instance.transform(data), data.sum(axis=1))
            elif metric == 'coherence_mimno_2011':
                # never request more top words per topic than the vocabulary holds
                default_top_n = min(20, topic_word_distrib.shape[1])
                res = metric_coherence_mimno_2011(topic_word_distrib, data,
                                                  top_n=self.eval_metric_options.get(
                                                      'coherence_mimno_2011_top_n', default_top_n),
                                                  eps=self.eval_metric_options.get('coherence_mimno_2011_eps', 1),
                                                  include_prob=self.eval_metric_options.get(
                                                      'coherence_mimno_2011_include_prob', False),
                                                  normalize=self.eval_metric_options.get(
                                                      'coherence_mimno_2011_normalize', False),
                                                  return_mean=True)
            elif metric.startswith('coherence_gensim_'):
                if 'coherence_gensim_vocab' not in self.eval_metric_options:
                    raise ValueError('corpus vocabulary must be passed as `coherence_gensim_vocab`')

                # metric name suffix selects the gensim coherence measure, e.g. 'c_v' or 'u_mass'
                coh_measure = metric[len('coherence_gensim_'):]
                default_top_n = min(20, topic_word_distrib.shape[1])
                metric_kwargs = {
                    'measure': coh_measure,
                    'topic_word_distrib': topic_word_distrib,
                    'dtm': data,
                    'vocab': self.eval_metric_options['coherence_gensim_vocab'],
                    'return_mean': True,
                    'processes': 1,
                    'top_n': self.eval_metric_options.get('coherence_gensim_top_n', default_top_n),
                }

                if coh_measure != 'u_mass':
                    # all sliding-window based measures need the raw tokenized texts
                    if 'coherence_gensim_texts' not in self.eval_metric_options:
                        raise ValueError('tokenized documents must be passed as `coherence_gensim_texts` for any other '
                                         'coherence measure than `u_mass`')
                    metric_kwargs.update({
                        'texts': self.eval_metric_options['coherence_gensim_texts']
                    })

                metric_kwargs.update(self.eval_metric_options.get('coherence_gensim_kwargs', {}))

                res = metric_coherence_gensim(**metric_kwargs)
            elif metric == 'held_out_documents_wallach09':
                n_folds = self.eval_metric_options.get('held_out_documents_wallach09_n_folds', 5)
                shuffle_docs = self.eval_metric_options.get('held_out_documents_wallach09_shuffle_docs', True)
                n_samples = self.eval_metric_options.get('held_out_documents_wallach09_n_samples', 10000)

                folds_results = []
                # TODO: parallelize this
                for fold, train, test in split_dtm_for_cross_validation(data, n_folds, shuffle_docs=shuffle_docs):
                    logger.info('> fold %d/%d of cross validation with %d held-out documents and %d training documents'
                                % (fold+1, n_folds, test.shape[0], train.shape[0]))

                    # fit a fresh model on the training fold; evaluate it on the held-out fold
                    model_train = super(MultiprocEvaluationWorkerSklearn, self).fit_model(train, params)
                    theta_test = model_train.transform(test)

                    # FIX: the training topic-word distribution must come from the model fitted
                    # on this fold's training data (`model_train`), not from the full-data model
                    # (`lda_instance`) as before -- cf. the analogous code in tm_lda.py, which
                    # uses `model_train.topic_word_`
                    phi_train = _get_normalized_topic_word_distrib(model_train)

                    folds_results.append(metric_held_out_documents_wallach09(test, theta_test, phi_train,
                                                                             model_train.doc_topic_prior_,
                                                                             n_samples=n_samples))

                logger.debug('> cross validation results with metric "%s": %s' % (metric, str(folds_results)))
                # final result is the mean across the CV folds
                res = np.mean(folds_results)
            elif metric == 'perplexity':
                res = lda_instance.perplexity(data)
            else:
                raise ValueError('metric not available: "%s"' % metric)

            logger.info('> evaluation result with metric "%s": %f' % (metric, res))
            results[metric] = res

        return results


#%% main API functions for parallel processing


def compute_models_parallel(data, varying_parameters=None, constant_parameters=None, n_max_processes=None):
    """
    Compute several topic models in parallel using the "sklearn" package. Use a single or multiple document term matrices
    `data` and optionally a list of varying parameters `varying_parameters`. Pass parameters in `constant_parameters`
    dict to each model calculation. Use at maximum `n_max_processes` processors or use all available processors if None
    is passed.

    `data` can be either a Document-Term-Matrix (NumPy array/matrix, SciPy sparse matrix) or a dict with corpus ID ->
    Document-Term-Matrix mapping when calculating models for multiple corpora.

    If `data` is a dict of named matrices, this function will return a dict with corpus ID -> result list. Otherwise
    it will only return a result list. A result list always is a list containing tuples `(parameter_set, model)` where
    `parameter_set` is a dict of the used parameters.

    :param data: either a (sparse) 2D array/matrix or a dict mapping dataset labels to such matrices
    :param varying_parameters: list of dicts with parameters; each parameter set will be used in a separate
                               computation
    :param constant_parameters: dict with parameters that are the same for all parallel computations
    :param n_max_processes: maximum number of worker processes to spawn
    :return: if passed data is 2D array, returns a list with tuples (parameter set, results); if passed data is
             a dict of 2D arrays, returns dict with same keys as data and the respective results for each dataset
    """

    mp_models = MultiprocModelsRunner(MultiprocModelsWorkerSklearn, data, varying_parameters, constant_parameters,
                                      n_max_processes=n_max_processes)

    return mp_models.run()


def evaluate_topic_models(data, varying_parameters, constant_parameters=None, n_max_processes=None, return_models=False,
                          metric=None, **metric_kwargs):
    """
    Compute several Topic Models in parallel using the "sklearn" package. Calculate the models using a list of varying
    parameters `varying_parameters` on a single Document-Term-Matrix `data`. Pass parameters in `constant_parameters`
    dict to each model calculation. Use at maximum `n_max_processes` processors or use all available processors if None
    is passed.

    `data` must be a Document-Term-Matrix (NumPy array/matrix, SciPy sparse matrix).

    Will return a list of size `len(varying_parameters)` containing tuples `(parameter_set, eval_results)` where
    `parameter_set` is a dict of the used parameters and `eval_results` is a dict of metric names -> metric results:

    .. code-block:: text

        [(parameter_set_1, {'<metric_name>': result_1, ...}),
         ...,
         (parameter_set_n, {'<metric_name>': result_n, ...})]

    .. seealso:: Results can be simplified using :func:`tmtoolkit.topicmod.evaluate.results_by_parameter`.

    :param data: a (sparse) 2D array/matrix
    :param varying_parameters: list of dicts with parameters; each parameter set will be used in a separate
                               evaluation
    :param constant_parameters: dict with parameters that are the same for all parallel computations
    :param n_max_processes: maximum number of worker processes to spawn
    :param return_models: if True, also return the computed models in the evaluation results
    :param metric: string or list of strings; if given, use only this metric(s) for evaluation; must be subset of
                   `available_metrics`
    :param metric_kwargs: dict of options for the used metric(s)
    :return: list of evaluation results for each varying parameter set as described above
    """

    mp_eval = MultiprocEvaluationRunner(MultiprocEvaluationWorkerSklearn, AVAILABLE_METRICS, data,
                                        varying_parameters, constant_parameters,
                                        metric=metric or DEFAULT_METRICS, metric_options=metric_kwargs,
                                        n_max_processes=n_max_processes, return_models=return_models)

    return mp_eval.run()


#%% Helper functions

def _get_normalized_topic_word_distrib(lda_instance):
    """
    Row-normalize ``lda_instance.components_`` so that each topic's word weights sum to 1,
    yielding a proper topic-word probability distribution.
    """
    return lda_instance.components_ / lda_instance.components_.sum(axis=1)[:, np.newaxis]


# ------------------------------------------------------------------------------
# file boundary: /tmtoolkit/types.py
# ------------------------------------------------------------------------------

"""
Module with common types used in type annotations throughout this project.

.. codeauthor:: Markus Konrad
"""

from enum import IntEnum
from typing import Union


Proportion = IntEnum('Proportion', 'NO YES LOG', start=0)

StrOrInt = Union[str, int]


# ------------------------------------------------------------------------------
# file boundary: /tox.ini
# ------------------------------------------------------------------------------

# tox (https://tox.readthedocs.io/) is a tool for running tests
# in multiple virtualenvs. This configuration file will run the
# test suite on all supported python versions. To use it, "pip install tox"
# and then run "tox" from this directory.
5 | # 6 | # The following environments are defined: 7 | # 8 | # - Python 3.8 to 3.10 with dependency sets: 9 | # - minimal 10 | # - recommended 11 | # - recommendedextra 12 | # - full 13 | # 14 | # The following dependency sets are defined, which specify the "extras_require" section to choose when installing 15 | # via setup.py: 16 | # 17 | # - minimal: no additional extras 18 | # - recommended: textproc and wordclouds 19 | # - recommendedextra: recommended and all topic modeling packages (lda, scikit-learn, gensim) 20 | # - full: recommendedextra and textproc_extra, topic_modeling_eval_extra 21 | # 22 | 23 | 24 | [tox] 25 | envlist = 26 | py{38,39,310}-{minimal,recommended,recommendedextra,full} 27 | 28 | [testenv] 29 | deps = .[test] 30 | extras = 31 | recommended: recommended 32 | recommendedextra: recommended, lda, sklearn, gensim 33 | full: recommended, lda, sklearn, gensim, textproc_extra, topic_modeling_eval_extra 34 | commands_pre = 35 | - python -m tmtoolkit setup all --no-update 36 | commands = 37 | pytest -v {posargs} 38 | --------------------------------------------------------------------------------